Update scraper: fix user-agent masking
This commit is contained in:
parent
48ca276241
commit
c30561bc89
1 changed file with 52 additions and 4 deletions
|
|
@@ -26,10 +26,58 @@ async def generate_pdf(link: str, name: str):
|
|||
|
||||
try:
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch()
|
||||
page = await browser.new_page()
|
||||
await page.goto(link, wait_until='networkidle')
|
||||
await page.pdf(path=pdf_path)
|
||||
# Launch browser with stealth options to bypass WAF
|
||||
browser = await p.chromium.launch(
|
||||
headless=True,
|
||||
args=[
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--disable-dev-shm-usage',
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
]
|
||||
)
|
||||
|
||||
# Create context with realistic browser fingerprint
|
||||
context = await browser.new_context(
|
||||
viewport={'width': 1920, 'height': 1080},
|
||||
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
||||
locale='en-US',
|
||||
timezone_id='Europe/Rome',
|
||||
extra_http_headers={
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Accept-Language': 'en-US,en;q=0.9,it;q=0.8',
|
||||
'Cache-Control': 'max-age=0',
|
||||
'Sec-Ch-Ua': '"Chromium";v="131", "Not_A Brand";v="24"',
|
||||
'Sec-Ch-Ua-Mobile': '?0',
|
||||
'Sec-Ch-Ua-Platform': '"Windows"',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
}
|
||||
)
|
||||
|
||||
page = await context.new_page()
|
||||
|
||||
# Remove webdriver property to avoid detection
|
||||
await page.add_init_script("""
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
});
|
||||
""")
|
||||
|
||||
# Navigate to the page
|
||||
await page.goto(link, wait_until='networkidle', timeout=60000)
|
||||
|
||||
# Wait a bit to ensure everything is loaded
|
||||
await page.wait_for_timeout(2000)
|
||||
|
||||
# Generate PDF
|
||||
await page.pdf(path=pdf_path, format='A4', print_background=True)
|
||||
|
||||
await context.close()
|
||||
await browser.close()
|
||||
|
||||
if os.path.exists(pdf_path):
|
||||
|
|
|
|||
Loading…
Reference in a new issue