Update/fix: mask the scraper's user agent

This commit is contained in:
adish-rmr 2025-12-08 12:23:27 +01:00
parent 48ca276241
commit c30561bc89

View file

@@ -26,10 +26,58 @@ async def generate_pdf(link: str, name: str):
try:
async with async_playwright() as p:
browser = await p.chromium.launch()
page = await browser.new_page()
await page.goto(link, wait_until='networkidle')
await page.pdf(path=pdf_path)
# Launch browser with stealth options to bypass WAF
browser = await p.chromium.launch(
headless=True,
args=[
'--disable-blink-features=AutomationControlled',
'--disable-dev-shm-usage',
'--no-sandbox',
'--disable-setuid-sandbox',
]
)
# Create context with realistic browser fingerprint
context = await browser.new_context(
viewport={'width': 1920, 'height': 1080},
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
locale='en-US',
timezone_id='Europe/Rome',
extra_http_headers={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9,it;q=0.8',
'Cache-Control': 'max-age=0',
'Sec-Ch-Ua': '"Chromium";v="131", "Not_A Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
}
)
page = await context.new_page()
# Remove webdriver property to avoid detection
await page.add_init_script("""
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
""")
# Navigate to the page
await page.goto(link, wait_until='networkidle', timeout=60000)
# Wait a bit to ensure everything is loaded
await page.wait_for_timeout(2000)
# Generate PDF
await page.pdf(path=pdf_path, format='A4', print_background=True)
await context.close()
await browser.close()
if os.path.exists(pdf_path):