From c30561bc896e964b9d0c17c22cd5e892c7a0315e Mon Sep 17 00:00:00 2001 From: adish-rmr Date: Mon, 8 Dec 2025 12:23:27 +0100 Subject: [PATCH] update fix masking agent scraper --- src/pif_compiler/functions/common_func.py | 56 +++++++++++++++++++++-- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/src/pif_compiler/functions/common_func.py b/src/pif_compiler/functions/common_func.py index 8cb8683..0145e4e 100644 --- a/src/pif_compiler/functions/common_func.py +++ b/src/pif_compiler/functions/common_func.py @@ -26,10 +26,58 @@ async def generate_pdf(link: str, name: str): try: async with async_playwright() as p: - browser = await p.chromium.launch() - page = await browser.new_page() - await page.goto(link, wait_until='networkidle') - await page.pdf(path=pdf_path) + # Launch browser with stealth options to bypass WAF + browser = await p.chromium.launch( + headless=True, + args=[ + '--disable-blink-features=AutomationControlled', + '--disable-dev-shm-usage', + '--no-sandbox', + '--disable-setuid-sandbox', + ] + ) + + # Create context with realistic browser fingerprint + context = await browser.new_context( + viewport={'width': 1920, 'height': 1080}, + user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', + locale='en-US', + timezone_id='Europe/Rome', + extra_http_headers={ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en-US,en;q=0.9,it;q=0.8', + 'Cache-Control': 'max-age=0', + 'Sec-Ch-Ua': '"Chromium";v="131", "Not_A Brand";v="24"', + 'Sec-Ch-Ua-Mobile': '?0', + 'Sec-Ch-Ua-Platform': '"Windows"', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-User': '?1', + 'Upgrade-Insecure-Requests': '1', + } + ) + + page = await context.new_page() + + # Remove webdriver property to avoid detection + await page.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + """) + + # Navigate to the page + await page.goto(link, wait_until='networkidle', timeout=60000) + + # Wait a bit to ensure everything is loaded + await page.wait_for_timeout(2000) + + # Generate PDF + await page.pdf(path=pdf_path, format='A4', print_background=True) + + await context.close() await browser.close() if os.path.exists(pdf_path):