Update scraper: fix agent masking
This commit is contained in:
parent
48ca276241
commit
c30561bc89
1 changed file with 52 additions and 4 deletions
|
|
@@ -26,10 +26,58 @@ async def generate_pdf(link: str, name: str):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
browser = await p.chromium.launch()
|
# Launch browser with stealth options to bypass WAF
|
||||||
page = await browser.new_page()
|
browser = await p.chromium.launch(
|
||||||
await page.goto(link, wait_until='networkidle')
|
headless=True,
|
||||||
await page.pdf(path=pdf_path)
|
args=[
|
||||||
|
'--disable-blink-features=AutomationControlled',
|
||||||
|
'--disable-dev-shm-usage',
|
||||||
|
'--no-sandbox',
|
||||||
|
'--disable-setuid-sandbox',
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create context with realistic browser fingerprint
|
||||||
|
context = await browser.new_context(
|
||||||
|
viewport={'width': 1920, 'height': 1080},
|
||||||
|
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
||||||
|
locale='en-US',
|
||||||
|
timezone_id='Europe/Rome',
|
||||||
|
extra_http_headers={
|
||||||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
||||||
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
|
'Accept-Language': 'en-US,en;q=0.9,it;q=0.8',
|
||||||
|
'Cache-Control': 'max-age=0',
|
||||||
|
'Sec-Ch-Ua': '"Chromium";v="131", "Not_A Brand";v="24"',
|
||||||
|
'Sec-Ch-Ua-Mobile': '?0',
|
||||||
|
'Sec-Ch-Ua-Platform': '"Windows"',
|
||||||
|
'Sec-Fetch-Dest': 'document',
|
||||||
|
'Sec-Fetch-Mode': 'navigate',
|
||||||
|
'Sec-Fetch-Site': 'none',
|
||||||
|
'Sec-Fetch-User': '?1',
|
||||||
|
'Upgrade-Insecure-Requests': '1',
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
page = await context.new_page()
|
||||||
|
|
||||||
|
# Remove webdriver property to avoid detection
|
||||||
|
await page.add_init_script("""
|
||||||
|
Object.defineProperty(navigator, 'webdriver', {
|
||||||
|
get: () => undefined
|
||||||
|
});
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Navigate to the page
|
||||||
|
await page.goto(link, wait_until='networkidle', timeout=60000)
|
||||||
|
|
||||||
|
# Wait a bit to ensure everything is loaded
|
||||||
|
await page.wait_for_timeout(2000)
|
||||||
|
|
||||||
|
# Generate PDF
|
||||||
|
await page.pdf(path=pdf_path, format='A4', print_background=True)
|
||||||
|
|
||||||
|
await context.close()
|
||||||
await browser.close()
|
await browser.close()
|
||||||
|
|
||||||
if os.path.exists(pdf_path):
|
if os.path.exists(pdf_path):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue