import os
import base64
import traceback
import logging
import datetime

import pandas as pd
import requests
# NOTE: playwright's TimeoutError intentionally shadows the builtin name in this
# module; the except clauses below are meant to catch the Playwright variant.
from playwright.sync_api import sync_playwright, TimeoutError

from src.func.find import search_dossier

# --- Basic Logging Setup (Commented Out) ---
# logging.basicConfig(
#     level=logging.INFO,  # Or DEBUG for more details
#     format='%(asctime)s - %(levelname)s - %(message)s',
#     # filename='pdf_generator.log',  # Optional: Log to a file
#     # filemode='a'
# )
# --- End Logging Setup ---


def svg_to_data_uri(svg_path):
    """Convert an SVG file into a base64 'data:' URI for inline embedding.

    Args:
        svg_path (str): Path to the SVG file on disk.

    Returns:
        str | None: A 'data:image/svg+xml;base64,...' URI, or None on any
        failure (missing file, read error). Callers treat None as
        "logo conversion failed".
    """
    try:
        if not os.path.exists(svg_path):
            raise FileNotFoundError(f"SVG file not found: {svg_path}")
        with open(svg_path, 'rb') as f:
            svg_content = f.read()
        encoded_svg = base64.b64encode(svg_content).decode('utf-8')
        return f"data:image/svg+xml;base64,{encoded_svg}"
    except Exception as e:
        # Deliberate best-effort: report and signal failure via None, not raise.
        print(f"Error converting SVG {svg_path}: {e}")
        return None


# --- JavaScript Expressions ---
# Cleanup logic as an immediately-invocable arrow function expression.
# NOTE: .das-block_empty removal is currently disabled as per previous step.
cleanup_js_expression = """
() => {
    console.log('Running cleanup JS (DISABLED .das-block_empty removal)...');
    let totalRemoved = 0;

    // Example 1: Remove sections explicitly marked as empty (Currently Disabled)
    // const emptyBlocks = document.querySelectorAll('.das-block_empty');
    // emptyBlocks.forEach(el => {
    //     if (el && el.parentNode) {
    //         console.log(`Removing '.das-block_empty' block with ID: ${el.id || 'N/A'}`);
    //         el.remove();
    //         totalRemoved++;
    //     }
    // });

    // Add other specific cleanup logic here if needed

    console.log(`Cleanup script removed ${totalRemoved} elements (DISABLED .das-block_empty removal).`);
    return totalRemoved; // Return the count
}
"""
# --- End JavaScript Expressions ---


def _build_header_html(header_template_path, echa_chem_logo_path, echa_logo_path,
                       substance_name, substance_link, ec_number, cas_number):
    """Read the header template, inline both logos as data URIs and fill in the
    substance placeholders. Raises on any failure (missing/empty template,
    logo conversion error); returns the final header HTML string."""
    print(f"Reading header template from: {header_template_path}")
    if not os.path.exists(header_template_path):
        raise FileNotFoundError(f"Header template file not found: {header_template_path}")
    with open(header_template_path, 'r', encoding='utf-8') as f:
        header_template_content = f.read()
    if not header_template_content:
        raise ValueError("Header template file is empty.")

    print("Converting logos...")
    logo1_data_uri = svg_to_data_uri(echa_chem_logo_path)
    logo2_data_uri = svg_to_data_uri(echa_logo_path)
    if not logo1_data_uri or not logo2_data_uri:
        raise ValueError("Failed to convert one or both logos to Data URIs.")

    print("Replacing placeholders...")
    replacements = {
        "##ECHA_CHEM_LOGO_SRC##": logo1_data_uri,
        "##ECHA_LOGO_SRC##": logo2_data_uri,
        "##SUBSTANCE_NAME##": substance_name,
        "##SUBSTANCE_LINK##": substance_link,
        "##EC_NUMBER##": ec_number,
        "##CAS_NUMBER##": cas_number,
    }
    final_header_html = header_template_content
    for placeholder, value in replacements.items():
        final_header_html = final_header_html.replace(placeholder, value)

    # Any leftover '##' suggests a placeholder we did not know about.
    if "##" in final_header_html:
        print("Warning: Not all placeholders seem replaced in the header HTML.")
    return final_header_html


def generate_pdf_with_header_and_cleanup(
        url,
        pdf_path,
        substance_name,
        substance_link,
        ec_number,
        cas_number,
        header_template_path=r"src\func\resources\injectableHeader.html",
        echa_chem_logo_path=r"src\func\resources\echa_chem_logo.svg",
        echa_logo_path=r"src\func\resources\ECHA_Logo.svg") -> bool:
    """Generate a PDF with a dynamic header and optionally remove empty sections.

    Args:
        url (str): The target URL OR local HTML file path.
        pdf_path (str): The output PDF path.
        substance_name (str): The name of the chemical substance.
        substance_link (str): The URL the substance name should link to (in header).
        ec_number (str): The EC number for the substance.
        cas_number (str): The CAS number for the substance.
        header_template_path (str): Path to the HTML header template file.
        echa_chem_logo_path (str): Path to the echa_chem_logo.svg file.
        echa_logo_path (str): Path to the ECHA_Logo.svg file.

    Returns:
        bool: True if the PDF was generated successfully, False otherwise.
    """
    # --- 1. Prepare Header HTML ---
    try:
        final_header_html = _build_header_html(
            header_template_path, echa_chem_logo_path, echa_logo_path,
            substance_name, substance_link, ec_number, cas_number)
    except Exception as e:
        print(f"Error during header setup phase: {e}")
        traceback.print_exc()
        return False

    # --- CSS Override Definition ---
    # NOTE(review): the override body is currently empty (Revision 4 stripped
    # the rules); the selectors are kept for the next revision of the print CSS.
    selectors_to_fix = [
        '.das-field .das-field_value_html',
        '.das-field .das-field_value_large',
        '.das-field .das-value_remark-text',
    ]
    css_selector_string = ",\n".join(selectors_to_fix)  # currently unused by css_override
    css_override = """
"""
    # --- End CSS Override Definition ---

    # --- Playwright Automation ---
    try:
        with sync_playwright() as p:
            # browser = p.chromium.launch(headless=False, devtools=True)  # For debugging
            browser = p.chromium.launch()
            page = browser.new_page()
            # Forward browser console messages (use msg.text, not msg.text()).
            page.on("console", lambda msg: print(f"Browser Console: {msg.text}"))
            try:
                print(f"Navigating to: {url}")
                # A bare filesystem path is turned into a file:// URL.
                if os.path.exists(url) and not url.startswith('file://'):
                    page_url = f'file://{os.path.abspath(url)}'
                    print(f"Treating as local file: {page_url}")
                else:
                    page_url = url
                page.goto(page_url, wait_until='load', timeout=90000)

                print("Injecting header HTML...")
                page.evaluate(
                    '(headerHtml) => { document.body.insertAdjacentHTML("afterbegin", headerHtml); }',
                    final_header_html)

                print("Injecting CSS overrides...")
                page.evaluate(
                    """(css) => {
                        const existingStyle = document.getElementById('pdf-override-styles');
                        if (existingStyle) existingStyle.remove();
                        document.head.insertAdjacentHTML('beforeend', css);
                    }""",
                    css_override)

                print("Running JavaScript cleanup function...")
                elements_removed_count = page.evaluate(cleanup_js_expression)
                print(f"Cleanup script finished (reported removing {elements_removed_count} elements).")

                # --- Optional: Emulate Print Media ---
                # page.emulate_media(media='print')

                # --- Generate PDF ---
                print(f"Generating PDF: {pdf_path}")
                pdf_options = {
                    "path": pdf_path,
                    "format": "A4",
                    "print_background": True,
                    "margin": {'top': '20px', 'bottom': '20px',
                               'left': '20px', 'right': '20px'},
                    "scale": 1.0,
                }
                page.pdf(**pdf_options)
                print(f"PDF saved successfully to: {pdf_path}")
                print("Closing browser.")
                return True
            except TimeoutError as e:
                print(f"A Playwright TimeoutError occurred: {e}")
                traceback.print_exc()
                return False
            except Exception as e:
                print(f"An unexpected error occurred during Playwright page operations: {e}")
                traceback.print_exc()
                # Best effort: snapshot the page HTML next to the intended PDF.
                try:
                    html_content = page.content()
                    error_html_path = pdf_path.replace('.pdf', '_error.html')
                    with open(error_html_path, 'w', encoding='utf-8') as f_err:
                        f_err.write(html_content)
                    print(f"Saved HTML state on error to: {error_html_path}")
                except Exception as save_e:
                    print(f"Could not save HTML state on error: {save_e}")
                return False
            finally:
                # Guarantees the browser is released on every path (the original
                # leaked it if an exception type escaped the handlers above).
                browser.close()
    except Exception as e:
        # Errors during Playwright startup/teardown (less common).
        print(f"An error occurred during Playwright setup/teardown: {e}")
        traceback.print_exc()
        return False


# --- Example Usage ---
# result = generate_pdf_with_header_and_cleanup(
#     url='path/to/your/input.html',
#     pdf_path='output.pdf',
#     substance_name='Glycerol Example',
#     substance_link='http://example.com/glycerol',
#     ec_number='200-289-5',
#     cas_number='56-81-5',
# )
# print("PDF Generation Succeeded." if result else "PDF Generation Failed.")


def _collect_page_url_pairs(search_result, page_types_to_extract, rml_cas):
    """Return [(page_type, raw_url, js_url)] for every requested page type that
    has both a valid raw URL and a valid '{page_type}_js' URL in the result.
    Invalid or missing entries are reported and skipped."""
    pages_to_process_list = []
    for page_type in page_types_to_extract:
        raw_url = search_result.get(page_type)
        js_url = search_result.get(f"{page_type}_js")
        if raw_url and isinstance(raw_url, str) and raw_url.strip():
            if js_url and isinstance(js_url, str) and js_url.strip():
                pages_to_process_list.append((page_type, raw_url, js_url))
            else:
                # Raw URL present but no usable JS link for the header: skip.
                print(f"Found raw URL for '{page_type}' but missing/invalid JS URL ('{js_url}'). Skipping PDF generation for this type.")
        else:
            if page_type in search_result:
                print(f"Found page type key '{page_type}' for {rml_cas}, but its value is not a valid URL ('{raw_url}'). Skipping.")
            else:
                print(f"Requested page type key '{page_type}' not found in search results for {rml_cas}.")
    return pages_to_process_list


def _save_raw_html(raw_html_url, html_full_path, page_name):
    """Fetch raw_html_url and write it (UTF-8, undecodable bytes replaced) to
    html_full_path. Returns True on success; failures are reported, never raised."""
    try:
        print(f"Fetching raw HTML from: {raw_html_url}")
        # Mimic a browser slightly to avoid trivial bot blocking.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(raw_html_url, timeout=30, headers=headers)
        response.raise_for_status()  # Raise for 4xx/5xx status codes
        # Assume UTF-8 rather than trusting response.text's encoding guess.
        html_content = response.content.decode('utf-8', errors='replace')
        with open(html_full_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"Successfully saved raw HTML to: {html_full_path}")
        return True
    except requests.exceptions.RequestException as req_e:
        print(f"Error fetching raw HTML for {page_name} from {raw_html_url}: {req_e}")
    except IOError as io_e:
        print(f"Error saving raw HTML for {page_name} to {html_full_path}: {io_e}")
    except Exception as e:
        # Other potential errors, e.g. decoding.
        print(f"Unexpected error saving HTML for {page_name}: {e}")
    return False


def search_generate_pdfs(
        cas_number_to_search: str,
        page_types_to_extract: list[str],
        base_output_folder: str = "data/library") -> bool:
    """Search a substance by CAS, save raw HTML and generate PDFs for the
    requested page types. Uses the '_js' link variant for the PDF header link.

    Args:
        cas_number_to_search (str): CAS number to search for.
        page_types_to_extract (list[str]): Page type names (e.g. 'RepeatedDose').
            Expects '{page_type}' and '{page_type}_js' keys in the search result.
        base_output_folder (str): Root directory for saving HTML/PDFs.

    Returns:
        bool: True if the substance was found and at least one requested PDF
        was generated, False otherwise.
    """
    print(f"\n===== Processing request for CAS: {cas_number_to_search} =====")

    # --- 1. Search for Dossier Information ---
    try:
        search_result = search_dossier(substance=cas_number_to_search, input_type='rmlCas')
    except Exception as e:
        print(f"Error during dossier search for CAS '{cas_number_to_search}': {e}")
        traceback.print_exc()
        return False

    if not search_result:
        print(f"Substance not found or search failed for CAS: {cas_number_to_search}")
        return False
    print(f"Substance found: {search_result.get('rmlName', 'N/A')}")

    # --- 2. Extract Details and Filter Pages ---
    try:
        rml_id = search_result.get('rmlId')
        rml_name = search_result.get('rmlName')
        rml_cas = search_result.get('rmlCas')
        rml_ec = search_result.get('rmlEc')
        asset_ext_id = search_result.get('assetExternalId')

        # All five identifiers are required downstream (folder naming, header).
        required = {'rmlId': rml_id, 'rmlName': rml_name, 'rmlCas': rml_cas,
                    'rmlEc': rml_ec, 'assetExternalId': asset_ext_id}
        if not all(required.values()):
            missing_keys = [k for k, v in required.items() if not v]
            message = f"Search result for {cas_number_to_search} is missing required keys: {missing_keys}"
            print(f"Error: {message}")
            return False

        pages_to_process_list = _collect_page_url_pairs(
            search_result, page_types_to_extract, rml_cas)
        if not pages_to_process_list:
            print(f"After filtering, no requested page types ({page_types_to_extract}) resulted in a valid pair of Raw and JS URLs for substance {rml_cas}.")
            return False  # Nothing to generate
    except Exception as e:
        print(f"Error processing search result for '{cas_number_to_search}': {e}")
        traceback.print_exc()
        return False

    # --- 3. Prepare Folders ---
    safe_cas = rml_cas.replace('/', '_').replace('\\', '_')  # filesystem-safe CAS
    substance_folder_name = f"{safe_cas}_{rml_ec}_{rml_id}"
    substance_folder_path = os.path.join(base_output_folder, substance_folder_name)
    try:
        os.makedirs(substance_folder_path, exist_ok=True)
        print(f"Ensured output directory exists: {substance_folder_path}")
    except OSError as e:
        print(f"Error creating directory {substance_folder_path}: {e}")
        return False

    # --- 4. Process Each Page (Save HTML, Generate PDF) ---
    successful_pages = []   # Track successful PDF generations
    overall_success = False  # True once any PDF was generated
    for page_name, raw_html_url, js_header_link in pages_to_process_list:
        print(f"\nProcessing page: {page_name}")
        base_filename = f"{safe_cas}_{page_name}"
        html_full_path = os.path.join(substance_folder_path, f"{base_filename}.html")
        pdf_full_path = os.path.join(substance_folder_path, f"{base_filename}.pdf")

        # Raw HTML snapshot is best-effort and does not gate overall success.
        _save_raw_html(raw_html_url, html_full_path, page_name)

        # Generate PDF: raw URL drives the content, JS URL becomes the header link.
        print(f"Generating PDF using content from: {raw_html_url}")
        pdf_success = generate_pdf_with_header_and_cleanup(
            url=raw_html_url,
            pdf_path=pdf_full_path,
            substance_name=rml_name,
            substance_link=js_header_link,
            ec_number=rml_ec,
            cas_number=rml_cas)
        if pdf_success:
            successful_pages.append(page_name)
            overall_success = True
            print(f"Successfully generated PDF for {page_name}")
        else:
            print(f"Failed to generate PDF for {page_name}")

    print(f"===== Finished request for CAS: {cas_number_to_search} =====")
    print(f"Successfully generated {len(successful_pages)} PDFs: {successful_pages}")
    return overall_success  # Success is tied only to PDF generation