# PDF generation utilities for ECHA substance dossier pages.
import os
|
|
import base64
|
|
import traceback
|
|
import logging # Import logging module
|
|
import datetime
|
|
import pandas as pd
|
|
# import time # Keep if you use page.wait_for_timeout
|
|
from playwright.sync_api import sync_playwright, TimeoutError # Catch specific errors
|
|
from src.func.find import search_dossier
|
|
import requests
|
|
|
|
# --- Basic Logging Setup (Commented Out) ---
|
|
# # Configure logging - uncomment and customize level/handler as needed
|
|
# logging.basicConfig(
|
|
# level=logging.INFO, # Or DEBUG for more details
|
|
# format='%(asctime)s - %(levelname)s - %(message)s',
|
|
# # filename='pdf_generator.log', # Optional: Log to a file
|
|
# # filemode='a'
|
|
# )
|
|
# --- End Logging Setup ---
|
|
|
|
|
|
# Assume svg_to_data_uri is defined elsewhere correctly
|
|
def svg_to_data_uri(svg_path):
    """Encode an SVG file as a base64 ``data:image/svg+xml`` URI.

    Used to inline logo images into the injected PDF header so the
    header template has no external file dependencies.

    Args:
        svg_path (str): Path to the SVG file on disk.

    Returns:
        str | None: The data URI string, or None if the file is missing
        or cannot be read/encoded (the error is reported to stdout).
    """
    try:
        if not os.path.exists(svg_path):
            raise FileNotFoundError(f"SVG file not found: {svg_path}")
        with open(svg_path, 'rb') as svg_file:
            raw_bytes = svg_file.read()
        encoded = base64.b64encode(raw_bytes).decode('utf-8')
        return f"data:image/svg+xml;base64,{encoded}"
    except Exception as e:
        # Best-effort helper: report and signal failure with None so the
        # caller decides whether to abort.
        print(f"Error converting SVG {svg_path}: {e}")
        return None
|
|
|
|
# --- JavaScript Expressions ---
|
|
|
|
# Define the cleanup logic as an immediately-invoked arrow function expression
|
|
# NOTE: .das-block_empty removal is currently disabled as per previous step
|
|
cleanup_js_expression = """
|
|
() => {
|
|
console.log('Running cleanup JS (DISABLED .das-block_empty removal)...');
|
|
let totalRemoved = 0;
|
|
|
|
// Example 1: Remove sections explicitly marked as empty (Currently Disabled)
|
|
// const emptyBlocks = document.querySelectorAll('.das-block_empty');
|
|
// emptyBlocks.forEach(el => {
|
|
// if (el && el.parentNode) {
|
|
// console.log(`Removing '.das-block_empty' block with ID: ${el.id || 'N/A'}`);
|
|
// el.remove();
|
|
// totalRemoved++;
|
|
// }
|
|
// });
|
|
|
|
// Add other specific cleanup logic here if needed
|
|
|
|
console.log(`Cleanup script removed ${totalRemoved} elements (DISABLED .das-block_empty removal).`);
|
|
return totalRemoved; // Return the count
|
|
}
|
|
"""
|
|
# --- End JavaScript Expressions ---
|
|
|
|
|
|
def generate_pdf_with_header_and_cleanup(
    url,
    pdf_path,
    substance_name,
    substance_link,
    ec_number,
    cas_number,
    header_template_path=r"src\func\resources\injectableHeader.html",
    echa_chem_logo_path=r"src\func\resources\echa_chem_logo.svg",
    echa_logo_path=r"src\func\resources\ECHA_Logo.svg"
) -> bool:
    """
    Generates a PDF with a dynamic header and optionally removes empty sections.

    The page at `url` is loaded in headless Chromium, the header template
    (with logos inlined and placeholders filled in) is injected at the top
    of <body>, print-friendly CSS overrides are applied, the cleanup
    JavaScript is run, and the result is printed to `pdf_path`.

    Args:
        url (str): The target URL OR local HTML file path.
        pdf_path (str): The output PDF path.
        substance_name (str): The name of the chemical substance.
        substance_link (str): The URL the substance name should link to (in header).
        ec_number (str): The EC number for the substance.
        cas_number (str): The CAS number for the substance.
        header_template_path (str): Path to the HTML header template file.
        echa_chem_logo_path (str): Path to the echa_chem_logo.svg file.
        echa_logo_path (str): Path to the ECHA_Logo.svg file.

    Returns:
        bool: True if the PDF was generated successfully, False otherwise.
    """
    final_header_html = None

    # --- 1. Prepare Header HTML ---
    try:
        print(f"Reading header template from: {header_template_path}")
        if not os.path.exists(header_template_path):
            raise FileNotFoundError(f"Header template file not found: {header_template_path}")
        with open(header_template_path, 'r', encoding='utf-8') as f:
            header_template_content = f.read()
        if not header_template_content:
            raise ValueError("Header template file is empty.")

        print("Converting logos...")
        logo1_data_uri = svg_to_data_uri(echa_chem_logo_path)
        logo2_data_uri = svg_to_data_uri(echa_logo_path)
        if not logo1_data_uri or not logo2_data_uri:
            raise ValueError("Failed to convert one or both logos to Data URIs.")

        print("Replacing placeholders...")
        # Data-driven substitution avoids a repetitive .replace() chain.
        replacements = {
            "##ECHA_CHEM_LOGO_SRC##": logo1_data_uri,
            "##ECHA_LOGO_SRC##": logo2_data_uri,
            "##SUBSTANCE_NAME##": substance_name,
            "##SUBSTANCE_LINK##": substance_link,
            "##EC_NUMBER##": ec_number,
            "##CAS_NUMBER##": cas_number,
        }
        final_header_html = header_template_content
        for placeholder, value in replacements.items():
            final_header_html = final_header_html.replace(placeholder, value)

        # Any leftover '##' suggests a template placeholder we did not fill.
        if "##" in final_header_html:
            print("Warning: Not all placeholders seem replaced in the header HTML.")

    except Exception as e:
        print(f"Error during header setup phase: {e}")
        traceback.print_exc()
        return False  # Header setup failure: nothing sensible to render.
    # --- End Header Prep ---

    # --- CSS Override Definition ---
    # Force scrollable/clipped containers to expand so their full content is
    # paginated, and keep the injected header glued to the document start.
    selectors_to_fix = [
        '.das-field .das-field_value_html',
        '.das-field .das-field_value_large',
        '.das-field .das-value_remark-text'
    ]
    css_selector_string = ",\n".join(selectors_to_fix)
    css_override = f"""
    <style id='pdf-override-styles'>
    /* Basic Resets & Overflows */
    html, body {{ height: auto !important; overflow: visible !important; margin: 0 !important; padding: 0 !important; }}
    * {{ box-sizing: border-box; }}
    {css_selector_string} {{
        overflow: visible !important; overflow-y: visible !important; height: auto !important; max-height: none !important;
    }}
    /* Boundary Fix */
    #pdf-custom-header {{ margin-bottom: 0 !important; padding-bottom: 1px !important; page-break-after: auto !important; display: block !important; }}
    #pdf-custom-header + .body-inner {{ margin-top: 0 !important; padding-top: 0 !important; page-break-before: auto !important; display: block !important; }}
    .body-inner .document-header {{ margin-top: 0 !important; padding-top: 0 !important; page-break-before: auto !important; }}
    /* Simplified Page Breaks */
    .body-inner h1, .body-inner h2, .body-inner h3, .body-inner h4, .body-inner h5, .body-inner h6 {{ page-break-after: avoid !important; }}
    #pdf-custom-header h2 {{ page-break-after: auto !important; }}
    @media print {{
        html, body {{ height: auto !important; overflow: visible !important; margin: 0; padding: 0; }}
        #pdf-custom-header {{ margin-bottom: 0 !important; padding-bottom: 1px !important; page-break-after: auto !important; display: block !important;}}
        #pdf-custom-header + .body-inner {{ margin-top: 0 !important; padding-top: 0 !important; page-break-before: auto !important; display: block !important; }}
        .body-inner .document-header {{ margin-top: 0 !important; padding-top: 0 !important; page-break-before: auto !important; }}
        .body-inner h1, .body-inner h2, .body-inner h3, .body-inner h4, .body-inner h5, .body-inner h6 {{ page-break-after: avoid !important; }}
        #pdf-custom-header h2 {{ page-break-after: auto !important; }}
        .das-doc-toolbar, .document-header__section-links, #das-totop {{ display: none !important; }}
    }}
    </style>
    """
    # --- End CSS Override Definition ---

    # --- Playwright Automation ---
    try:
        with sync_playwright() as p:
            # browser = p.chromium.launch(headless=False, devtools=True)  # For debugging
            browser = p.chromium.launch()
            try:
                page = browser.new_page()
                # Surface in-page console output (incl. the cleanup script's
                # logging) on our stdout.
                page.on("console", lambda msg: print(f"Browser Console: {msg.text}"))

                try:
                    print(f"Navigating to: {url}")
                    # A bare filesystem path must become a file:// URL before
                    # Chromium will load it.
                    if os.path.exists(url) and not url.startswith('file://'):
                        page_url = f'file://{os.path.abspath(url)}'
                        print(f"Treating as local file: {page_url}")
                    else:
                        page_url = url

                    page.goto(page_url, wait_until='load', timeout=90000)

                    print("Injecting header HTML...")
                    # Plain strings here: there is nothing to interpolate, so
                    # no f-prefix (which forced '{{ }}' escaping before).
                    page.evaluate(
                        '(headerHtml) => { document.body.insertAdjacentHTML("afterbegin", headerHtml); }',
                        final_header_html
                    )

                    print("Injecting CSS overrides...")
                    page.evaluate(
                        """(css) => {
                            const existingStyle = document.getElementById('pdf-override-styles');
                            if (existingStyle) existingStyle.remove();
                            document.head.insertAdjacentHTML('beforeend', css);
                        }""",
                        css_override
                    )

                    print("Running JavaScript cleanup function...")
                    elements_removed_count = page.evaluate(cleanup_js_expression)
                    print(f"Cleanup script finished (reported removing {elements_removed_count} elements).")

                    # --- Optional: Emulate Print Media ---
                    # page.emulate_media(media='print')

                    # --- Generate PDF ---
                    print(f"Generating PDF: {pdf_path}")
                    pdf_options = {
                        "path": pdf_path, "format": "A4", "print_background": True,
                        "margin": {'top': '20px', 'bottom': '20px', 'left': '20px', 'right': '20px'},
                        "scale": 1.0
                    }
                    page.pdf(**pdf_options)
                    print(f"PDF saved successfully to: {pdf_path}")
                    return True  # Indicate success

                except TimeoutError as e:
                    print(f"A Playwright TimeoutError occurred: {e}")
                    traceback.print_exc()
                    return False  # Indicate failure
                except Exception as e:
                    # Other errors during page operations.
                    print(f"An unexpected error occurred during Playwright page operations: {e}")
                    traceback.print_exc()
                    # Best effort: dump the page's current HTML for debugging.
                    try:
                        html_content = page.content()
                        # splitext is robust even if '.pdf' appears mid-path
                        # (str.replace would hit the first occurrence anywhere).
                        error_html_path = os.path.splitext(pdf_path)[0] + '_error.html'
                        with open(error_html_path, 'w', encoding='utf-8') as f_err:
                            f_err.write(html_content)
                        print(f"Saved HTML state on error to: {error_html_path}")
                    except Exception as save_e:
                        print(f"Could not save HTML state on error: {save_e}")
                    return False  # Indicate failure
            finally:
                # Guarantee the browser is closed on every path (success,
                # timeout, or unexpected error).
                print("Closing browser.")
                browser.close()

    except Exception as e:
        # Catch errors during Playwright startup/teardown (less common).
        print(f"An error occurred during Playwright setup/teardown: {e}")
        traceback.print_exc()
        return False  # Indicate failure
|
|
|
|
|
|
# --- Example Usage ---
|
|
# result = generate_pdf_with_header_and_cleanup(
|
|
# url='path/to/your/input.html',
|
|
# pdf_path='output.pdf',
|
|
# substance_name='Glycerol Example',
|
|
# substance_link='http://example.com/glycerol',
|
|
# ec_number='200-289-5',
|
|
# cas_number='56-81-5',
|
|
# )
|
|
#
|
|
# if result:
|
|
# print("PDF Generation Succeeded.")
|
|
# # logging.info("Main script: PDF Generation Succeeded.") # Example logging
|
|
# else:
|
|
# print("PDF Generation Failed.")
|
|
# # logging.error("Main script: PDF Generation Failed.") # Example logging
|
|
|
|
|
|
def _fetch_and_save_html(page_name: str, raw_html_url: str, html_full_path: str) -> bool:
    """
    Download the raw HTML for one page type and write it to disk.

    Errors are reported to stdout but never raised, so a failed HTML
    snapshot does not prevent the caller from attempting PDF generation.

    Args:
        page_name (str): Page type name, used only in log messages.
        raw_html_url (str): URL to fetch.
        html_full_path (str): Destination file path for the HTML.

    Returns:
        bool: True if the HTML was fetched and written successfully.
    """
    try:
        print(f"Fetching raw HTML from: {raw_html_url}")
        # Mimic a regular browser; some servers reject default client UAs.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(raw_html_url, timeout=30, headers=headers)  # 30s timeout
        response.raise_for_status()  # Raise for 4xx/5xx status codes

        # Decode explicitly as UTF-8; errors='replace' avoids hard failures
        # on stray bytes at the cost of substitution characters.
        html_content = response.content.decode('utf-8', errors='replace')

        with open(html_full_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"Successfully saved raw HTML to: {html_full_path}")
        return True
    except requests.exceptions.RequestException as req_e:
        print(f"Error fetching raw HTML for {page_name} from {raw_html_url}: {req_e}")
    except IOError as io_e:
        print(f"Error saving raw HTML for {page_name} to {html_full_path}: {io_e}")
    except Exception as e:  # e.g. unexpected decoding problems
        print(f"Unexpected error saving HTML for {page_name}: {e}")
    return False


def search_generate_pdfs(
    cas_number_to_search: str,
    page_types_to_extract: list[str],
    base_output_folder: str = "data/library"
) -> bool:
    """
    Searches for a substance by CAS, saves raw HTML and generates PDFs for
    specified page types. Uses '_js' link variant for the PDF header link if available.

    Args:
        cas_number_to_search (str): CAS number to search for.
        page_types_to_extract (list[str]): List of page type names (e.g., 'RepeatedDose').
            Expects '{page_type}' and '{page_type}_js' keys in the search result.
        base_output_folder (str): Root directory for saving HTML/PDFs.

    Returns:
        bool: True if substance found and >=1 requested PDF generated, False otherwise.
    """
    print(f"\n===== Processing request for CAS: {cas_number_to_search} =====")

    # --- 1. Search for Dossier Information ---
    try:
        search_result = search_dossier(substance=cas_number_to_search, input_type='rmlCas')
    except Exception as e:
        print(f"Error during dossier search for CAS '{cas_number_to_search}': {e}")
        traceback.print_exc()
        return False

    if not search_result:
        print(f"Substance not found or search failed for CAS: {cas_number_to_search}")
        return False

    print(f"Substance found: {search_result.get('rmlName', 'N/A')}")

    # --- 2. Extract Details and Filter Pages ---
    try:
        rml_id = search_result.get('rmlId')
        rml_name = search_result.get('rmlName')
        rml_cas = search_result.get('rmlCas')
        rml_ec = search_result.get('rmlEc')
        asset_ext_id = search_result.get('assetExternalId')

        # All of these must be present and truthy; they feed folder naming
        # and the PDF header. (asset_ext_id is validated for completeness
        # but not otherwise used here.)
        required = {'rmlId': rml_id, 'rmlName': rml_name, 'rmlCas': rml_cas,
                    'rmlEc': rml_ec, 'assetExternalId': asset_ext_id}
        missing_keys = [k for k, v in required.items() if not v]
        if missing_keys:
            print(f"Error: Search result for {cas_number_to_search} is missing required keys: {missing_keys}")
            return False

        # Collect (page_name, raw_url, js_url) triples. Only page types with
        # BOTH a valid raw URL (PDF content source) and a valid '_js' URL
        # (header hyperlink) are processed.
        pages_to_process_list = []
        for page_type in page_types_to_extract:
            raw_url = search_result.get(page_type)
            js_url = search_result.get(f"{page_type}_js")

            if raw_url and isinstance(raw_url, str) and raw_url.strip():
                if js_url and isinstance(js_url, str) and js_url.strip():
                    pages_to_process_list.append((page_type, raw_url, js_url))
                else:
                    # Raw URL present but no usable JS URL: skip rather than
                    # fall back, so the header link is never wrong.
                    print(f"Found raw URL for '{page_type}' but missing/invalid JS URL ('{js_url}'). Skipping PDF generation for this type.")
            elif page_type in search_result:
                print(f"Found page type key '{page_type}' for {rml_cas}, but its value is not a valid URL ('{raw_url}'). Skipping.")
            else:
                print(f"Requested page type key '{page_type}' not found in search results for {rml_cas}.")

        if not pages_to_process_list:
            print(f"After filtering, no requested page types ({page_types_to_extract}) resulted in a valid pair of Raw and JS URLs for substance {rml_cas}.")
            return False  # Nothing to generate

    except Exception as e:
        print(f"Error processing search result for '{cas_number_to_search}': {e}")
        traceback.print_exc()
        return False

    # --- 3. Prepare Folders ---
    # Sanitize path separators out of BOTH identifiers used in the folder
    # name (previously only the CAS was sanitized).
    safe_cas = rml_cas.replace('/', '_').replace('\\', '_')
    safe_ec = str(rml_ec).replace('/', '_').replace('\\', '_')
    substance_folder_path = os.path.join(base_output_folder, f"{safe_cas}_{safe_ec}_{rml_id}")

    try:
        os.makedirs(substance_folder_path, exist_ok=True)
        print(f"Ensured output directory exists: {substance_folder_path}")
    except OSError as e:
        print(f"Error creating directory {substance_folder_path}: {e}")
        return False

    # --- 4. Process Each Page (Save HTML, Generate PDF) ---
    successful_pages = []  # Page types whose PDF generation succeeded

    for page_name, raw_html_url, js_header_link in pages_to_process_list:
        print(f"\nProcessing page: {page_name}")
        base_filename = f"{safe_cas}_{page_name}"
        html_full_path = os.path.join(substance_folder_path, f"{base_filename}.html")
        pdf_full_path = os.path.join(substance_folder_path, f"{base_filename}.pdf")

        # Save the raw HTML snapshot. Best effort: failure is logged by the
        # helper and does not affect the overall return value.
        _fetch_and_save_html(page_name, raw_html_url, html_full_path)

        # Generate the PDF: the raw URL supplies the content, the JS URL is
        # used as the clickable substance link in the injected header.
        print(f"Generating PDF using content from: {raw_html_url}")
        pdf_success = generate_pdf_with_header_and_cleanup(
            url=raw_html_url,
            pdf_path=pdf_full_path,
            substance_name=rml_name,
            substance_link=js_header_link,
            ec_number=rml_ec,
            cas_number=rml_cas
        )

        if pdf_success:
            successful_pages.append(page_name)
            print(f"Successfully generated PDF for {page_name}")
        else:
            print(f"Failed to generate PDF for {page_name}")

    print(f"===== Finished request for CAS: {cas_number_to_search} =====")
    print(f"Successfully generated {len(successful_pages)} PDFs: {successful_pages}")
    # Success means at least one requested PDF was generated.
    return bool(successful_pages)