# cosmoguard-bd/old/_old/find.py
# Snapshot: 2025-11-15 16:02:37 +01:00 — 497 lines, 20 KiB, Python, no EOL
import requests
import urllib.parse
import json
import logging
import re
from datetime import datetime
from bs4 import BeautifulSoup
from typing import Dict, Union, Optional, Any
# Logging configuration: append "{"-style formatted records to a local
# ".log" file (NOTE: a hidden dotfile in the current working directory).
logging.basicConfig(
    format="{asctime} - {levelname} - {message}",
    style="{",  # enables str.format-style placeholders in `format`
    datefmt="%Y-%m-%d %H:%M",
    filename=".log",
    encoding="utf-8",
    filemode="a",  # append across runs rather than truncating
    level=logging.INFO,
)
# Constants for API endpoints
QUACKO_BASE_URL = "https://chem.echa.europa.eu"
QUACKO_SUBSTANCE_API = f"{QUACKO_BASE_URL}/api-substance/v1/substance"
QUACKO_DOSSIER_API = f"{QUACKO_BASE_URL}/api-dossier-list/v1/dossier"
QUACKO_HTML_PAGES = f"{QUACKO_BASE_URL}/html-pages"
# Default sections to look for in the dossier.
# Keys are the HTML `div` ids used on the dossier index page; values are the
# keys under which the matching section links are stored in the search results.
DEFAULT_SECTIONS = {
    "id_7_Toxicologicalinformation": "ToxSummary",
    "id_72_AcuteToxicity": "AcuteToxicity",
    "id_75_Repeateddosetoxicity": "RepeatedDose",
    "id_6_Ecotoxicologicalinformation": "EcotoxSummary",
    "id_76_Genetictoxicity" : 'GeneticToxicity',
    "id_42_Meltingpointfreezingpoint" : "MeltingFreezingPoint",
    "id_43_Boilingpoint" : "BoilingPoint",
    "id_48_Watersolubility" : "WaterSolubility",
    "id_410_Surfacetension" : "SurfaceTension",
    "id_420_pH" : "pH",
    "Test" : "Test2"  # NOTE(review): looks like a leftover debug entry — confirm before relying on it
}
def search_dossier(
    substance: str,
    input_type: str = 'rmlCas',
    sections: Dict[str, str] = None,
    local_index_path: str = None
) -> Union[Dict[str, Any], str, bool]:
    """
    Search for a chemical substance in the QUACKO database and retrieve its dossier information.

    Args:
        substance (str): The identifier of the substance to search for (e.g. CAS number, name)
        input_type (str): The type of identifier provided. Options: 'rmlCas', 'rmlName', 'rmlEc'
        sections (Dict[str, str], optional): Dictionary mapping section IDs to result keys.
            If None, default sections will be used.
        local_index_path (str, optional): Path to a local index.html file to parse instead of
            downloading from QUACKO. If provided, the function will skip all API calls and
            only extract sections from the local file.

    Returns:
        Union[Dict[str, Any], str, bool]:
            - dict with substance information and dossier links on success,
            - error message string if a substance was found but its ``input_type``
              field does not match the query,
            - False if the substance was not found or another critical error occurred.
    """
    # Use default sections if none provided (never mutate a shared default).
    if sections is None:
        sections = DEFAULT_SECTIONS
    try:
        # Local-file mode: no API calls, only parse the given index.html.
        if local_index_path:
            logging.info(f"QUACKO.search() - Using local index file: {local_index_path}")
            asset_id, rml_id = _ids_from_local_index_path(local_index_path)
            # Record the (possibly placeholder) ids for output consistency.
            results = {"assetExternalId": asset_id, "rmlId": rml_id}
            section_links = get_section_links_from_file(local_index_path, asset_id, rml_id, sections)
            if section_links:
                results.update(section_links)
            return results

        # Step 1: Look the substance up by its identifier.
        substance_data = get_substance_by_identifier(substance)
        if not substance_data:
            return False
        # Verify that the found substance matches the input identifier.
        if substance_data.get(input_type) != substance:
            error_msg = (f"Search error: results[{input_type}] (\"{substance_data.get(input_type)}\") "
                         f"is not equal to \"{substance}\". Maybe you specified the wrong input_type. "
                         f"Check the results here: {substance_data.get('search_response')}")
            logging.error(f"QUACKO.search(): {error_msg}")
            return error_msg

        # Step 2: Find dossiers for the substance.
        rml_id = substance_data["rmlId"]
        dossier_data = get_dossier_by_rml_id(rml_id, substance)
        if not dossier_data:
            return False
        # Merge substance and dossier data (dossier keys win on conflict).
        results = {**substance_data, **dossier_data}

        # Step 3: Extract detailed information from the dossier index page.
        asset_external_id = dossier_data["assetExternalId"]
        section_links = get_section_links_from_index(asset_external_id, rml_id, sections)
        if section_links:
            results.update(section_links)
        logging.info(f"QUACKO.search() OK. output: {json.dumps(results)}")
        return results
    except Exception as e:
        logging.error(f"QUACKO.search(): Unexpected error in search_dossier for '{substance}': {str(e)}")
        return False


def _ids_from_local_index_path(local_index_path: str) -> "tuple[str, str]":
    """
    Derive ``(asset_id, rml_id)`` from a local index.html path.

    If the path follows the expected ``.../html-pages/ASSET_ID/index.html``
    layout, the real asset id is extracted (rml_id stays the "extracted"
    placeholder); otherwise both values fall back to the "local" placeholder.
    """
    if '/' in local_index_path:
        path_parts = local_index_path.split('/')
        if 'html-pages' in path_parts and 'index.html' in path_parts[-1]:
            pos = path_parts.index('html-pages')
            # Guard: 'html-pages' as the last component would previously
            # raise IndexError on path_parts[pos + 1].
            if pos + 1 < len(path_parts):
                return path_parts[pos + 1], "extracted"
    return "local", "local"
def get_substance_by_identifier(substance: str) -> Optional[Dict[str, str]]:
    """
    Search the QUACKO database for a substance using the provided identifier.

    Args:
        substance (str): The substance identifier to search for (CAS number, name, etc.)

    Returns:
        Optional[Dict[str, str]]: Dictionary with substance information
        (search URL, rmlId/rmlName/rmlCas/rmlEc of the FIRST hit) or None
        if not found or on request/parsing errors.
    """
    encoded_substance = urllib.parse.quote(substance)
    search_url = f"{QUACKO_SUBSTANCE_API}?pageIndex=1&pageSize=100&searchText={encoded_substance}"
    logging.info(f'QUACKO.search(). searching "{substance}"')
    try:
        # A timeout prevents the request from hanging indefinitely.
        response = requests.get(search_url, timeout=30)
        response.raise_for_status()  # Raise exception for HTTP errors
        data = response.json()
        # An absent or empty "items" list means no match.
        if not data.get("items"):
            logging.info(f"QUACKO.search() could not find substance for '{substance}'")
            return None
        # Only the first hit is used; the identifier match is verified by the caller.
        substance_index = data["items"][0]["substanceIndex"]
        result = {
            'search_response': search_url,
            'rmlId': substance_index.get("rmlId", ""),
            'rmlName': substance_index.get("rmlName", ""),
            'rmlCas': substance_index.get("rmlCas", ""),
            'rmlEc': substance_index.get("rmlEc", "")
        }
        logging.info(
            f"QUACKO.search() found substance on QUACKO. "
            f"rmlId: '{result['rmlId']}', rmlName: '{result['rmlName']}', rmlCas: '{result['rmlCas']}'"
        )
        return result
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while searching for substance '{substance}': {str(e)}")
        return None
    except (KeyError, IndexError, ValueError) as e:
        # ValueError also covers a malformed JSON body from response.json().
        logging.error(f"QUACKO.search() - Data parsing error for substance '{substance}': {str(e)}")
        return None
def get_dossier_by_rml_id(rml_id: str, substance_name: str) -> Optional[Dict[str, Any]]:
    """
    Locate dossiers for a substance via its RML ID.

    Active dossiers are preferred; inactive ones are only consulted when no
    active dossier exists. The returned dict is tagged with a "dossierType"
    key ("Active" or "Inactive").

    Args:
        rml_id (str): The RML ID of the substance
        substance_name (str): The name of the substance (for logging)

    Returns:
        Optional[Dict[str, Any]]: Dossier information dict, or None when
        neither active nor inactive dossiers are found.
    """
    # Preferred path: active dossiers.
    active = _query_dossier_api(rml_id, "Active")
    if active:
        logging.info(f"QUACKO.search() - found active dossiers for '{substance_name}'")
        active["dossierType"] = "Active"
        return active

    # Fallback path: inactive dossiers.
    logging.info(
        f"QUACKO.search() - could not find active dossier for '{substance_name}'. "
        "Proceeding to search in the unactive ones."
    )
    inactive = _query_dossier_api(rml_id, "Inactive")
    if not inactive:
        logging.info(f"QUACKO.search() - could not find unactive dossiers for '{substance_name}'")
        return None
    logging.info(f"QUACKO.search() - found unactive dossiers for '{substance_name}'")
    inactive["dossierType"] = "Inactive"
    return inactive
def _query_dossier_api(rml_id: str, status: str) -> Optional[Dict[str, Any]]:
    """
    Query the QUACKO dossier API for a substance with a given registration status.

    Args:
        rml_id (str): The RML ID of the substance
        status (str): The status of dossiers to search for ('Active' or 'Inactive')

    Returns:
        Optional[Dict[str, Any]]: Dictionary with dossier information (asset
        external id, root key, optional lastUpdateDate, and index URLs) built
        from the FIRST item returned, or None if nothing was found.
    """
    url = f"{QUACKO_DOSSIER_API}?pageIndex=1&pageSize=100&rmlId={rml_id}&registrationStatuses={status}"
    try:
        # A timeout prevents the request from hanging indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        items = data.get("items")
        if not items:
            return None
        first_item = items[0]
        result = {
            "assetExternalId": first_item["assetExternalId"],
            "rootKey": first_item["rootKey"],
        }
        # Extract the last update date if available. The trailing 'Z' is
        # normalised to '+00:00' because datetime.fromisoformat() on older
        # Python versions does not accept the 'Z' suffix.
        try:
            last_update = first_item["lastUpdatedDate"]
            datetime_object = datetime.fromisoformat(last_update.replace('Z', '+00:00'))
            result['lastUpdateDate'] = datetime_object.date().isoformat()
        except (KeyError, ValueError, TypeError, AttributeError) as e:
            # TypeError/AttributeError cover a null or non-string date field,
            # which previously escaped this handler.
            logging.error(f"QUACKO.search() - Error extracting lastUpdateDate: {str(e)}")
        # Add URLs for the static HTML index page and the JS dossier view.
        result["index"] = f"{QUACKO_HTML_PAGES}/{result['assetExternalId']}/index.html"
        result["index_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{result['assetExternalId']}"
        return result
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while getting dossiers for RML ID '{rml_id}': {str(e)}")
        return None
    except (KeyError, IndexError, ValueError) as e:
        # ValueError also covers a malformed JSON body from response.json().
        logging.error(f"QUACKO.search() - Data parsing error for RML ID '{rml_id}': {str(e)}")
        return None
def get_section_links_from_index(
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Extract links to specified sections by downloading the dossier index page.

    Args:
        asset_id (str): The asset external ID of the dossier
        rml_id (str): The RML ID of the substance
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
        (empty on download or parsing failure).
    """
    index_url = f"{QUACKO_HTML_PAGES}/{asset_id}/index.html"
    try:
        # A timeout prevents the request from hanging indefinitely.
        response = requests.get(index_url, timeout=30)
        response.raise_for_status()
        # Delegate the actual HTML parsing to the shared helper.
        return parse_sections_from_html(response.text, asset_id, rml_id, sections)
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while extracting section links: {str(e)}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error extracting section links: {str(e)}")
        return {}
def get_section_links_from_file(
    file_path: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Extract links to specified sections from a local index.html file.

    Args:
        file_path (str): Path to the local index.html file
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections,
        or an empty dict when the file is missing or cannot be parsed.
    """
    try:
        # EAFP: just open the file and let FileNotFoundError signal absence.
        # The previous pre-check called os.path.exists() without importing os,
        # so it raised a NameError that the broad except swallowed and this
        # function unconditionally returned {}.
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        # Parse content using the shared method
        return parse_sections_from_html(html_content, asset_id, rml_id, sections)
    except FileNotFoundError:
        logging.error(f"QUACKO.search() - File not found: {file_path}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing local file {file_path}: {str(e)}")
        return {}
def parse_sections_from_html(
    html_content: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Parse HTML content and collect links for each requested section.

    Args:
        html_content (str): HTML content to parse
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Links for every section that was found (may be empty);
        an empty dict on any parsing failure.
    """
    collected = {}
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        # Ask extract_section_links() for each requested section in turn,
        # logging whether the section produced a link or not.
        for section_id, section_name in sections.items():
            links = extract_section_links(soup, section_id, asset_id, rml_id, section_name)
            if not links:
                logging.info(f"QUACKO.search() - Section '{section_name}' not found in document")
                continue
            collected.update(links)
            logging.info(f"QUACKO.search() - Found section '{section_name}' in document")
        return collected
    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing HTML content: {str(e)}")
        return {}
# --------------------------------------------------------------------------
# Function to Extract Section Links with Validation
# --------------------------------------------------------------------------
# This function extracts the document link associated with a specific section ID
# from the QUACKO index.html page structure.
#
# Problem Solved:
# Previous attempts faced issues where searching for a link within a parent
# section's div (e.g., "7 Toxicological Information" with id="id_7_...")
# would incorrectly grab the link belonging to the *first child* section
# (e.g., "7.2 Acute Toxicity" with id="id_72_..."). This happened because
# the simple `find("a", href=True)` doesn't distinguish ownership when nested.
#
# Solution Logic:
# 1. Find Target Div: Locate the `div` element using the specific `section_id` provided.
# This div typically contains the section's content or nested subsections.
# 2. Find First Link: Find the very first `<a>` tag that has an `href` attribute
# somewhere *inside* the `target_div`.
# 3. Find Link's Owning Section Div: Starting from the `first_link_tag`, traverse
# up the HTML tree using `find_parent()` to find the nearest ancestor `div`
# whose `id` attribute starts with "id_" (the pattern for section containers).
# 4. Validate Ownership: Compare the `id` of the `link_ancestor_section_div` found
# in step 3 with the original `section_id` passed into the function.
# 5. Decision:
# - If the IDs MATCH: It confirms that the `first_link_tag` truly belongs to the
# `section_id` we are querying. The function proceeds to extract and format
# this link.
# - If the IDs DO NOT MATCH: It indicates that the first link found actually
# belongs to a *nested* subsection div. Therefore, the original `section_id`
# (the parent/container) does not have its own direct link, and the function
# correctly returns an empty dictionary for this `section_id`.
#
# This validation step ensures that we only return links that are directly
# associated with the queried section ID, preventing the inheritance bug.
# --------------------------------------------------------------------------
def extract_section_links(
    soup: BeautifulSoup,
    section_id: str,
    asset_id: str,
    rml_id: str,
    section_name: str
) -> Dict[str, str]:
    """
    Extracts a link for a specific section ID by finding the first link
    within its div and verifying that the link belongs directly to that
    section, not a nested subsection (see the explanatory comment above).

    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the index page.
        section_id (str): The HTML ID of the section div.
        asset_id (str): The asset external ID of the dossier.
        rml_id (str): The RML ID of the substance.
        section_name (str): The name to use for the section in the result.

    Returns:
        Dict[str, str]: Dictionary with link if found and validated,
        otherwise empty.
    """
    result = {}
    # 1. Find the target div for the section ID.
    target_div = soup.find("div", id=section_id)
    if not target_div:
        logging.info(f"QUACKO.search() - extract_section_links(): No div found for id='{section_id}'")
        return result
    # 2. Find the first <a> tag with an href within this target div.
    first_link_tag = target_div.find("a", href=True)
    if not first_link_tag:
        logging.info(f"QUACKO.search() - extract_section_links(): No 'a' tag with href found within div id='{section_id}'")
        return result  # No links at all within this section
    # 3. Validate ownership: find the closest ancestor div whose id starts
    #    with "id_" (the section-container pattern). This tells us which
    #    section the link *actually* resides in.
    #    (The previous `Optional[Tag]` annotation was dropped: `Tag` was
    #    never imported from bs4.)
    link_ancestor_section_div = first_link_tag.find_parent(
        "div", id=lambda x: x and x.startswith("id_")
    )
    # 4. Compare IDs: a mismatch means the first link belongs to a nested
    #    subsection, so the queried section has no direct link of its own.
    if link_ancestor_section_div and link_ancestor_section_div.get('id') == section_id:
        logging.debug(f"QUACKO.search() - extract_section_links(): Valid link found for id='{section_id}'.")
        a_tag_to_use = first_link_tag  # Use the link we found
    else:
        ancestor_id = link_ancestor_section_div.get('id') if link_ancestor_section_div else "None"
        logging.info(f"QUACKO.search() - extract_section_links(): First link within id='{section_id}' belongs to ancestor id='{ancestor_id}'. No direct link for '{section_id}'.")
        return result  # Return empty dict
    # 5. Proceed with link extraction using the validated a_tag_to_use.
    try:
        document_id = a_tag_to_use.get('href')  # Use .get() for safety
        if not document_id:
            logging.error(f"QUACKO.search() - extract_section_links(): Found 'a' tag for '{section_name}' has no href attribute.")
            return {}
        # Strip only a LEADING './documents/' and a TRAILING '.html'.
        # The previous str.replace() calls removed every occurrence of these
        # substrings anywhere in the href, which could corrupt unusual ids.
        prefix = './documents/'
        if document_id.startswith(prefix):
            document_id = document_id[len(prefix):]
        suffix = '.html'
        if document_id.endswith(suffix):
            document_id = document_id[:-len(suffix)]
        # Construct the full URLs unless in local-only mode.
        if asset_id == "local" and rml_id == "local":
            result[section_name] = f"Local section found: {document_id}"
        else:
            result[section_name] = f"{QUACKO_HTML_PAGES}/{asset_id}/documents/{document_id}.html"
            result[f"{section_name}_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{asset_id}/{document_id}"
        return result
    except Exception as e:  # Catch potential errors during processing
        logging.error(f"QUACKO.search() - extract_section_links(): Error processing the validated link tag for '{section_name}': {str(e)}")
        return {}