"""Client utilities for searching the QUACKO (ECHA CHEM) database.

Given a substance identifier (CAS number, name, or EC number), these
functions locate the substance, find its (active or inactive) dossier,
and extract links to selected dossier sections from the dossier's
index.html page — either downloaded from QUACKO or read from a local file.
"""

import json
import logging
import os
import re
import urllib.parse
from datetime import datetime
from typing import Any, Dict, Optional, Union

import requests
from bs4 import BeautifulSoup, Tag

# Logging configuration: brace-style formatting, appended to a local ".log" file.
logging.basicConfig(
    format="{asctime} - {levelname} - {message}",
    style="{",
    datefmt="%Y-%m-%d %H:%M",
    filename=".log",
    encoding="utf-8",
    filemode="a",
    level=logging.INFO,
)

# Constants for API endpoints
QUACKO_BASE_URL = "https://chem.echa.europa.eu"
QUACKO_SUBSTANCE_API = f"{QUACKO_BASE_URL}/api-substance/v1/substance"
QUACKO_DOSSIER_API = f"{QUACKO_BASE_URL}/api-dossier-list/v1/dossier"
QUACKO_HTML_PAGES = f"{QUACKO_BASE_URL}/html-pages"

# Default sections to look for in the dossier.
# Maps the HTML div id on the dossier index page -> key used in the result dict.
DEFAULT_SECTIONS = {
    "id_7_Toxicologicalinformation": "ToxSummary",
    "id_72_AcuteToxicity": "AcuteToxicity",
    "id_75_Repeateddosetoxicity": "RepeatedDose",
    "id_6_Ecotoxicologicalinformation": "EcotoxSummary",
    "id_76_Genetictoxicity": "GeneticToxicity",
    "id_42_Meltingpointfreezingpoint": "MeltingFreezingPoint",
    "id_43_Boilingpoint": "BoilingPoint",
    "id_48_Watersolubility": "WaterSolubility",
    "id_410_Surfacetension": "SurfaceTension",
    "id_420_pH": "pH",
    # NOTE(review): leftover debug/test entry — it will never match a real
    # section id; kept for backward compatibility, consider removing.
    "Test": "Test2",
}


def search_dossier(
    substance: str,
    input_type: str = 'rmlCas',
    sections: Optional[Dict[str, str]] = None,
    local_index_path: Optional[str] = None,
) -> Union[Dict[str, Any], str, bool]:
    """
    Search for a chemical substance in the QUACKO database and retrieve its
    dossier information.

    Args:
        substance (str): The identifier of the substance to search for
            (e.g. CAS number, name)
        input_type (str): The type of identifier provided.
            Options: 'rmlCas', 'rmlName', 'rmlEc'
        sections (Dict[str, str], optional): Dictionary mapping section IDs to
            result keys. If None, default sections will be used.
        local_index_path (str, optional): Path to a local index.html file to
            parse instead of downloading from QUACKO. If provided, the function
            will skip all API calls and only extract sections from the local file.

    Returns:
        Union[Dict[str, Any], str, bool]:
            - Dictionary with substance information and dossier links on success,
            - error message string if substance found but with issues,
            - False if substance not found or other critical error
    """
    # Use default sections if none provided
    if sections is None:
        sections = DEFAULT_SECTIONS

    try:
        results = {}

        # If a local file is provided, extract sections from it directly
        if local_index_path:
            logging.info(f"QUACKO.search() - Using local index file: {local_index_path}")

            # We still need some minimal info for constructing the URLs
            if '/' not in local_index_path:
                asset_id = "local"
                rml_id = "local"
            else:
                # Try to extract information from the path if available
                path_parts = local_index_path.split('/')
                # If path follows expected structure: .../html-pages/ASSET_ID/index.html
                if 'html-pages' in path_parts and 'index.html' in path_parts[-1]:
                    asset_id = path_parts[path_parts.index('html-pages') + 1]
                    rml_id = "extracted"  # Just a placeholder
                else:
                    asset_id = "local"
                    rml_id = "local"

            # Add these to results for consistency
            results["assetExternalId"] = asset_id
            results["rmlId"] = rml_id

            # Extract sections from the local file
            section_links = get_section_links_from_file(local_index_path, asset_id, rml_id, sections)
            if section_links:
                results.update(section_links)
            return results

        # Normal flow with API calls
        substance_data = get_substance_by_identifier(substance)
        if not substance_data:
            return False

        # Verify that the found substance matches the input identifier
        if substance_data.get(input_type) != substance:
            error_msg = (
                f"Search error: results[{input_type}] (\"{substance_data.get(input_type)}\") "
                f"is not equal to \"{substance}\". Maybe you specified the wrong input_type. "
                f"Check the results here: {substance_data.get('search_response')}"
            )
            logging.error(f"QUACKO.search(): {error_msg}")
            return error_msg

        # Step 2: Find dossiers for the substance
        rml_id = substance_data["rmlId"]
        dossier_data = get_dossier_by_rml_id(rml_id, substance)
        if not dossier_data:
            return False

        # Merge substance and dossier data
        results = {**substance_data, **dossier_data}

        # Step 3: Extract detailed information from dossier index page
        asset_external_id = dossier_data["assetExternalId"]
        section_links = get_section_links_from_index(asset_external_id, rml_id, sections)
        if section_links:
            results.update(section_links)

        logging.info(f"QUACKO.search() OK. output: {json.dumps(results)}")
        return results

    except Exception as e:
        logging.error(f"QUACKO.search(): Unexpected error in search_dossier for '{substance}': {str(e)}")
        return False


def get_substance_by_identifier(substance: str) -> Optional[Dict[str, str]]:
    """
    Search the QUACKO database for a substance using the provided identifier.

    Args:
        substance (str): The substance identifier to search for (CAS number, name, etc.)

    Returns:
        Optional[Dict[str, str]]: Dictionary with substance information or None if not found
    """
    encoded_substance = urllib.parse.quote(substance)
    search_url = f"{QUACKO_SUBSTANCE_API}?pageIndex=1&pageSize=100&searchText={encoded_substance}"
    logging.info(f'QUACKO.search(). searching "{substance}"')

    try:
        response = requests.get(search_url)
        response.raise_for_status()  # Raise exception for HTTP errors
        data = response.json()

        if not data.get("items") or len(data["items"]) == 0:
            logging.info(f"QUACKO.search() could not find substance for '{substance}'")
            return None

        # Extract substance information from the first match only
        substance_index = data["items"][0]["substanceIndex"]
        result = {
            'search_response': search_url,
            'rmlId': substance_index.get("rmlId", ""),
            'rmlName': substance_index.get("rmlName", ""),
            'rmlCas': substance_index.get("rmlCas", ""),
            'rmlEc': substance_index.get("rmlEc", ""),
        }

        logging.info(
            f"QUACKO.search() found substance on QUACKO. "
            f"rmlId: '{result['rmlId']}', rmlName: '{result['rmlName']}', rmlCas: '{result['rmlCas']}'"
        )
        return result

    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while searching for substance '{substance}': {str(e)}")
        return None
    except (KeyError, IndexError) as e:
        logging.error(f"QUACKO.search() - Data parsing error for substance '{substance}': {str(e)}")
        return None


def get_dossier_by_rml_id(rml_id: str, substance_name: str) -> Optional[Dict[str, Any]]:
    """
    Find dossiers for a substance using its RML ID.

    Tries active dossiers first, then falls back to inactive ones.

    Args:
        rml_id (str): The RML ID of the substance
        substance_name (str): The name of the substance (for logging)

    Returns:
        Optional[Dict[str, Any]]: Dictionary with dossier information or None if not found
    """
    # First try active dossiers
    dossier_results = _query_dossier_api(rml_id, "Active")

    # If no active dossiers found, try inactive ones
    if not dossier_results:
        logging.info(
            f"QUACKO.search() - could not find active dossier for '{substance_name}'. "
            "Proceeding to search in the unactive ones."
        )
        dossier_results = _query_dossier_api(rml_id, "Inactive")
        if not dossier_results:
            logging.info(f"QUACKO.search() - could not find unactive dossiers for '{substance_name}'")
            return None
        else:
            logging.info(f"QUACKO.search() - found unactive dossiers for '{substance_name}'")
            dossier_results["dossierType"] = "Inactive"
    else:
        logging.info(f"QUACKO.search() - found active dossiers for '{substance_name}'")
        dossier_results["dossierType"] = "Active"

    return dossier_results


def _query_dossier_api(rml_id: str, status: str) -> Optional[Dict[str, Any]]:
    """
    Helper function to query the QUACKO dossier API for a specific substance and status.

    Args:
        rml_id (str): The RML ID of the substance
        status (str): The status of dossiers to search for ('Active' or 'Inactive')

    Returns:
        Optional[Dict[str, Any]]: Dictionary with dossier information or None if not found
    """
    # BUGFIX: the query string previously contained the mojibake character '®'
    # ("...&rmlId=...®istrationStatuses=") instead of "&registrationStatuses=",
    # so the status filter was never sent to the API.
    url = f"{QUACKO_DOSSIER_API}?pageIndex=1&pageSize=100&rmlId={rml_id}&registrationStatuses={status}"

    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        if not data.get("items") or len(data["items"]) == 0:
            return None

        result = {
            "assetExternalId": data["items"][0]["assetExternalId"],
            "rootKey": data["items"][0]["rootKey"],
        }

        # Extract last update date if available; 'Z' suffix is normalized for
        # datetime.fromisoformat (which does not accept 'Z' before Python 3.11).
        try:
            last_update = data["items"][0]["lastUpdatedDate"]
            datetime_object = datetime.fromisoformat(last_update.replace('Z', '+00:00'))
            result['lastUpdateDate'] = datetime_object.date().isoformat()
        except (KeyError, ValueError) as e:
            logging.error(f"QUACKO.search() - Error extracting lastUpdateDate: {str(e)}")

        # Add index URLs (static HTML page and JS dossier-view page)
        result["index"] = f"{QUACKO_HTML_PAGES}/{result['assetExternalId']}/index.html"
        result["index_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{result['assetExternalId']}"
        return result

    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while getting dossiers for RML ID '{rml_id}': {str(e)}")
        return None
    except (KeyError, IndexError) as e:
        logging.error(f"QUACKO.search() - Data parsing error for RML ID '{rml_id}': {str(e)}")
        return None


def get_section_links_from_index(
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str],
) -> Dict[str, str]:
    """
    Extract links to specified sections from the dossier index page by downloading it.

    Args:
        asset_id (str): The asset external ID of the dossier
        rml_id (str): The RML ID of the substance
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
    """
    index_url = f"{QUACKO_HTML_PAGES}/{asset_id}/index.html"
    try:
        response = requests.get(index_url)
        response.raise_for_status()
        # Parse content using the shared method
        return parse_sections_from_html(response.text, asset_id, rml_id, sections)
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while extracting section links: {str(e)}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error extracting section links: {str(e)}")
        return {}


def get_section_links_from_file(
    file_path: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str],
) -> Dict[str, str]:
    """
    Extract links to specified sections from a local index.html file.

    Args:
        file_path (str): Path to the local index.html file
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
    """
    try:
        # BUGFIX: this check previously raised NameError because the 'os'
        # module was used without being imported (now imported at file top).
        if not os.path.exists(file_path):
            logging.error(f"QUACKO.search() - Local file not found: {file_path}")
            return {}

        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # Parse content using the shared method
        return parse_sections_from_html(html_content, asset_id, rml_id, sections)

    except FileNotFoundError:
        logging.error(f"QUACKO.search() - File not found: {file_path}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing local file {file_path}: {str(e)}")
        return {}


def parse_sections_from_html(
    html_content: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str],
) -> Dict[str, str]:
    """
    Parse HTML content to extract links to specified sections.

    Args:
        html_content (str): HTML content to parse
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
    """
    result = {}
    try:
        soup = BeautifulSoup(html_content, "html.parser")

        # Extract each requested section
        for section_id, section_name in sections.items():
            section_links = extract_section_links(soup, section_id, asset_id, rml_id, section_name)
            if section_links:
                result.update(section_links)
                logging.info(f"QUACKO.search() - Found section '{section_name}' in document")
            else:
                logging.info(f"QUACKO.search() - Section '{section_name}' not found in document")

        return result
    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing HTML content: {str(e)}")
        return {}


# --------------------------------------------------------------------------
# Function to Extract Section Links with Validation
# --------------------------------------------------------------------------
# This function extracts the document link associated with a specific section ID
# from the QUACKO index.html page structure.
#
# Problem Solved:
# Previous attempts faced issues where searching for a link within a parent
# section's div (e.g., "7 Toxicological Information" with id="id_7_...")
# would incorrectly grab the link belonging to the *first child* section
# (e.g., "7.2 Acute Toxicity" with id="id_72_..."). This happened because
# the simple `find("a", href=True)` doesn't distinguish ownership when nested.
#
# Solution Logic:
# 1. Find Target Div: Locate the `div` element using the specific `section_id`.
# 2. Find First Link: Find the very first `<a>` tag with an `href` attribute
#    somewhere *inside* the target div.
# 3. Find Link's Owning Section Div: From that link, traverse up the HTML tree
#    with `find_parent()` to the nearest ancestor `div` whose `id` starts with
#    "id_" (the pattern for section containers).
# 4. Validate Ownership: Compare that ancestor's `id` with the original
#    `section_id`.
# 5. Decision:
#    - IDs MATCH: the link truly belongs to the queried section; extract it.
#    - IDs DO NOT MATCH: the link belongs to a *nested* subsection, so the
#      queried section has no direct link; return an empty dictionary.
#
# This validation ensures we only return links directly associated with the
# queried section ID, preventing the inheritance bug.
# --------------------------------------------------------------------------
def extract_section_links(
    soup: BeautifulSoup,
    section_id: str,
    asset_id: str,
    rml_id: str,
    section_name: str,
) -> Dict[str, str]:
    """
    Extracts a link for a specific section ID by finding the first link within its div
    and verifying that the link belongs directly to that section, not a nested subsection.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the index page.
        section_id (str): The HTML ID of the section div.
        asset_id (str): The asset external ID of the dossier.
        rml_id (str): The RML ID of the substance.
        section_name (str): The name to use for the section in the result.

    Returns:
        Dict[str, str]: Dictionary with link if found and validated, otherwise empty.
    """
    result = {}

    # 1. Find the target div for the section ID
    target_div = soup.find("div", id=section_id)
    if not target_div:
        logging.info(f"QUACKO.search() - extract_section_links(): No div found for id='{section_id}'")
        return result

    # 2. Find the first tag with an href within this target div
    first_link_tag = target_div.find("a", href=True)
    if not first_link_tag:
        logging.info(f"QUACKO.search() - extract_section_links(): No 'a' tag with href found within div id='{section_id}'")
        return result  # No links at all within this section

    # 3. Validate: Find the closest ancestor div with an ID starting with "id_".
    #    This tells us which section container the link *actually* resides in.
    #    (Tag is now properly imported from bs4 — the annotation previously
    #    referenced an undefined name.)
    link_ancestor_section_div: Optional[Tag] = first_link_tag.find_parent(
        "div", id=lambda x: x and x.startswith("id_")
    )

    # 4. Compare IDs
    if link_ancestor_section_div and link_ancestor_section_div.get('id') == section_id:
        # The first link found belongs directly to the section we are looking for.
        logging.debug(f"QUACKO.search() - extract_section_links(): Valid link found for id='{section_id}'.")
        a_tag_to_use = first_link_tag  # Use the link we found
    else:
        # The first link found belongs to a *different* (nested) section,
        # or the structure is broken (no ancestor div with id found).
        # Therefore, the section_id we were originally checking has no direct link.
        ancestor_id = link_ancestor_section_div.get('id') if link_ancestor_section_div else "None"
        logging.info(f"QUACKO.search() - extract_section_links(): First link within id='{section_id}' belongs to ancestor id='{ancestor_id}'. No direct link for '{section_id}'.")
        return result  # Return empty dict

    # 5. Proceed with link extraction using the validated a_tag_to_use
    try:
        document_id = a_tag_to_use.get('href')  # Use .get() for safety
        if not document_id:
            logging.error(f"QUACKO.search() - extract_section_links(): Found 'a' tag for '{section_name}' has no href attribute.")
            return {}

        # Clean up the document ID (strip the relative path prefix and extension)
        if document_id.startswith('./documents/'):
            document_id = document_id.replace('./documents/', '')
        if document_id.endswith('.html'):
            document_id = document_id.replace('.html', '')

        # Construct the full URLs unless in local-only mode
        if asset_id == "local" and rml_id == "local":
            result[section_name] = f"Local section found: {document_id}"
        else:
            result[section_name] = f"{QUACKO_HTML_PAGES}/{asset_id}/documents/{document_id}.html"
            result[f"{section_name}_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{asset_id}/{document_id}"

        return result
    except Exception as e:  # Catch potential errors during processing
        logging.error(f"QUACKO.search() - extract_section_links(): Error processing the validated link tag for '{section_name}': {str(e)}")
        return {}