# cosmoguard-bd/old/_old/find.py
# Snapshot: 2025-11-15 16:02:37 +01:00 — 497 lines, 20 KiB, Python, no EOL
import requests
import urllib.parse
import json
import logging
import re
from datetime import datetime
from bs4 import BeautifulSoup
from typing import Dict, Union, Optional, Any
# Logging configuration: append "{"-style formatted records to a local
# ".log" file (NOTE: a hidden dotfile in the current working directory).
logging.basicConfig(
    format="{asctime} - {levelname} - {message}",
    style="{",  # enables str.format-style placeholders in `format`
    datefmt="%Y-%m-%d %H:%M",
    filename=".log",
    encoding="utf-8",
    filemode="a",  # append across runs rather than truncating
    level=logging.INFO,
)
# Constants for API endpoints
QUACKO_BASE_URL = "https://chem.echa.europa.eu"
QUACKO_SUBSTANCE_API = f"{QUACKO_BASE_URL}/api-substance/v1/substance"
QUACKO_DOSSIER_API = f"{QUACKO_BASE_URL}/api-dossier-list/v1/dossier"
QUACKO_HTML_PAGES = f"{QUACKO_BASE_URL}/html-pages"
# Default sections to look for in the dossier.
# Keys are the HTML `div` ids used on the dossier index page; values are the
# keys under which the matching section links are stored in the search results.
DEFAULT_SECTIONS = {
    "id_7_Toxicologicalinformation": "ToxSummary",
    "id_72_AcuteToxicity": "AcuteToxicity",
    "id_75_Repeateddosetoxicity": "RepeatedDose",
    "id_6_Ecotoxicologicalinformation": "EcotoxSummary",
    "id_76_Genetictoxicity" : 'GeneticToxicity',
    "id_42_Meltingpointfreezingpoint" : "MeltingFreezingPoint",
    "id_43_Boilingpoint" : "BoilingPoint",
    "id_48_Watersolubility" : "WaterSolubility",
    "id_410_Surfacetension" : "SurfaceTension",
    "id_420_pH" : "pH",
    "Test" : "Test2"  # NOTE(review): looks like a leftover debug entry — confirm before relying on it
}
def search_dossier(
    substance: str,
    input_type: str = 'rmlCas',
    sections: Dict[str, str] = None,
    local_index_path: str = None
) -> Union[Dict[str, Any], str, bool]:
    """
    Search for a chemical substance in the QUACKO database and retrieve its dossier information.

    Args:
        substance (str): The identifier of the substance to search for (e.g. CAS number, name)
        input_type (str): The type of identifier provided. Options: 'rmlCas', 'rmlName', 'rmlEc'
        sections (Dict[str, str], optional): Dictionary mapping section IDs to result keys.
            If None, default sections will be used.
        local_index_path (str, optional): Path to a local index.html file to parse instead of
            downloading from QUACKO. If provided, the function will skip all API calls and
            only extract sections from the local file.

    Returns:
        Union[Dict[str, Any], str, bool]:
            - dict with substance information and dossier links on success,
            - error message string if a substance was found but its ``input_type``
              field does not match the query,
            - False if the substance was not found or another critical error occurred.
    """
    # Use default sections if none provided (never mutate a shared default).
    if sections is None:
        sections = DEFAULT_SECTIONS
    try:
        # Local-file mode: no API calls, only parse the given index.html.
        if local_index_path:
            logging.info(f"QUACKO.search() - Using local index file: {local_index_path}")
            asset_id, rml_id = _ids_from_local_index_path(local_index_path)
            # Record the (possibly placeholder) ids for output consistency.
            results = {"assetExternalId": asset_id, "rmlId": rml_id}
            section_links = get_section_links_from_file(local_index_path, asset_id, rml_id, sections)
            if section_links:
                results.update(section_links)
            return results

        # Step 1: Look the substance up by its identifier.
        substance_data = get_substance_by_identifier(substance)
        if not substance_data:
            return False
        # Verify that the found substance matches the input identifier.
        if substance_data.get(input_type) != substance:
            error_msg = (f"Search error: results[{input_type}] (\"{substance_data.get(input_type)}\") "
                         f"is not equal to \"{substance}\". Maybe you specified the wrong input_type. "
                         f"Check the results here: {substance_data.get('search_response')}")
            logging.error(f"QUACKO.search(): {error_msg}")
            return error_msg

        # Step 2: Find dossiers for the substance.
        rml_id = substance_data["rmlId"]
        dossier_data = get_dossier_by_rml_id(rml_id, substance)
        if not dossier_data:
            return False
        # Merge substance and dossier data (dossier keys win on conflict).
        results = {**substance_data, **dossier_data}

        # Step 3: Extract detailed information from the dossier index page.
        asset_external_id = dossier_data["assetExternalId"]
        section_links = get_section_links_from_index(asset_external_id, rml_id, sections)
        if section_links:
            results.update(section_links)
        logging.info(f"QUACKO.search() OK. output: {json.dumps(results)}")
        return results
    except Exception as e:
        logging.error(f"QUACKO.search(): Unexpected error in search_dossier for '{substance}': {str(e)}")
        return False


def _ids_from_local_index_path(local_index_path: str) -> "tuple[str, str]":
    """
    Derive ``(asset_id, rml_id)`` from a local index.html path.

    If the path follows the expected ``.../html-pages/ASSET_ID/index.html``
    layout, the real asset id is extracted (rml_id stays the "extracted"
    placeholder); otherwise both values fall back to the "local" placeholder.
    """
    if '/' in local_index_path:
        path_parts = local_index_path.split('/')
        if 'html-pages' in path_parts and 'index.html' in path_parts[-1]:
            pos = path_parts.index('html-pages')
            # Guard: 'html-pages' as the last component would previously
            # raise IndexError on path_parts[pos + 1].
            if pos + 1 < len(path_parts):
                return path_parts[pos + 1], "extracted"
    return "local", "local"
def get_substance_by_identifier(substance: str) -> Optional[Dict[str, str]]:
    """
    Search the QUACKO database for a substance using the provided identifier.

    Args:
        substance (str): The substance identifier to search for (CAS number, name, etc.)

    Returns:
        Optional[Dict[str, str]]: Dictionary with substance information
        (search URL, rmlId/rmlName/rmlCas/rmlEc of the FIRST hit) or None
        if not found or on request/parsing errors.
    """
    encoded_substance = urllib.parse.quote(substance)
    search_url = f"{QUACKO_SUBSTANCE_API}?pageIndex=1&pageSize=100&searchText={encoded_substance}"
    logging.info(f'QUACKO.search(). searching "{substance}"')
    try:
        # A timeout prevents the request from hanging indefinitely.
        response = requests.get(search_url, timeout=30)
        response.raise_for_status()  # Raise exception for HTTP errors
        data = response.json()
        # An absent or empty "items" list means no match.
        if not data.get("items"):
            logging.info(f"QUACKO.search() could not find substance for '{substance}'")
            return None
        # Only the first hit is used; the identifier match is verified by the caller.
        substance_index = data["items"][0]["substanceIndex"]
        result = {
            'search_response': search_url,
            'rmlId': substance_index.get("rmlId", ""),
            'rmlName': substance_index.get("rmlName", ""),
            'rmlCas': substance_index.get("rmlCas", ""),
            'rmlEc': substance_index.get("rmlEc", "")
        }
        logging.info(
            f"QUACKO.search() found substance on QUACKO. "
            f"rmlId: '{result['rmlId']}', rmlName: '{result['rmlName']}', rmlCas: '{result['rmlCas']}'"
        )
        return result
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while searching for substance '{substance}': {str(e)}")
        return None
    except (KeyError, IndexError, ValueError) as e:
        # ValueError also covers a malformed JSON body from response.json().
        logging.error(f"QUACKO.search() - Data parsing error for substance '{substance}': {str(e)}")
        return None
def get_dossier_by_rml_id(rml_id: str, substance_name: str) -> Optional[Dict[str, Any]]:
    """
    Locate dossiers for a substance via its RML ID.

    Active dossiers are preferred; inactive ones are only consulted when no
    active dossier exists. The returned dict is tagged with a "dossierType"
    key ("Active" or "Inactive").

    Args:
        rml_id (str): The RML ID of the substance
        substance_name (str): The name of the substance (for logging)

    Returns:
        Optional[Dict[str, Any]]: Dossier information dict, or None when
        neither active nor inactive dossiers are found.
    """
    # Preferred path: active dossiers.
    active = _query_dossier_api(rml_id, "Active")
    if active:
        logging.info(f"QUACKO.search() - found active dossiers for '{substance_name}'")
        active["dossierType"] = "Active"
        return active

    # Fallback path: inactive dossiers.
    logging.info(
        f"QUACKO.search() - could not find active dossier for '{substance_name}'. "
        "Proceeding to search in the unactive ones."
    )
    inactive = _query_dossier_api(rml_id, "Inactive")
    if not inactive:
        logging.info(f"QUACKO.search() - could not find unactive dossiers for '{substance_name}'")
        return None
    logging.info(f"QUACKO.search() - found unactive dossiers for '{substance_name}'")
    inactive["dossierType"] = "Inactive"
    return inactive
def _query_dossier_api(rml_id: str, status: str) -> Optional[Dict[str, Any]]:
    """
    Query the QUACKO dossier API for a substance with a given registration status.

    Args:
        rml_id (str): The RML ID of the substance
        status (str): The status of dossiers to search for ('Active' or 'Inactive')

    Returns:
        Optional[Dict[str, Any]]: Dictionary with dossier information (asset
        external id, root key, optional lastUpdateDate, and index URLs) built
        from the FIRST item returned, or None if nothing was found.
    """
    url = f"{QUACKO_DOSSIER_API}?pageIndex=1&pageSize=100&rmlId={rml_id}&registrationStatuses={status}"
    try:
        # A timeout prevents the request from hanging indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        items = data.get("items")
        if not items:
            return None
        first_item = items[0]
        result = {
            "assetExternalId": first_item["assetExternalId"],
            "rootKey": first_item["rootKey"],
        }
        # Extract the last update date if available. The trailing 'Z' is
        # normalised to '+00:00' because datetime.fromisoformat() on older
        # Python versions does not accept the 'Z' suffix.
        try:
            last_update = first_item["lastUpdatedDate"]
            datetime_object = datetime.fromisoformat(last_update.replace('Z', '+00:00'))
            result['lastUpdateDate'] = datetime_object.date().isoformat()
        except (KeyError, ValueError, TypeError, AttributeError) as e:
            # TypeError/AttributeError cover a null or non-string date field,
            # which previously escaped this handler.
            logging.error(f"QUACKO.search() - Error extracting lastUpdateDate: {str(e)}")
        # Add URLs for the static HTML index page and the JS dossier view.
        result["index"] = f"{QUACKO_HTML_PAGES}/{result['assetExternalId']}/index.html"
        result["index_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{result['assetExternalId']}"
        return result
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while getting dossiers for RML ID '{rml_id}': {str(e)}")
        return None
    except (KeyError, IndexError, ValueError) as e:
        # ValueError also covers a malformed JSON body from response.json().
        logging.error(f"QUACKO.search() - Data parsing error for RML ID '{rml_id}': {str(e)}")
        return None
def get_section_links_from_index(
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Extract links to specified sections by downloading the dossier index page.

    Args:
        asset_id (str): The asset external ID of the dossier
        rml_id (str): The RML ID of the substance
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
        (empty on download or parsing failure).
    """
    index_url = f"{QUACKO_HTML_PAGES}/{asset_id}/index.html"
    try:
        # A timeout prevents the request from hanging indefinitely.
        response = requests.get(index_url, timeout=30)
        response.raise_for_status()
        # Delegate the actual HTML parsing to the shared helper.
        return parse_sections_from_html(response.text, asset_id, rml_id, sections)
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while extracting section links: {str(e)}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error extracting section links: {str(e)}")
        return {}
def get_section_links_from_file(
    file_path: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Extract links to specified sections from a local index.html file.

    Args:
        file_path (str): Path to the local index.html file
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections,
        or an empty dict when the file is missing or cannot be parsed.
    """
    try:
        # EAFP: just open the file and let FileNotFoundError signal absence.
        # The previous pre-check called os.path.exists() without importing os,
        # so it raised a NameError that the broad except swallowed and this
        # function unconditionally returned {}.
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        # Parse content using the shared method
        return parse_sections_from_html(html_content, asset_id, rml_id, sections)
    except FileNotFoundError:
        logging.error(f"QUACKO.search() - File not found: {file_path}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing local file {file_path}: {str(e)}")
        return {}
def parse_sections_from_html(
    html_content: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Parse HTML content and collect links for each requested section.

    Args:
        html_content (str): HTML content to parse
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Links for every section that was found (may be empty);
        an empty dict on any parsing failure.
    """
    collected = {}
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        # Ask extract_section_links() for each requested section in turn,
        # logging whether the section produced a link or not.
        for section_id, section_name in sections.items():
            links = extract_section_links(soup, section_id, asset_id, rml_id, section_name)
            if not links:
                logging.info(f"QUACKO.search() - Section '{section_name}' not found in document")
                continue
            collected.update(links)
            logging.info(f"QUACKO.search() - Found section '{section_name}' in document")
        return collected
    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing HTML content: {str(e)}")
        return {}
# --------------------------------------------------------------------------
# Function to Extract Section Links with Validation
# --------------------------------------------------------------------------
# This function extracts the document link associated with a specific section ID
# from the QUACKO index.html page structure.
#
# Problem Solved:
# Previous attempts faced issues where searching for a link within a parent
# section's div (e.g., "7 Toxicological Information" with id="id_7_...")
# would incorrectly grab the link belonging to the *first child* section
# (e.g., "7.2 Acute Toxicity" with id="id_72_..."). This happened because
# the simple `find("a", href=True)` doesn't distinguish ownership when nested.
#
# Solution Logic:
# 1. Find Target Div: Locate the `div` element using the specific `section_id` provided.
# This div typically contains the section's content or nested subsections.
# 2. Find First Link: Find the very first `<a>` tag that has an `href` attribute
# somewhere *inside* the `target_div`.
# 3. Find Link's Owning Section Div: Starting from the `first_link_tag`, traverse
# up the HTML tree using `find_parent()` to find the nearest ancestor `div`
# whose `id` attribute starts with "id_" (the pattern for section containers).
# 4. Validate Ownership: Compare the `id` of the `link_ancestor_section_div` found
# in step 3 with the original `section_id` passed into the function.
# 5. Decision:
# - If the IDs MATCH: It confirms that the `first_link_tag` truly belongs to the
# `section_id` we are querying. The function proceeds to extract and format
# this link.
# - If the IDs DO NOT MATCH: It indicates that the first link found actually
# belongs to a *nested* subsection div. Therefore, the original `section_id`
# (the parent/container) does not have its own direct link, and the function
# correctly returns an empty dictionary for this `section_id`.
#
# This validation step ensures that we only return links that are directly
# associated with the queried section ID, preventing the inheritance bug.
# --------------------------------------------------------------------------
def extract_section_links(
    soup: BeautifulSoup,
    section_id: str,
    asset_id: str,
    rml_id: str,
    section_name: str
) -> Dict[str, str]:
    """
    Extracts a link for a specific section ID by finding the first link
    within its div and verifying that the link belongs directly to that
    section, not a nested subsection (see the explanatory comment above).

    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the index page.
        section_id (str): The HTML ID of the section div.
        asset_id (str): The asset external ID of the dossier.
        rml_id (str): The RML ID of the substance.
        section_name (str): The name to use for the section in the result.

    Returns:
        Dict[str, str]: Dictionary with link if found and validated,
        otherwise empty.
    """
    result = {}
    # 1. Find the target div for the section ID.
    target_div = soup.find("div", id=section_id)
    if not target_div:
        logging.info(f"QUACKO.search() - extract_section_links(): No div found for id='{section_id}'")
        return result
    # 2. Find the first <a> tag with an href within this target div.
    first_link_tag = target_div.find("a", href=True)
    if not first_link_tag:
        logging.info(f"QUACKO.search() - extract_section_links(): No 'a' tag with href found within div id='{section_id}'")
        return result  # No links at all within this section
    # 3. Validate ownership: find the closest ancestor div whose id starts
    #    with "id_" (the section-container pattern). This tells us which
    #    section the link *actually* resides in.
    #    (The previous `Optional[Tag]` annotation was dropped: `Tag` was
    #    never imported from bs4.)
    link_ancestor_section_div = first_link_tag.find_parent(
        "div", id=lambda x: x and x.startswith("id_")
    )
    # 4. Compare IDs: a mismatch means the first link belongs to a nested
    #    subsection, so the queried section has no direct link of its own.
    if link_ancestor_section_div and link_ancestor_section_div.get('id') == section_id:
        logging.debug(f"QUACKO.search() - extract_section_links(): Valid link found for id='{section_id}'.")
        a_tag_to_use = first_link_tag  # Use the link we found
    else:
        ancestor_id = link_ancestor_section_div.get('id') if link_ancestor_section_div else "None"
        logging.info(f"QUACKO.search() - extract_section_links(): First link within id='{section_id}' belongs to ancestor id='{ancestor_id}'. No direct link for '{section_id}'.")
        return result  # Return empty dict
    # 5. Proceed with link extraction using the validated a_tag_to_use.
    try:
        document_id = a_tag_to_use.get('href')  # Use .get() for safety
        if not document_id:
            logging.error(f"QUACKO.search() - extract_section_links(): Found 'a' tag for '{section_name}' has no href attribute.")
            return {}
        # Strip only a LEADING './documents/' and a TRAILING '.html'.
        # The previous str.replace() calls removed every occurrence of these
        # substrings anywhere in the href, which could corrupt unusual ids.
        prefix = './documents/'
        if document_id.startswith(prefix):
            document_id = document_id[len(prefix):]
        suffix = '.html'
        if document_id.endswith(suffix):
            document_id = document_id[:-len(suffix)]
        # Construct the full URLs unless in local-only mode.
        if asset_id == "local" and rml_id == "local":
            result[section_name] = f"Local section found: {document_id}"
        else:
            result[section_name] = f"{QUACKO_HTML_PAGES}/{asset_id}/documents/{document_id}.html"
            result[f"{section_name}_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{asset_id}/{document_id}"
        return result
    except Exception as e:  # Catch potential errors during processing
        logging.error(f"QUACKO.search() - extract_section_links(): Error processing the validated link tag for '{section_name}': {str(e)}")
        return {}