# Standard library
import json
import logging
import os
import re
import urllib.parse
from datetime import datetime
from typing import Any, Dict, Optional, Union

# Third-party
import requests
from bs4 import BeautifulSoup
# Logging settings: append UTF-8 records to the ".log" file in the working directory.
logging.basicConfig(
    format="{asctime} - {levelname} - {message}",
    style="{",  # use str.format-style placeholders in the format string above
    datefmt="%Y-%m-%d %H:%M",
    filename=".log",
    encoding="utf-8",
    filemode="a",  # append so previous runs are kept
    level=logging.INFO,
)
# Constants for API endpoints
QUACKO_BASE_URL = "https://chem.echa.europa.eu"
QUACKO_SUBSTANCE_API = f"{QUACKO_BASE_URL}/api-substance/v1/substance"
QUACKO_DOSSIER_API = f"{QUACKO_BASE_URL}/api-dossier-list/v1/dossier"
QUACKO_HTML_PAGES = f"{QUACKO_BASE_URL}/html-pages"

# Default sections to look for in the dossier: maps the HTML div id of each
# section in the dossier index page to the key used for it in the result dict.
# (A leftover debug entry "Test": "Test2" was removed — it only produced a
# spurious "Section 'Test2' not found" log line on every search.)
DEFAULT_SECTIONS = {
    "id_7_Toxicologicalinformation": "ToxSummary",
    "id_72_AcuteToxicity": "AcuteToxicity",
    "id_75_Repeateddosetoxicity": "RepeatedDose",
    "id_6_Ecotoxicologicalinformation": "EcotoxSummary",
    "id_76_Genetictoxicity": "GeneticToxicity",
    "id_42_Meltingpointfreezingpoint": "MeltingFreezingPoint",
    "id_43_Boilingpoint": "BoilingPoint",
    "id_48_Watersolubility": "WaterSolubility",
    "id_410_Surfacetension": "SurfaceTension",
    "id_420_pH": "pH",
}
def search_dossier(
    substance: str,
    input_type: str = 'rmlCas',
    sections: Optional[Dict[str, str]] = None,
    local_index_path: Optional[str] = None
) -> Union[Dict[str, Any], str, bool]:
    """
    Search for a chemical substance in the QUACKO database and retrieve its dossier information.

    Args:
        substance (str): The identifier of the substance to search for (e.g. CAS number, name)
        input_type (str): The type of identifier provided. Options: 'rmlCas', 'rmlName', 'rmlEc'
        sections (Optional[Dict[str, str]]): Dictionary mapping section IDs to result keys.
            If None, DEFAULT_SECTIONS is used.
        local_index_path (Optional[str]): Path to a local index.html file to parse instead of
            downloading from QUACKO. If provided, the function skips all API calls
            and only extracts sections from the local file.

    Returns:
        Union[Dict[str, Any], str, bool]:
            - dict with substance information and dossier links on success;
            - error message string if a substance was found but its ``input_type``
              field does not match the query;
            - False if the substance was not found or another critical error occurred.
    """
    # Use the module-level default section map when none is provided.
    if sections is None:
        sections = DEFAULT_SECTIONS
    try:
        results = {}
        # Offline mode: extract sections directly from a local file, no API calls.
        if local_index_path:
            logging.info(f"QUACKO.search() - Using local index file: {local_index_path}")
            # We still need some minimal info for constructing the URLs.
            if '/' not in local_index_path:
                asset_id = "local"
                rml_id = "local"
            else:
                # Try to extract information from the path if available.
                path_parts = local_index_path.split('/')
                # If path follows expected structure: .../html-pages/ASSET_ID/index.html
                if 'html-pages' in path_parts and 'index.html' in path_parts[-1]:
                    asset_id = path_parts[path_parts.index('html-pages') + 1]
                    rml_id = "extracted"  # Just a placeholder
                else:
                    asset_id = "local"
                    rml_id = "local"
            # Add these to results for consistency with the online flow.
            results["assetExternalId"] = asset_id
            results["rmlId"] = rml_id
            # Extract sections from the local file.
            section_links = get_section_links_from_file(local_index_path, asset_id, rml_id, sections)
            if section_links:
                results.update(section_links)
            return results
        # Normal flow with API calls.
        # Step 1: Look the substance up by its identifier.
        substance_data = get_substance_by_identifier(substance)
        if not substance_data:
            return False
        # Verify that the found substance matches the input identifier; a mismatch
        # usually means the caller passed the wrong input_type.
        if substance_data.get(input_type) != substance:
            error_msg = (f"Search error: results[{input_type}] (\"{substance_data.get(input_type)}\") "
                         f"is not equal to \"{substance}\". Maybe you specified the wrong input_type. "
                         f"Check the results here: {substance_data.get('search_response')}")
            logging.error(f"QUACKO.search(): {error_msg}")
            return error_msg
        # Step 2: Find dossiers for the substance.
        rml_id = substance_data["rmlId"]
        dossier_data = get_dossier_by_rml_id(rml_id, substance)
        if not dossier_data:
            return False
        # Merge substance and dossier data (dossier keys win on collision).
        results = {**substance_data, **dossier_data}
        # Step 3: Extract detailed information from the dossier index page.
        asset_external_id = dossier_data["assetExternalId"]
        section_links = get_section_links_from_index(asset_external_id, rml_id, sections)
        if section_links:
            results.update(section_links)
        logging.info(f"QUACKO.search() OK. output: {json.dumps(results)}")
        return results
    except Exception as e:
        logging.error(f"QUACKO.search(): Unexpected error in search_dossier for '{substance}': {str(e)}")
        return False
def get_substance_by_identifier(substance: str) -> Optional[Dict[str, str]]:
    """
    Search the QUACKO database for a substance using the provided identifier.

    Args:
        substance (str): The substance identifier to search for (CAS number, name, etc.)

    Returns:
        Optional[Dict[str, str]]: Dictionary with the search URL and the substance's
            rmlId / rmlName / rmlCas / rmlEc fields, or None if not found or on
            request/parsing errors.
    """
    encoded_substance = urllib.parse.quote(substance)
    search_url = f"{QUACKO_SUBSTANCE_API}?pageIndex=1&pageSize=100&searchText={encoded_substance}"
    logging.info(f'QUACKO.search(). searching "{substance}"')
    try:
        # Explicit timeout so a stalled server cannot hang the caller forever.
        response = requests.get(search_url, timeout=30)
        response.raise_for_status()  # Raise exception for HTTP errors
        data = response.json()
        # An absent, None, or empty "items" list all mean "not found".
        if not data.get("items"):
            logging.info(f"QUACKO.search() could not find substance for '{substance}'")
            return None
        # Extract substance information from the first match.
        substance_index = data["items"][0]["substanceIndex"]
        result = {
            'search_response': search_url,
            'rmlId': substance_index.get("rmlId", ""),
            'rmlName': substance_index.get("rmlName", ""),
            'rmlCas': substance_index.get("rmlCas", ""),
            'rmlEc': substance_index.get("rmlEc", "")
        }
        logging.info(
            f"QUACKO.search() found substance on QUACKO. "
            f"rmlId: '{result['rmlId']}', rmlName: '{result['rmlName']}', rmlCas: '{result['rmlCas']}'"
        )
        return result
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while searching for substance '{substance}': {str(e)}")
        return None
    except (KeyError, IndexError) as e:
        logging.error(f"QUACKO.search() - Data parsing error for substance '{substance}': {str(e)}")
        return None
def get_dossier_by_rml_id(rml_id: str, substance_name: str) -> Optional[Dict[str, Any]]:
    """
    Look up dossiers for a substance via its RML ID, preferring active ones.

    Active dossiers are queried first; only if none exist is the inactive
    pool consulted. The winning result is tagged with a 'dossierType' key.

    Args:
        rml_id (str): The RML ID of the substance
        substance_name (str): The name of the substance (for logging)

    Returns:
        Optional[Dict[str, Any]]: Dictionary with dossier information or None if not found
    """
    # Preferred path: an active dossier.
    dossiers = _query_dossier_api(rml_id, "Active")
    if dossiers:
        logging.info(f"QUACKO.search() - found active dossiers for '{substance_name}'")
        dossiers["dossierType"] = "Active"
        return dossiers

    # Fallback: no active dossier, check the inactive pool.
    logging.info(
        f"QUACKO.search() - could not find active dossier for '{substance_name}'. "
        "Proceeding to search in the unactive ones."
    )
    dossiers = _query_dossier_api(rml_id, "Inactive")
    if not dossiers:
        logging.info(f"QUACKO.search() - could not find unactive dossiers for '{substance_name}'")
        return None

    logging.info(f"QUACKO.search() - found unactive dossiers for '{substance_name}'")
    dossiers["dossierType"] = "Inactive"
    return dossiers
def _query_dossier_api(rml_id: str, status: str) -> Optional[Dict[str, Any]]:
    """
    Helper function to query the QUACKO dossier API for a specific substance and status.

    Args:
        rml_id (str): The RML ID of the substance
        status (str): The status of dossiers to search for ('Active' or 'Inactive')

    Returns:
        Optional[Dict[str, Any]]: Dictionary with dossier information (assetExternalId,
            rootKey, optional lastUpdateDate, and index/index_js URLs) or None if not found
    """
    # NOTE: the query-string separator before registrationStatuses was previously
    # the mojibake '®istrationStatuses' (an HTML-entity mangling of
    # '&registrationStatuses'), which silently dropped the status filter.
    url = (
        f"{QUACKO_DOSSIER_API}?pageIndex=1&pageSize=100"
        f"&rmlId={rml_id}&registrationStatuses={status}"
    )
    try:
        # Explicit timeout so a stalled server cannot hang the caller forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        # An absent, None, or empty "items" list all mean "no dossier".
        if not data.get("items"):
            return None
        result = {
            "assetExternalId": data["items"][0]["assetExternalId"],
            "rootKey": data["items"][0]["rootKey"],
        }
        # Extract last update date if available; the 'Z' suffix is normalized
        # for datetime.fromisoformat (pre-3.11 it does not accept 'Z').
        try:
            last_update = data["items"][0]["lastUpdatedDate"]
            datetime_object = datetime.fromisoformat(last_update.replace('Z', '+00:00'))
            result['lastUpdateDate'] = datetime_object.date().isoformat()
        except (KeyError, ValueError) as e:
            logging.error(f"QUACKO.search() - Error extracting lastUpdateDate: {str(e)}")
        # Add index URLs (static HTML page and JS dossier-view variant).
        result["index"] = f"{QUACKO_HTML_PAGES}/{result['assetExternalId']}/index.html"
        result["index_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{result['assetExternalId']}"
        return result
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while getting dossiers for RML ID '{rml_id}': {str(e)}")
        return None
    except (KeyError, IndexError) as e:
        logging.error(f"QUACKO.search() - Data parsing error for RML ID '{rml_id}': {str(e)}")
        return None
def get_section_links_from_index(
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Extract links to specified sections from the dossier index page by downloading it.

    Args:
        asset_id (str): The asset external ID of the dossier
        rml_id (str): The RML ID of the substance
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
            (empty on any download or parsing error)
    """
    index_url = f"{QUACKO_HTML_PAGES}/{asset_id}/index.html"
    try:
        # Explicit timeout so a stalled server cannot hang the caller forever.
        response = requests.get(index_url, timeout=30)
        response.raise_for_status()
        # Parse content using the shared method
        return parse_sections_from_html(response.text, asset_id, rml_id, sections)
    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while extracting section links: {str(e)}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error extracting section links: {str(e)}")
        return {}
def get_section_links_from_file(
    file_path: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Extract links to specified sections from a local index.html file.

    Args:
        file_path (str): Path to the local index.html file
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
            (empty if the file is missing or cannot be parsed)
    """
    try:
        # EAFP: just try to open the file. The previous os.path.exists() check
        # raised NameError because 'os' was never imported, and the
        # FileNotFoundError handler below already covers a missing file.
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        # Parse content using the shared method
        return parse_sections_from_html(html_content, asset_id, rml_id, sections)
    except FileNotFoundError:
        logging.error(f"QUACKO.search() - File not found: {file_path}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing local file {file_path}: {str(e)}")
        return {}
def parse_sections_from_html(
    html_content: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Parse HTML content and collect links for each requested section.

    Each (section_id, section_name) pair is looked up via
    extract_section_links; hits are merged into one dictionary and
    misses are logged.

    Args:
        html_content (str): HTML content to parse
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
            (empty on a parsing error)
    """
    collected: Dict[str, str] = {}
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        # Walk every requested section and merge whatever links are found.
        for section_id, section_name in sections.items():
            links = extract_section_links(soup, section_id, asset_id, rml_id, section_name)
            if not links:
                logging.info(f"QUACKO.search() - Section '{section_name}' not found in document")
                continue
            collected.update(links)
            logging.info(f"QUACKO.search() - Found section '{section_name}' in document")
        return collected
    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing HTML content: {str(e)}")
        return {}
# --------------------------------------------------------------------------
# Function to Extract Section Links with Validation
# --------------------------------------------------------------------------
# This function extracts the document link associated with a specific section ID
# from the QUACKO index.html page structure.
#
# Problem Solved:
# Previous attempts faced issues where searching for a link within a parent
# section's div (e.g., "7 Toxicological Information" with id="id_7_...")
# would incorrectly grab the link belonging to the *first child* section
# (e.g., "7.2 Acute Toxicity" with id="id_72_..."). This happened because
# the simple `find("a", href=True)` doesn't distinguish ownership when nested.
#
# Solution Logic:
# 1. Find Target Div: Locate the `div` element using the specific `section_id` provided.
# This div typically contains the section's content or nested subsections.
# 2. Find First Link: Find the very first `<a>` tag that has an `href` attribute
# somewhere *inside* the `target_div`.
# 3. Find Link's Owning Section Div: Starting from the `first_link_tag`, traverse
# up the HTML tree using `find_parent()` to find the nearest ancestor `div`
# whose `id` attribute starts with "id_" (the pattern for section containers).
# 4. Validate Ownership: Compare the `id` of the `link_ancestor_section_div` found
# in step 3 with the original `section_id` passed into the function.
# 5. Decision:
# - If the IDs MATCH: It confirms that the `first_link_tag` truly belongs to the
# `section_id` we are querying. The function proceeds to extract and format
# this link.
# - If the IDs DO NOT MATCH: It indicates that the first link found actually
# belongs to a *nested* subsection div. Therefore, the original `section_id`
# (the parent/container) does not have its own direct link, and the function
# correctly returns an empty dictionary for this `section_id`.
#
# This validation step ensures that we only return links that are directly
# associated with the queried section ID, preventing the inheritance bug.
# --------------------------------------------------------------------------
def extract_section_links(
    soup: BeautifulSoup,
    section_id: str,
    asset_id: str,
    rml_id: str,
    section_name: str
) -> Dict[str, str]:
    """
    Extracts a link for a specific section ID by finding the first link
    within its div and verifying that the link belongs directly to that
    section, not a nested subsection.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the index page.
        section_id (str): The HTML ID of the section div.
        asset_id (str): The asset external ID of the dossier.
        rml_id (str): The RML ID of the substance.
        section_name (str): The name to use for the section in the result.

    Returns:
        Dict[str, str]: Dictionary with link if found and validated,
                        otherwise empty.
    """
    result: Dict[str, str] = {}
    # 1. Find the target div for the section ID
    target_div = soup.find("div", id=section_id)
    if not target_div:
        logging.info(f"QUACKO.search() - extract_section_links(): No div found for id='{section_id}'")
        return result
    # 2. Find the first <a> tag with an href within this target div
    first_link_tag = target_div.find("a", href=True)
    if not first_link_tag:
        logging.info(f"QUACKO.search() - extract_section_links(): No 'a' tag with href found within div id='{section_id}'")
        return result  # No links at all within this section
    # 3. Validate: Find the closest ancestor div with an ID starting with "id_".
    #    This tells us which section container the link *actually* resides in.
    #    (No type annotation here: the bs4 Tag type is not imported in this module.)
    link_ancestor_section_div = first_link_tag.find_parent(
        "div", id=lambda x: x and x.startswith("id_")
    )
    # 4. Compare IDs
    if link_ancestor_section_div and link_ancestor_section_div.get('id') == section_id:
        # The first link found belongs directly to the section we are looking for.
        logging.debug(f"QUACKO.search() - extract_section_links(): Valid link found for id='{section_id}'.")
        a_tag_to_use = first_link_tag  # Use the link we found
    else:
        # The first link found belongs to a *different* (nested) section
        # or the structure is broken (no ancestor div with id found).
        # Therefore, the section_id we were originally checking has no direct link.
        ancestor_id = link_ancestor_section_div.get('id') if link_ancestor_section_div else "None"
        logging.info(f"QUACKO.search() - extract_section_links(): First link within id='{section_id}' belongs to ancestor id='{ancestor_id}'. No direct link for '{section_id}'.")
        return result  # Return empty dict
    # 5. Proceed with link extraction using the validated a_tag_to_use
    try:
        document_id = a_tag_to_use.get('href')  # Use .get() for safety
        if not document_id:
            logging.error(f"QUACKO.search() - extract_section_links(): Found 'a' tag for '{section_name}' has no href attribute.")
            return {}
        # Strip only the leading './documents/' and trailing '.html'. The previous
        # str.replace() calls removed *every* occurrence of those substrings and
        # could corrupt a document ID that happened to contain them internally.
        prefix = './documents/'
        if document_id.startswith(prefix):
            document_id = document_id[len(prefix):]
        suffix = '.html'
        if document_id.endswith(suffix):
            document_id = document_id[:-len(suffix)]
        # Construct the full URLs unless in local-only mode
        if asset_id == "local" and rml_id == "local":
            result[section_name] = f"Local section found: {document_id}"
        else:
            result[section_name] = f"{QUACKO_HTML_PAGES}/{asset_id}/documents/{document_id}.html"
            result[f"{section_name}_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{asset_id}/{document_id}"
        return result
    except Exception as e:  # Catch potential errors during processing
        logging.error(f"QUACKO.search() - extract_section_links(): Error processing the validated link tag for '{section_name}': {str(e)}")
        return {}