497 lines
No EOL
20 KiB
Python
497 lines
No EOL
20 KiB
Python
import json
import logging
import os
import re
import urllib.parse
from datetime import datetime
from typing import Any, Dict, Optional, Union

import requests
from bs4 import BeautifulSoup, Tag
|
|
|
|
# Logging configuration: INFO-level messages, {}-style formatting, appended
# to a hidden ".log" file in the current working directory.
logging.basicConfig(
    format="{asctime} - {levelname} - {message}",
    style="{",  # use str.format-style placeholders in `format`
    datefmt="%Y-%m-%d %H:%M",
    filename=".log",  # NOTE(review): relative path — depends on CWD at import time
    encoding="utf-8",
    filemode="a",  # append, so logs from previous runs are preserved
    level=logging.INFO,
)
|
|
|
|
|
|
# Constants for API endpoints (ECHA CHEM / QUACKO services)
QUACKO_BASE_URL = "https://chem.echa.europa.eu"
QUACKO_SUBSTANCE_API = f"{QUACKO_BASE_URL}/api-substance/v1/substance"
QUACKO_DOSSIER_API = f"{QUACKO_BASE_URL}/api-dossier-list/v1/dossier"
QUACKO_HTML_PAGES = f"{QUACKO_BASE_URL}/html-pages"

# Default sections to look for in the dossier.
# Maps the HTML div id of a section in the dossier index page to the key
# under which its link is reported in the search results.
DEFAULT_SECTIONS = {
    "id_7_Toxicologicalinformation": "ToxSummary",
    "id_72_AcuteToxicity": "AcuteToxicity",
    "id_75_Repeateddosetoxicity": "RepeatedDose",
    "id_6_Ecotoxicologicalinformation": "EcotoxSummary",
    "id_76_Genetictoxicity" : 'GeneticToxicity',
    "id_42_Meltingpointfreezingpoint" : "MeltingFreezingPoint",
    "id_43_Boilingpoint" : "BoilingPoint",
    "id_48_Watersolubility" : "WaterSolubility",
    "id_410_Surfacetension" : "SurfaceTension",
    "id_420_pH" : "pH",
    "Test" : "Test2"  # NOTE(review): looks like a leftover debug entry — confirm and remove
}
|
|
|
|
def search_dossier(
    substance: str,
    input_type: str = 'rmlCas',
    sections: Optional[Dict[str, str]] = None,
    local_index_path: Optional[str] = None
) -> Union[Dict[str, Any], str, bool]:
    """
    Search for a chemical substance in the QUACKO database and retrieve its dossier information.

    Args:
        substance (str): The identifier of the substance to search for (e.g. CAS number, name)
        input_type (str): The type of identifier provided. Options: 'rmlCas', 'rmlName', 'rmlEc'
        sections (Optional[Dict[str, str]]): Dictionary mapping section IDs to result keys.
            If None, default sections will be used.
        local_index_path (Optional[str]): Path to a local index.html file to parse instead of
            downloading from QUACKO. If provided, the function will skip all API calls and
            only extract sections from the local file.

    Returns:
        Union[Dict[str, Any], str, bool]:
            - dict with substance information and dossier links on success,
            - error message string if a substance was found but does not match
              the requested identifier,
            - False if the substance was not found or a critical error occurred.
    """
    if sections is None:
        sections = DEFAULT_SECTIONS

    try:
        results = {}

        # Offline mode: parse a local copy of the index page, no API calls.
        if local_index_path:
            logging.info(f"QUACKO.search() - Using local index file: {local_index_path}")

            asset_id, rml_id = _infer_ids_from_local_path(local_index_path)

            # Add these to results for consistency with the online flow.
            results["assetExternalId"] = asset_id
            results["rmlId"] = rml_id

            section_links = get_section_links_from_file(local_index_path, asset_id, rml_id, sections)
            if section_links:
                results.update(section_links)

            return results

        # Step 1: resolve the identifier to a substance record.
        substance_data = get_substance_by_identifier(substance)
        if not substance_data:
            return False

        # Verify that the found substance matches the input identifier.
        if substance_data.get(input_type) != substance:
            error_msg = (f"Search error: results[{input_type}] (\"{substance_data.get(input_type)}\") "
                         f"is not equal to \"{substance}\". Maybe you specified the wrong input_type. "
                         f"Check the results here: {substance_data.get('search_response')}")
            logging.error(f"QUACKO.search(): {error_msg}")
            return error_msg

        # Step 2: find dossiers for the substance.
        rml_id = substance_data["rmlId"]
        dossier_data = get_dossier_by_rml_id(rml_id, substance)
        if not dossier_data:
            return False

        # Merge substance and dossier data.
        results = {**substance_data, **dossier_data}

        # Step 3: extract detailed section links from the dossier index page.
        asset_external_id = dossier_data["assetExternalId"]
        section_links = get_section_links_from_index(asset_external_id, rml_id, sections)
        if section_links:
            results.update(section_links)

        logging.info(f"QUACKO.search() OK. output: {json.dumps(results)}")
        return results

    except Exception as e:
        # Top-level boundary: log and signal failure rather than propagate.
        logging.error(f"QUACKO.search(): Unexpected error in search_dossier for '{substance}': {str(e)}")
        return False


def _infer_ids_from_local_path(local_index_path: str) -> tuple:
    """
    Derive a (asset_id, rml_id) pair from a local index.html path.

    If the path follows the ".../html-pages/ASSET_ID/index.html" layout the
    real asset id is recovered (with a placeholder rml id); otherwise both
    values fall back to the "local" placeholder.
    """
    if '/' not in local_index_path:
        return "local", "local"

    path_parts = local_index_path.split('/')
    if 'html-pages' in path_parts and 'index.html' in path_parts[-1]:
        asset_id = path_parts[path_parts.index('html-pages') + 1]
        return asset_id, "extracted"  # rml id is just a placeholder here

    return "local", "local"
|
|
|
|
|
|
def get_substance_by_identifier(substance: str) -> Optional[Dict[str, str]]:
    """
    Search the QUACKO database for a substance using the provided identifier.

    Args:
        substance (str): The substance identifier to search for (CAS number, name, etc.)

    Returns:
        Optional[Dict[str, str]]: Dictionary with substance information or None if not found
    """
    # URL-encode the identifier so names with spaces/special characters are safe.
    encoded_substance = urllib.parse.quote(substance)
    search_url = f"{QUACKO_SUBSTANCE_API}?pageIndex=1&pageSize=100&searchText={encoded_substance}"

    logging.info(f'QUACKO.search(). searching "{substance}"')

    try:
        # Timeout guards against the request hanging indefinitely.
        response = requests.get(search_url, timeout=30)
        response.raise_for_status()  # Raise exception for HTTP errors
        data = response.json()

        if not data.get("items"):
            logging.info(f"QUACKO.search() could not find substance for '{substance}'")
            return None

        # The first hit is assumed to be the best match; the caller is
        # expected to verify it against the requested identifier.
        substance_index = data["items"][0]["substanceIndex"]
        result = {
            'search_response': search_url,
            'rmlId': substance_index.get("rmlId", ""),
            'rmlName': substance_index.get("rmlName", ""),
            'rmlCas': substance_index.get("rmlCas", ""),
            'rmlEc': substance_index.get("rmlEc", "")
        }

        logging.info(
            f"QUACKO.search() found substance on QUACKO. "
            f"rmlId: '{result['rmlId']}', rmlName: '{result['rmlName']}', rmlCas: '{result['rmlCas']}'"
        )
        return result

    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while searching for substance '{substance}': {str(e)}")
        return None
    except (KeyError, IndexError) as e:
        logging.error(f"QUACKO.search() - Data parsing error for substance '{substance}': {str(e)}")
        return None
|
|
|
|
|
|
def get_dossier_by_rml_id(rml_id: str, substance_name: str) -> Optional[Dict[str, Any]]:
    """
    Locate a dossier for a substance via its RML ID, preferring active dossiers.

    Active dossiers are queried first; only when none exist is the search
    repeated over inactive dossiers. The returned dict is tagged with a
    "dossierType" key recording which pool it came from.

    Args:
        rml_id (str): The RML ID of the substance
        substance_name (str): The name of the substance (for logging)

    Returns:
        Optional[Dict[str, Any]]: Dictionary with dossier information or None if not found
    """
    active = _query_dossier_api(rml_id, "Active")
    if active:
        logging.info(f"QUACKO.search() - found active dossiers for '{substance_name}'")
        active["dossierType"] = "Active"
        return active

    # No active dossier — fall back to the inactive pool.
    logging.info(
        f"QUACKO.search() - could not find active dossier for '{substance_name}'. "
        "Proceeding to search in the unactive ones."
    )
    inactive = _query_dossier_api(rml_id, "Inactive")
    if not inactive:
        logging.info(f"QUACKO.search() - could not find unactive dossiers for '{substance_name}'")
        return None

    logging.info(f"QUACKO.search() - found unactive dossiers for '{substance_name}'")
    inactive["dossierType"] = "Inactive"
    return inactive
|
|
|
|
|
|
def _query_dossier_api(rml_id: str, status: str) -> Optional[Dict[str, Any]]:
    """
    Helper function to query the QUACKO dossier API for a specific substance and status.

    Args:
        rml_id (str): The RML ID of the substance
        status (str): The status of dossiers to search for ('Active' or 'Inactive')

    Returns:
        Optional[Dict[str, Any]]: Dictionary with dossier information or None if not found
    """
    # BUGFIX: the query string previously contained a literal "®" character
    # where "&reg" of "&registrationStatuses" had been collapsed into the
    # HTML entity, so the status filter was sent corrupted.
    url = (
        f"{QUACKO_DOSSIER_API}?pageIndex=1&pageSize=100"
        f"&rmlId={rml_id}&registrationStatuses={status}"
    )

    try:
        # Timeout guards against the request hanging indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        if not data.get("items"):
            return None

        first_item = data["items"][0]
        result = {
            "assetExternalId": first_item["assetExternalId"],
            "rootKey": first_item["rootKey"],
        }

        # Extract last update date if available (ISO timestamp, possibly 'Z'-suffixed;
        # the replace() keeps fromisoformat happy on Python < 3.11).
        try:
            last_update = first_item["lastUpdatedDate"]
            datetime_object = datetime.fromisoformat(last_update.replace('Z', '+00:00'))
            result['lastUpdateDate'] = datetime_object.date().isoformat()
        except (KeyError, ValueError) as e:
            logging.error(f"QUACKO.search() - Error extracting lastUpdateDate: {str(e)}")

        # Human-readable and JS-app URLs for the dossier index page.
        result["index"] = f"{QUACKO_HTML_PAGES}/{result['assetExternalId']}/index.html"
        result["index_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{result['assetExternalId']}"

        return result

    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while getting dossiers for RML ID '{rml_id}': {str(e)}")
        return None
    except (KeyError, IndexError) as e:
        logging.error(f"QUACKO.search() - Data parsing error for RML ID '{rml_id}': {str(e)}")
        return None
|
|
|
|
|
|
def get_section_links_from_index(
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Extract links to specified sections from the dossier index page by downloading it.

    Args:
        asset_id (str): The asset external ID of the dossier
        rml_id (str): The RML ID of the substance
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
        (empty on any download or parsing failure).
    """
    index_url = f"{QUACKO_HTML_PAGES}/{asset_id}/index.html"

    try:
        # Timeout prevents an unresponsive server from blocking the search forever.
        response = requests.get(index_url, timeout=30)
        response.raise_for_status()

        # Parse content using the shared method.
        return parse_sections_from_html(response.text, asset_id, rml_id, sections)

    except requests.RequestException as e:
        logging.error(f"QUACKO.search() - Request error while extracting section links: {str(e)}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error extracting section links: {str(e)}")
        return {}
|
|
|
|
|
|
def get_section_links_from_file(
    file_path: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Extract links to specified sections from a local index.html file.

    Args:
        file_path (str): Path to the local index.html file
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
        (empty if the file is missing or cannot be parsed).
    """
    try:
        # EAFP: just try to open the file; a missing file is handled by the
        # FileNotFoundError branch below. (The previous existence pre-check
        # used `os` without importing it, so every call raised NameError,
        # was swallowed by the broad except, and returned {}.)
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # Parse content using the shared method.
        return parse_sections_from_html(html_content, asset_id, rml_id, sections)

    except FileNotFoundError:
        logging.error(f"QUACKO.search() - File not found: {file_path}")
        return {}
    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing local file {file_path}: {str(e)}")
        return {}
|
|
|
|
|
|
def parse_sections_from_html(
    html_content: str,
    asset_id: str,
    rml_id: str,
    sections: Dict[str, str]
) -> Dict[str, str]:
    """
    Scan HTML content for the requested dossier sections and collect their links.

    Each entry of ``sections`` maps a section div id to the output key under
    which its link should be reported; hits and misses are both logged.

    Args:
        html_content (str): HTML content to parse
        asset_id (str): The asset external ID to use for constructing URLs
        rml_id (str): The RML ID to use for constructing URLs
        sections (Dict[str, str]): Dictionary mapping section IDs to result keys

    Returns:
        Dict[str, str]: Dictionary with links to the requested sections
        (empty on parse failure).
    """
    collected = {}

    try:
        document = BeautifulSoup(html_content, "html.parser")

        for div_id, label in sections.items():
            found = extract_section_links(document, div_id, asset_id, rml_id, label)
            if not found:
                logging.info(f"QUACKO.search() - Section '{label}' not found in document")
                continue
            collected.update(found)
            logging.info(f"QUACKO.search() - Found section '{label}' in document")

        return collected

    except Exception as e:
        logging.error(f"QUACKO.search() - Error parsing HTML content: {str(e)}")
        return {}
|
|
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# Function to Extract Section Links with Validation
|
|
# --------------------------------------------------------------------------
|
|
# This function extracts the document link associated with a specific section ID
|
|
# from the QUACKO index.html page structure.
|
|
#
|
|
# Problem Solved:
|
|
# Previous attempts faced issues where searching for a link within a parent
|
|
# section's div (e.g., "7 Toxicological Information" with id="id_7_...")
|
|
# would incorrectly grab the link belonging to the *first child* section
|
|
# (e.g., "7.2 Acute Toxicity" with id="id_72_..."). This happened because
|
|
# the simple `find("a", href=True)` doesn't distinguish ownership when nested.
|
|
#
|
|
# Solution Logic:
|
|
# 1. Find Target Div: Locate the `div` element using the specific `section_id` provided.
|
|
# This div typically contains the section's content or nested subsections.
|
|
# 2. Find First Link: Find the very first `<a>` tag that has an `href` attribute
|
|
# somewhere *inside* the `target_div`.
|
|
# 3. Find Link's Owning Section Div: Starting from the `first_link_tag`, traverse
|
|
# up the HTML tree using `find_parent()` to find the nearest ancestor `div`
|
|
# whose `id` attribute starts with "id_" (the pattern for section containers).
|
|
# 4. Validate Ownership: Compare the `id` of the `link_ancestor_section_div` found
|
|
# in step 3 with the original `section_id` passed into the function.
|
|
# 5. Decision:
|
|
# - If the IDs MATCH: It confirms that the `first_link_tag` truly belongs to the
|
|
# `section_id` we are querying. The function proceeds to extract and format
|
|
# this link.
|
|
# - If the IDs DO NOT MATCH: It indicates that the first link found actually
|
|
# belongs to a *nested* subsection div. Therefore, the original `section_id`
|
|
# (the parent/container) does not have its own direct link, and the function
|
|
# correctly returns an empty dictionary for this `section_id`.
|
|
#
|
|
# This validation step ensures that we only return links that are directly
|
|
# associated with the queried section ID, preventing the inheritance bug.
|
|
# --------------------------------------------------------------------------
|
|
def extract_section_links(
    soup: BeautifulSoup,
    section_id: str,
    asset_id: str,
    rml_id: str,
    section_name: str
) -> Dict[str, str]:
    """
    Extracts a link for a specific section ID by finding the first link
    within its div and verifying that the link belongs directly to that
    section, not a nested subsection.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the index page.
        section_id (str): The HTML ID of the section div.
        asset_id (str): The asset external ID of the dossier.
        rml_id (str): The RML ID of the substance.
        section_name (str): The name to use for the section in the result.

    Returns:
        Dict[str, str]: Dictionary with link if found and validated,
                        otherwise empty.
    """
    result = {}

    # 1. Find the target div for the section ID.
    target_div = soup.find("div", id=section_id)
    if not target_div:
        logging.info(f"QUACKO.search() - extract_section_links(): No div found for id='{section_id}'")
        return result

    # 2. Find the first <a> tag with an href anywhere within this target div.
    first_link_tag = target_div.find("a", href=True)
    if not first_link_tag:
        logging.info(f"QUACKO.search() - extract_section_links(): No 'a' tag with href found within div id='{section_id}'")
        return result  # No links at all within this section

    # 3. Validate ownership: walk up from the link to the nearest section
    #    container (a div whose id starts with "id_"). If that container is
    #    not the div we queried, the link belongs to a nested subsection and
    #    the queried section has no direct link of its own.
    link_ancestor_section_div: Optional[Tag] = first_link_tag.find_parent(
        "div", id=lambda x: x and x.startswith("id_")
    )

    # 4. Compare IDs — mismatch (or no ancestor found) means no direct link.
    if not (link_ancestor_section_div and link_ancestor_section_div.get('id') == section_id):
        ancestor_id = link_ancestor_section_div.get('id') if link_ancestor_section_div else "None"
        logging.info(f"QUACKO.search() - extract_section_links(): First link within id='{section_id}' belongs to ancestor id='{ancestor_id}'. No direct link for '{section_id}'.")
        return result  # Return empty dict

    logging.debug(f"QUACKO.search() - extract_section_links(): Valid link found for id='{section_id}'.")

    # 5. Proceed with link extraction using the validated link tag.
    try:
        document_id = first_link_tag.get('href')  # Use .get() for safety
        if not document_id:
            logging.error(f"QUACKO.search() - extract_section_links(): Found 'a' tag for '{section_name}' has no href attribute.")
            return {}

        # BUGFIX: strip only the leading "./documents/" and the trailing
        # ".html" — the previous str.replace() calls removed *every*
        # occurrence of those substrings, which would corrupt document ids
        # that happen to contain them elsewhere.
        if document_id.startswith('./documents/'):
            document_id = document_id[len('./documents/'):]
        if document_id.endswith('.html'):
            document_id = document_id[:-len('.html')]

        # Construct the full URLs unless in local-only mode.
        if asset_id == "local" and rml_id == "local":
            result[section_name] = f"Local section found: {document_id}"
        else:
            result[section_name] = f"{QUACKO_HTML_PAGES}/{asset_id}/documents/{document_id}.html"
            result[f"{section_name}_js"] = f"{QUACKO_BASE_URL}/{rml_id}/dossier-view/{asset_id}/{document_id}"

        return result

    except Exception as e:  # Catch potential errors during processing
        logging.error(f"QUACKO.search() - extract_section_links(): Error processing the validated link tag for '{section_name}': {str(e)}")
        return {}