import os
import requests
import json
import re
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from playwright.sync_api import sync_playwright
from pif_compiler.services.common_log import get_logger
from pif_compiler.services.db_utils import db_connect

log = get_logger()
load_dotenv()

# --- ECHA CHEM endpoint constants --------------------------------------------
BASE_SEARCH = "https://chem.echa.europa.eu/api-substance/v1/substance?pageIndex=1&pageSize=100&searchText="
BASE_DOSSIER = "https://chem.echa.europa.eu/api-dossier-list/v1/dossier?pageIndex=1&pageSize=100&rmlId="
SUBSTANCE_SUMMARY = "https://chem.echa.europa.eu/api-substance/v1/substance/"  # + id
CLASSIFICATION_ID = "https://chem.echa.europa.eu/api-cnl-inventory/prominent/overview/classifications/harmonised/459160"
# Example document URLs (external asset id + root key); kept for reference.
TOXICOLOGICAL_INFO = "https://chem.echa.europa.eu/html-pages-prod/e4c88c6e-06c7-4daa-b0fb-1a55459ac22f/documents/IUC5-5f55d8ec-7a71-4e2c-9955-8469ead9fe84_0035f3f8-7467-4944-9028-1db2e9c99565.html"  # external + rootkey
REPEATED_DOSE = "https://chem.echa.europa.eu/html-pages-prod/e4c88c6e-06c7-4daa-b0fb-1a55459ac22f/documents/IUC5-82402b09-8d8f-495c-b673-95b205be60e0_0035f3f8-7467-4944-9028-1db2e9c99565.html"

# Query-string fragments appended to BASE_DOSSIER.
# FIX: the leading "&reg" of both fragments had been corrupted into the "®"
# character (HTML-entity mangling), which produced broken query strings.
# Restored to the literal "&registrationStatuses=" parameter (the dossier JSON
# read in get_dossier_info uses the matching "registrationStatus" field name).
active = "&registrationStatuses=Active"
inactive = "&registrationStatuses=Inactive"
legislation = "&legislation=REACH"

# Network timeout (seconds) for every ECHA request, so a stalled endpoint
# cannot hang the whole flow. FIX: the original calls had no timeout at all.
REQUEST_TIMEOUT = 30

#region ECHA scraping functions
def search_substance(cas: str) -> dict:
    """Look up a substance on ECHA CHEM by exact CAS number.

    NOTE(review): a second ``search_substance`` (the orchestrator entry point)
    is defined later in this module and shadows this name at import time;
    internal callers must use the ``_echa_search_substance`` alias captured
    before that re-definition.

    :param cas: CAS registry number, with dashes, as ECHA indexes it.
    :return: dict with rmlCas / rmlId / rmlEc / rmlName, or {} on any failure.
    """
    response = requests.get(BASE_SEARCH + cas, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        log.error(f"Network error: {response.status_code}")
        return {}
    payload = response.json()
    if payload['state']['totalItems'] == 0:
        log.info(f"No substance found for CAS {cas}")
        return {}
    # The search is fuzzy; keep only the result whose CAS matches exactly.
    for result in payload['items']:
        index = result["substanceIndex"]
        if index["rmlCas"] == cas:
            # FIX: the original dict literal listed the "rmlId" key twice.
            return {
                "rmlCas": index["rmlCas"],
                "rmlId": index["rmlId"],
                "rmlEc": index["rmlEc"],
                "rmlName": index["rmlName"],
            }
    # FIX: replaced the placeholder-less f-string "Something went wrong".
    log.error(f"No exact CAS match for {cas} among search results")
    return {}


def get_dossier_info(rmlId: str) -> dict:
    """Fetch the first ACTIVE REACH dossier listed for an rmlId.

    Only the first item of the dossier list is used.

    :param rmlId: ECHA substance rmlId (from search_substance).
    :return: dict of dossier metadata, or {} when nothing is found / on error.
    """
    url = BASE_DOSSIER + rmlId + active + legislation
    response_dossier = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response_dossier.status_code != 200:
        log.error(f"Network error: {response_dossier.status_code}")
        return {}
    response_dossier_json = response_dossier.json()
    if response_dossier_json['state']['totalItems'] == 0:
        log.info(f"No dossier found for RML ID {rmlId}")
        return {}
    first = response_dossier_json['items'][0]  # hoisted: was repeated 6 times
    return {
        "lastUpdatedDate": first['lastUpdatedDate'],
        "registrationStatus": first['registrationStatus'],
        "registrationStatusChangedDate": first['registrationStatusChangedDate'],
        "registrationRole": first['reachDossierInfo']['registrationRole'],
        "assetExternalId": first['assetExternalId'],
        "rootKey": first['rootKey'],
    }


def _section_link(soup: BeautifulSoup, div_id: str, base: str) -> str:
    """Resolve the first leaf-document link inside index section *div_id*.

    Raises LookupError when the section or its link is missing; callers of
    get_substance_index (echa_flow) already catch any Exception, matching the
    original behavior of raising AttributeError on a missing section.
    """
    section_div = soup.find('div', id=div_id)
    if section_div is None:
        raise LookupError(f"Index section '{div_id}' not found")
    leaf = section_div.find('a', class_='das-leaf')
    if leaf is None:
        raise LookupError(f"No das-leaf link inside section '{div_id}'")
    return base + leaf['href'] + '.html'


def get_substance_index(assetExternalId: str) -> dict:
    """Scrape a dossier's index.html for the toxicology document links.

    :param assetExternalId: external asset id from get_dossier_info.
    :return: dict with toxicological_information_link /
             repeated_dose_toxicity_link / acute_toxicity_link,
             or {} on a network error.
    """
    INDEX = "https://chem.echa.europa.eu/html-pages-prod/" + assetExternalId
    LINK_DOSSIER = INDEX + "/documents/"
    response = requests.get(INDEX + "/index.html", timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        log.error(f"Network error: {response.status_code}")
        return {}
    soup = BeautifulSoup(response.content, 'html.parser')
    # FIX: the original repeated the same find/find/['href'] pattern three
    # times; the shared logic now lives in _section_link.
    return {
        'toxicological_information_link':
            _section_link(soup, 'id_7_Toxicologicalinformation', LINK_DOSSIER),
        'repeated_dose_toxicity_link':
            _section_link(soup, 'id_75_Repeateddosetoxicity', LINK_DOSSIER),
        'acute_toxicity_link':
            _section_link(soup, 'id_72_AcuteToxicity', LINK_DOSSIER),
    }
#endregion
#region ECHA parsing functions of html pages
def get_field_name(field_div):
    """Extract the field name from the class attribute of the label div.

    The ECHA pages encode the field name as the one CSS class on the label
    that is not a generic das-* marker class; return it, or None.
    """
    label_div = field_div.find('div', class_='das-field_label')
    if not label_div:
        return None
    for cls in label_div.get('class', []):
        if cls not in ('das-field_label', 'das-empty-value', 'das-empty-label'):
            return cls
    return None


def extract_field_value(field_div):
    """Extract the value from a das-field div as ``{field_name: value}``.

    Handles, in priority order: redacted values (dropped), empty markers,
    pick-lists, physical quantities (value + unit), checkboxes,
    decimal/text fields, embedded HTML, and finally plain text.
    Returns None for fields that should be skipped entirely.
    """
    field_name = get_field_name(field_div)
    if not field_name:
        return None
    # Skip OriginalStudy fields
    if field_name == 'OriginalStudy':
        return None
    value_div = field_div.find('div', class_='das-field_value')
    if not value_div:
        return None
    # Exclude redacted / not-publishable values entirely.
    if value_div.find('span', class_='das-redacted-value'):
        return None
    # Empty marker -> empty string.  FIX: the original re-checked for a
    # redacted span here, but that case has already returned None above.
    if value_div.find('span', class_='das-empty-value'):
        return {field_name: ""}
    # Pick-list value: the displayed phrase text.
    pick_list = value_div.find('span', class_='das-field_value_pick-list')
    if pick_list:
        phrase = pick_list.find('span', class_='phrase')
        if phrase:
            return {field_name: phrase.get_text(strip=True)}
        if pick_list.find('span', class_='das-empty-value'):
            return {field_name: ""}
        # No phrase and no empty marker: fall through to the generic cases.
    # Physical quantity: {"value": ..., "unit": ...}.
    quantity = value_div.find('span', class_='i6PhysicalQuantity')
    if quantity:
        value_span = quantity.find('span', class_='value')
        unit_span = quantity.find('span', class_='unit')
        value_text = value_span.get_text(strip=True) if value_span else ""
        unit_text = ""
        if unit_span:
            unit_phrase = unit_span.find('span', class_='phrase')
            if unit_phrase:
                unit_text = unit_phrase.get_text(strip=True)
        if value_text:
            return {field_name: {"value": value_text, "unit": unit_text}}
        return {field_name: ""}
    # Checkbox -> bool (True when the checked marker is present).
    checkbox_checked = value_div.find('span', class_='das-value_checkbox-checked')
    checkbox_unchecked = value_div.find('span', class_='das-value_checkbox-unchecked')
    if checkbox_checked is not None or checkbox_unchecked is not None:
        return {field_name: checkbox_checked is not None}
    # Decimal / plain-text field.
    field_classes = field_div.get('class', [])
    if 'das-field_decimal' in field_classes or 'das-field_text' in field_classes:
        text = value_div.get_text(strip=True)
        if '[Empty]' in text or not text:
            return {field_name: ""}
        return {field_name: text}
    # Embedded HTML content.
    html_content = value_div.find('div', class_='das-field_value_html')
    if html_content:
        text = html_content.get_text(separator=' ', strip=True)
        text = re.sub(r'\[Empty\]', '', text).strip()
        return {field_name: text if text else ""}
    # Default: whatever text is left, with the [Empty] placeholder removed.
    text = value_div.get_text(strip=True)
    text = re.sub(r'\[Empty\]', '', text).strip()
    return {field_name: text if text else ""}


def extract_table_data(table):
    """Extract an HTML table as a list of {header: cell_text} row dicts.

    The first row supplies the headers; rows whose cell count does not match
    the header count (e.g. colspan spacer rows) are skipped.
    """
    rows = table.find_all('tr')
    if len(rows) < 2:
        return []
    headers = [th.get_text(strip=True) for th in rows[0].find_all('td')]
    data = []
    for row in rows[1:]:
        cells = row.find_all('td')
        # Skip single-cell colspan separator rows.
        if len(cells) == 1 and cells[0].get('colspan'):
            continue
        if len(cells) == len(headers):
            data.append({headers[i]: cell.get_text(strip=True)
                         for i, cell in enumerate(cells)})
    return data


def extract_section(section):
    """Recursively extract label, fields, tables and subsections of a das-block."""
    section_data = {}
    label_h3 = section.find('h3', class_='das-block_label', recursive=False)
    if label_h3:
        section_data['label'] = label_h3.get_text(strip=True)
    # Direct (non-nested) fields of this section.
    for field in section.find_all('div', class_='das-field', recursive=False):
        field_data = extract_field_value(field)
        if field_data:
            section_data.update(field_data)
    # Direct tables; numbered keys only when there is more than one.
    tables = section.find_all('table', recursive=False)
    for i, table in enumerate(tables):
        table_data = extract_table_data(table)
        if table_data:
            table_key = f'table_{i+1}' if len(tables) > 1 else 'table'
            section_data[table_key] = table_data
    # Nested das-block sections, recursively.
    nested_sections = section.find_all('section', class_='das-block', recursive=False)
    if nested_sections:
        section_data['subsections'] = []
        for nested in nested_sections:
            nested_data = extract_section(nested)
            if nested_data:
                section_data['subsections'].append(nested_data)
    return section_data


def parse_toxicology_html(html_content):
    """Parse a toxicological HTML document into {document_title, sections}."""
    soup = BeautifulSoup(html_content, 'html.parser')
    result = {}
    title = soup.find('h4', class_='document-header')
    if title:
        result['document_title'] = title.get_text(strip=True)
    article = soup.find('article', class_='das-document')
    if not article:
        return result
    result['sections'] = []
    for section in article.find_all('section', class_='das-block', recursive=False):
        section_data = extract_section(section)
        if section_data:
            result['sections'].append(section_data)
    return result
#endregion

#region PDF extraction functions
def generate_pdf_from_toxicology_info(index: dict):
    """Render the toxicological-information page to pdfs/<CAS>.pdf.

    NOTE(review): the original read BOTH a top-level
    'toxicological_information_link' AND a nested ['substance']['rmlCas'],
    a shape that matches neither get_substance_index's output nor
    echa_flow's result exactly — so it also accepts the echa_flow result
    dict, where the link is nested under 'index'.  TODO confirm the
    intended caller.
    """
    link = index.get('toxicological_information_link') \
        or index.get('index', {}).get('toxicological_information_link')
    cas = index["substance"]["rmlCas"]
    # FIX: the original crashed when the pdfs/ directory did not exist.
    os.makedirs('pdfs', exist_ok=True)
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(link)
            page.pdf(path=f'pdfs/{cas}.pdf')
        finally:
            # FIX: close the browser even if navigation/printing fails.
            browser.close()
#endregion

#region Orchestrator functions
# FIX (critical): the orchestrator `search_substance` defined at the bottom of
# this module shadows the ECHA scraper of the same name at import time, so
# echa_flow's call to `search_substance` resolved to the orchestrator and the
# two recursed into each other forever on any cache miss.  Capture the scraper
# here, before the re-definition executes.
_echa_search_substance = search_substance

_HTTP_TIMEOUT = 30  # seconds; a stalled ECHA endpoint must not hang the flow


def _fetch_parsed(link):
    """GET *link* and parse it; {} when the link is missing or the GET fails."""
    if not link:
        return {}
    response = requests.get(link, timeout=_HTTP_TIMEOUT)
    if response.status_code != 200:
        return {}
    return parse_toxicology_html(response.content)


def echa_flow(cas) -> dict:
    """Run the full ECHA extraction: substance -> dossier -> index -> documents.

    :param cas: validated CAS number.
    :return: combined result dict, {} when the initial lookups fail, or
             False when any collected part is empty (completeness check).
             NOTE(review): the False return contradicts the -> dict
             annotation; kept because callers test truthiness only.
    """
    try:
        substance = _echa_search_substance(cas)
        dossier_info = get_dossier_info(substance['rmlId'])
        index = get_substance_index(dossier_info['assetExternalId'])
    except Exception as e:
        log.error(f"Error in ECHA flow for CAS {cas}: {e}")
        return {}
    result = {
        "substance": substance,
        "dossier_info": dossier_info,
        "index": index,
        # FIX: the three identical fetch-and-parse stanzas now share _fetch_parsed.
        "toxicological_information": _fetch_parsed(index.get('toxicological_information_link')),
        "acute_toxicity": _fetch_parsed(index.get('acute_toxicity_link')),
        "repeated_dose_toxicity": _fetch_parsed(index.get('repeated_dose_toxicity_link')),
    }
    # Completeness check: reject the whole result if any part came back empty.
    for value in result.values():
        if value is None or value == "" or value == [] or value == {}:
            return False
    return result


def cas_validation(cas: str) -> "str | None":
    """Lightly validate a CAS number: digits-and-dashes, at most 12 digits.

    NOTE(review): real CAS numbers have at most 10 digits and a checksum;
    this check is deliberately loose ("maybe is valid").

    :return: the stripped CAS string, or None when invalid/missing.
    """
    log.info(f"Starting ECHA data extraction for CAS: {cas}")
    if cas is None or cas.strip() == "":
        log.error("No CAS number provided.")
        return None
    digits_only = cas.replace("-", "")
    if digits_only.isdigit() and len(digits_only) <= 12:
        log.info(f"CAS number {cas} maybe is valid.")
        return cas.strip()
    log.error(f"CAS number {cas} is not valid.")
    return None


def check_local(cas: str) -> "dict | None":
    """Return the cached MongoDB document for *cas*, or None.

    FIX: the original annotated -> bool but actually returns the record
    dict or None.  NOTE(review): the Mongo client from db_connect is never
    closed here — presumably db_connect manages its lifetime; verify.
    """
    client, db, collection = db_connect()
    if not collection:
        log.error("No MongoDB collection available.")
        return None
    record = collection.find_one({"substance.rmlCas": cas})
    if record:
        log.info(f"Record for CAS {cas} found in local database.")
        return record
    log.info(f"No record for CAS {cas} found in local database.")
    return None


def add_to_local(data: dict) -> bool:
    """Insert a fresh ECHA result into the local MongoDB cache.

    :return: True on success, False when the collection is unavailable or
             the insert raises.
    """
    client, db, collection = db_connect()
    if not collection:
        log.error("No MongoDB collection available.")
        return False
    try:
        collection.insert_one(data)
        log.info(f"Data for CAS {data['substance']['rmlCas']} added to local database.")
        return True
    except Exception as e:
        log.error(f"Error inserting data into MongoDB: {e}")
        return False


def search_substance(cas: str) -> "dict | None":
    """Public entry point: validate the CAS, serve from cache, else scrape ECHA.

    NOTE(review): this re-definition shadows the ECHA scraper of the same name
    defined earlier in the module; the scraper remains reachable inside
    echa_flow through the _echa_search_substance alias captured above.

    :return: the cached Mongo record or fresh ECHA result dict, or None.
    """
    cas_validated = cas_validation(cas)
    if not cas_validated:
        return None
    local_record = check_local(cas_validated)
    if local_record:
        return local_record
    echa_data = echa_flow(cas_validated)
    if echa_data:
        add_to_local(echa_data)
        return echa_data
    log.error(f"Failed to retrieve ECHA data for CAS {cas}.")
    return None

# TODO: check if the stored document is complete
# TODO: compare lastUpdatedDate and refresh stale records
#endregion