import marimo

__generated_with = "0.16.5"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import urllib.parse
    import re as standardre
    import json
    from bs4 import BeautifulSoup
    import requests
    return BeautifulSoup, mo, requests, urllib


@app.cell
def _():
    from pif_compiler.services.common_log import get_logger

    log = get_logger()
    return (log,)


@app.cell
def _(log):
    log.info("testing with marimo")
    return


@app.cell
def _():
    # CAS number used throughout the exploratory cells below.
    cas_test = "100-41-4"
    return (cas_test,)


@app.cell
def _(cas_test, urllib):
    # Displays the URL-quoted form of the CAS number.
    urllib.parse.quote(cas_test)
    return


@app.cell
def _():
    # ECHA CHEM endpoints used by this notebook.
    BASE_SEARCH = "https://chem.echa.europa.eu/api-substance/v1/substance?pageIndex=1&pageSize=100&searchText="
    BASE_DOSSIER_LIST = "https://chem.echa.europa.eu/api-dossier-list/v1/dossier?pageIndex=1&pageSize=100&rmlId="
    SUBSTANCE_SUMMARY = "https://chem.echa.europa.eu/api-substance/v1/substance/"  # + id
    CLASSIFICATION_ID = "https://chem.echa.europa.eu/api-cnl-inventory/prominent/overview/classifications/harmonised/459160"
    TOXICOLOGICAL_INFO = "https://chem.echa.europa.eu/html-pages-prod/e4c88c6e-06c7-4daa-b0fb-1a55459ac22f/documents/IUC5-5f55d8ec-7a71-4e2c-9955-8469ead9fe84_0035f3f8-7467-4944-9028-1db2e9c99565.html"  # external + rootkey
    REPEATED_DOSE = "https://chem.echa.europa.eu/html-pages-prod/e4c88c6e-06c7-4daa-b0fb-1a55459ac22f/documents/IUC5-82402b09-8d8f-495c-b673-95b205be60e0_0035f3f8-7467-4944-9028-1db2e9c99565.html"

    # Query-string fragments appended to BASE_DOSSIER_LIST + rmlId.
    # The leading "&reg" had been decoded as the HTML entity for the
    # registered-trademark sign, which corrupted the parameter name.
    active = "&registrationStatuses=Active"
    inactive = "&registrationStatuses=Inactive"
    legislation = "&legislation=REACH"
    return BASE_DOSSIER_LIST, BASE_SEARCH, active, legislation


@app.cell
def _(BASE_SEARCH, cas_test, requests):
    test_search_request = requests.get(BASE_SEARCH + cas_test)
    return (test_search_request,)


@app.cell
def _(test_search_request):
    response = test_search_request.json()
    return (response,)


@app.cell
def _(test_search_request):
    test_search_request.json()
    return


@app.cell
def _(cas_test, response):
    # Keep only the search hit whose CAS number matches exactly.
    substance = {}
    for result in response['items']:
        if result["substanceIndex"]["rmlCas"] == cas_test:
            substance["rmlCas"] = result["substanceIndex"]["rmlCas"]
            substance["rmlId"] = result["substanceIndex"]["rmlId"]
            substance["rmlEc"] = result["substanceIndex"]["rmlEc"]
            substance["rmlName"] = result["substanceIndex"]["rmlName"]
    return (substance,)


@app.cell
def _(substance):
    substance
    return


@app.cell
def _(BASE_DOSSIER_LIST, active, substance):
    url = BASE_DOSSIER_LIST + substance['rmlId'] + active
    url
    return


@app.cell
def _(BASE_DOSSIER_LIST, active, legislation, requests, substance):
    response_dossier = requests.get(
        BASE_DOSSIER_LIST + substance['rmlId'] + active + legislation
    )
    return (response_dossier,)


@app.cell
def _(response_dossier):
    response_dossier_json = response_dossier.json()
    response_dossier_json
    return (response_dossier_json,)


@app.cell
def _(response_dossier_json, substance):
    # Enrich the substance record with fields from the first dossier returned.
    _first_dossier = response_dossier_json['items'][0]
    substance['lastUpdatedDate'] = _first_dossier['lastUpdatedDate']
    substance['registrationStatus'] = _first_dossier['registrationStatus']
    substance['registrationStatusChangedDate'] = _first_dossier['registrationStatusChangedDate']
    substance['registrationRole'] = _first_dossier['reachDossierInfo']['registrationRole']
    substance['assetExternalId'] = _first_dossier['assetExternalId']
    substance['rootKey'] = _first_dossier['rootKey']
    substance
    return


@app.cell
def _():
    from pif_compiler.services.mongo_conn import get_client

    client = get_client()
    db = client.get_database(name="toxinfo")
    return (db,)


@app.cell
def _(db):
    collection = db.get_collection("substance_index")
    # Underscore-prefixed so the name stays local to this cell and does not
    # shadow the builtin `list` for the rest of the notebook.
    _collection_names = db.list_collection_names()
    print(_collection_names)
    return (collection,)


@app.cell
def _(cas_test, collection, substance):
    # Insert the substance only if no record with this CAS number exists yet.
    sub = collection.find_one({"rmlCas": cas_test})
    if not sub:
        collection.insert_one(substance)
    return


@app.cell
def _(response_dossier_json):
    # Read the id from the dossier response: no cell defines a standalone
    # `assetExternalId` variable, so referencing one would fail.
    INDEX_HTML = (
        "https://chem.echa.europa.eu/html-pages/"
        + response_dossier_json['items'][0]['assetExternalId']
        + "/index.html"
    )
    return


@app.cell
def _(BASE_SEARCH, log, requests, urllib):
    def search_substance(cas: str) -> dict:
        """Search the substance endpoint for an exact CAS match.

        Args:
            cas: CAS number to search for.

        Returns:
            A dict with the keys ``rmlCas``, ``rmlId``, ``rmlEc`` and
            ``rmlName`` for the first item whose ``rmlCas`` equals ``cas``.
            An empty dict is returned when the HTTP status is not 200, when
            the response reports zero items, or when no item matches exactly.
        """
        search_response = requests.get(BASE_SEARCH + urllib.parse.quote(cas))
        # Check the HTTP status on the response object before decoding it;
        # the decoded JSON payload has no status code.
        if search_response.status_code != 200:
            log.error(f"Network error: {search_response.status_code}")
            return {}

        payload = search_response.json()
        if payload['totalItems'] == 0:
            log.info(f"No substance found for CAS {cas}")
            return {}

        for result in payload['items']:
            index_entry = result["substanceIndex"]
            if index_entry["rmlCas"] == cas:
                return {
                    "rmlCas": index_entry["rmlCas"],
                    "rmlId": index_entry["rmlId"],
                    "rmlEc": index_entry["rmlEc"],
                    "rmlName": index_entry["rmlName"],
                }

        # Items were returned but none had exactly this CAS number.
        log.error("Something went wrong")
        return {}
    return


@app.cell
def _(BASE_DOSSIER_LIST, active, legislation, log, requests):
    def get_dossier_info(rmlId: str) -> dict:
        """Fetch registration details from the first active REACH dossier.

        Args:
            rmlId: Substance identifier appended to the dossier-list URL.

        Returns:
            A dict with ``lastUpdatedDate``, ``registrationStatus``,
            ``registrationStatusChangedDate``, ``registrationRole``,
            ``assetExternalId`` and ``rootKey`` taken from the first item of
            the response. An empty dict is returned when the HTTP status is
            not 200 or when the response reports zero items.
        """
        url = BASE_DOSSIER_LIST + rmlId + active + legislation
        response_dossier = requests.get(url)
        if response_dossier.status_code != 200:
            log.error(f"Network error: {response_dossier.status_code}")
            return {}

        response_dossier_json = response_dossier.json()
        if response_dossier_json['totalItems'] == 0:
            log.info(f"No dossier found for RML ID {rmlId}")
            return {}

        first_dossier = response_dossier_json['items'][0]
        dossier_info = {
            "lastUpdatedDate": first_dossier['lastUpdatedDate'],
            "registrationStatus": first_dossier['registrationStatus'],
            "registrationStatusChangedDate": first_dossier['registrationStatusChangedDate'],
            "registrationRole": first_dossier['reachDossierInfo']['registrationRole'],
            "assetExternalId": first_dossier['assetExternalId'],
            "rootKey": first_dossier['rootKey'],
        }
        return dossier_info
    return


@app.cell
def _(BeautifulSoup, log, requests):
    def get_substance_index(assetExternalId: str) -> dict:
        """Collect links to the toxicology sections of a dossier index page.

        Downloads ``<assetExternalId>/index.html`` and, for each known
        section div, takes the ``href`` of its first ``a.das-leaf`` element.

        Args:
            assetExternalId: Dossier asset identifier used to build the URL.

        Returns:
            A dict that may contain ``toxicological_information_link``,
            ``repeated_dose_toxicity_link`` and ``acute_toxicity_link``.
            A section missing from the page is logged and omitted. An empty
            dict is returned when the HTTP status is not 200.
        """
        INDEX = "https://chem.echa.europa.eu/html-pages-prod/" + assetExternalId
        LINK_DOSSIER = INDEX + "/documents/"

        response = requests.get(INDEX + "/index.html")
        if response.status_code != 200:
            log.error(f"Network error: {response.status_code}")
            return {}

        soup = BeautifulSoup(response.content, 'html.parser')

        # (result key, id of the section <div> in the index page)
        sections = (
            ('toxicological_information_link', 'id_7_Toxicologicalinformation'),
            ('repeated_dose_toxicity_link', 'id_75_Repeateddosetoxicity'),
            ('acute_toxicity_link', 'id_72_AcuteToxicity'),
        )

        index_data = {}
        for key, div_id in sections:
            section_div = soup.find('div', id=div_id)
            # find() returns None when nothing matches, so guard each step
            # instead of failing on a dossier that lacks a section.
            leaf_link = None
            if section_div is not None:
                leaf_link = section_div.find('a', class_='das-leaf')
            if leaf_link is None or not leaf_link.get('href'):
                log.info(f"Section {div_id} not found for asset {assetExternalId}")
                continue
            index_data[key] = LINK_DOSSIER + leaf_link['href'] + '.html'

        return index_data

    get_substance_index("e4c88c6e-06c7-4daa-b0fb-1a55459ac22f")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    # Cosa manca da fare

    1. Creare un nuovo orchestratore per la parte search, caching in mongodb e creare un metodo unico per la ricerca
    2. Metodo per validare i json salvati nel database, verificare la data
    3. Creare i metodi per astrarre gli html in json
    4. Creare i test per ciascuna funzione
    5. Creare la documentazione per ciascuna funzione
    """
    )
    return


if __name__ == "__main__":
    app.run()