# cosmoguard-bd/debug_echa_find.py

import marimo

# Version string recorded with the file; presumably the marimo release that
# generated it — TODO confirm.
__generated_with = "0.16.5"
# Notebook application object; every cell below registers itself through the
# @app.cell decorator.
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared imports for the notebook. Only BeautifulSoup, mo, requests and
    # urllib are returned from this cell; `standardre` and `json` are imported
    # here but not returned.
    import marimo as mo
    import urllib.parse
    import re as standardre
    import json
    from bs4 import BeautifulSoup
    import requests
    return BeautifulSoup, mo, requests, urllib


@app.cell
def _():
    # Obtain a logger from the project's logging helper and expose it to the
    # other cells as `log`.
    from pif_compiler.services.common_log import get_logger

    log = get_logger()
    return (log,)


@app.cell
def _(log):
    # Smoke test: emit a single info-level message through the shared logger.
    log.info("testing with marimo")
    return


@app.cell
def _():
    # CAS number used as the test input by the search and storage cells below.
    cas_test = "100-41-4"
    return (cas_test,)


@app.cell
def _(cas_test, urllib):
    # Evaluate the percent-encoded form of the test CAS number as a bare
    # expression (for inspection only); the result is not assigned.
    urllib.parse.quote(cas_test)
    return


@app.cell
def _():
    # ECHA CHEM endpoint constants and query-string fragments used by the
    # request cells of this notebook.

    # Substance search; the search text (a CAS number here) is appended.
    BASE_SEARCH = "https://chem.echa.europa.eu/api-substance/v1/substance?pageIndex=1&pageSize=100&searchText="
    # Dossier listing; the substance's rmlId is appended.
    BASE_DOSSIER_LIST = "https://chem.echa.europa.eu/api-dossier-list/v1/dossier?pageIndex=1&pageSize=100&rmlId="
    # The dossier cells and get_dossier_info take this endpoint under the name
    # BASE_DOSSIER, which was never defined; expose it as an alias so those
    # cells can resolve their input.
    BASE_DOSSIER = BASE_DOSSIER_LIST
    SUBSTANCE_SUMMARY = "https://chem.echa.europa.eu/api-substance/v1/substance/"  # + id
    CLASSIFICATION_ID = "https://chem.echa.europa.eu/api-cnl-inventory/prominent/overview/classifications/harmonised/459160"
    # Sample document URLs; presumably composed from a dossier's
    # assetExternalId and rootKey — TODO confirm.
    TOXICOLOGICAL_INFO = "https://chem.echa.europa.eu/html-pages-prod/e4c88c6e-06c7-4daa-b0fb-1a55459ac22f/documents/IUC5-5f55d8ec-7a71-4e2c-9955-8469ead9fe84_0035f3f8-7467-4944-9028-1db2e9c99565.html"  # external + rootkey
    REPEATED_DOSE = "https://chem.echa.europa.eu/html-pages-prod/e4c88c6e-06c7-4daa-b0fb-1a55459ac22f/documents/IUC5-82402b09-8d8f-495c-b673-95b205be60e0_0035f3f8-7467-4944-9028-1db2e9c99565.html"

    # Optional query-string filters appended to the dossier listing URL.
    active = "&registrationStatuses=Active"
    inactive = "&registrationStatuses=Inactive"
    legislation = "&legislation=REACH"
    return BASE_DOSSIER, BASE_SEARCH, active, legislation


@app.cell
def _(BASE_SEARCH, cas_test, requests):
    # Query the substance search endpoint for the test CAS number. The CAS
    # string is appended to the URL as-is (it is not percent-encoded here).
    test_search_request = requests.get(BASE_SEARCH + cas_test)
    return (test_search_request,)


@app.cell
def _(test_search_request):
    # Decode the search response body as JSON and share it as `response`.
    response = test_search_request.json()
    return (response,)


@app.cell
def _(test_search_request):
    # Evaluate the decoded search response as a bare expression (for
    # inspection only); nothing is stored.
    test_search_request.json()
    return


@app.cell
def _(cas_test, response):
    # Collect the identifying fields of every search hit whose CAS number is
    # an exact match for cas_test (a later match overwrites an earlier one).
    # `substance` stays empty when nothing matches.
    substance = {}

    for result in response['items']:
        _index = result["substanceIndex"]
        if _index["rmlCas"] != cas_test:
            continue
        # Copy the fields in a fixed order: CAS, RML id, EC number, name.
        for _field in ("rmlCas", "rmlId", "rmlEc", "rmlName"):
            substance[_field] = _index[_field]
    return (substance,)


@app.cell
def _(substance):
    # Evaluate the substance record as a bare expression (for inspection only).
    substance
    return


@app.cell
def _(BASE_DOSSIER, active, substance):
    # Preview the dossier-list URL for the substance, filtered to active
    # registrations; the URL is only evaluated, not requested.
    url = BASE_DOSSIER + substance['rmlId'] + active
    url
    return


@app.cell
def _(BASE_DOSSIER, active, legislation, requests, substance):
    # Request the dossier list for the substance's rmlId, with the
    # active-registration and REACH legislation filters appended.
    response_dossier = requests.get(BASE_DOSSIER + substance['rmlId'] + active + legislation)
    return (response_dossier,)


@app.cell
def _(response_dossier):
    # Decode the dossier-list response as JSON, evaluate it for inspection,
    # and share it with the following cells.
    response_dossier_json = response_dossier.json()
    response_dossier_json
    return (response_dossier_json,)


@app.cell
def _(response_dossier_json, substance):
    # Enrich the substance record in place with metadata taken from the first
    # dossier of the dossier-list response.
    _first = response_dossier_json['items'][0]

    for _key in ('lastUpdatedDate', 'registrationStatus', 'registrationStatusChangedDate'):
        substance[_key] = _first[_key]
    # The registration role is nested one level deeper than the other fields.
    substance['registrationRole'] = _first['reachDossierInfo']['registrationRole']
    for _key in ('assetExternalId', 'rootKey'):
        substance[_key] = _first[_key]
    substance
    return


@app.cell
def _():
    # Obtain a client from the project's mongo_conn helper and select the
    # "toxinfo" database (presumably a MongoDB client — TODO confirm).
    from pif_compiler.services.mongo_conn import get_client

    client = get_client()

    db = client.get_database(name="toxinfo")
    return (db,)


@app.cell
def _(db):
    # Select the "substance_index" collection and print the names of all
    # collections in the database as a quick sanity check.
    collection = db.get_collection("substance_index")
    # The names were previously bound to `list`, which rebinds the built-in;
    # the underscore-prefixed name avoids that and is intended to stay local
    # to this cell.
    _collection_names = db.list_collection_names()
    print(_collection_names)
    return (collection,)


@app.cell
def _(cas_test, collection, substance):
    # Store the substance record only when the collection has no document
    # whose rmlCas equals the test CAS number.
    sub = collection.find_one({"rmlCas": cas_test})
    if not sub:
        collection.insert_one(substance)
    return


@app.cell
def _(assetExternalId):
    # Build the URL of a dossier's index page from its asset id.
    # NOTE(review): no cell in this file returns a variable named
    # `assetExternalId` (it only appears as a key of `substance` and as a
    # function parameter), and INDEX_HTML is not returned — confirm whether
    # this cell is still needed.
    # NOTE(review): this path uses "html-pages/" while get_substance_index
    # uses "html-pages-prod/" — confirm which one is correct.
    INDEX_HTML = "https://chem.echa.europa.eu/html-pages/" + assetExternalId + "/index.html"
    return


@app.cell
def _(log, test_search_request):
    def search_substance(cas: str) -> dict:
        """Extract the substance-index entry for ``cas`` from the search response.

        This function does not issue a request of its own: it reads the
        ``test_search_request`` response captured from the notebook, so ``cas``
        only selects which of the returned items is kept.

        Args:
            cas: CAS number compared for exact equality with each item's
                ``substanceIndex.rmlCas``.

        Returns:
            A dict with the keys ``rmlCas``, ``rmlId``, ``rmlEc`` and
            ``rmlName`` for the first matching item, or ``{}`` when the HTTP
            status is not 200, the response reports zero items, or no item
            matches ``cas``.
        """
        # The status code lives on the response object, not on the decoded
        # JSON body, so it has to be checked before calling .json().
        if test_search_request.status_code != 200:
            log.error(f"Network error: {test_search_request.status_code}")
            return {}

        payload = test_search_request.json()
        if payload['totalItems'] == 0:
            log.info(f"No substance found for CAS {cas}")
            return {}

        for result in payload['items']:
            index = result["substanceIndex"]
            if index["rmlCas"] == cas:
                return {
                    "rmlCas": index["rmlCas"],
                    "rmlId": index["rmlId"],
                    "rmlEc": index["rmlEc"],
                    "rmlName": index["rmlName"],
                }

        # Items were returned but none had exactly this CAS number.
        log.error("Something went wrong")
        return {}
    return


@app.cell
def _(BASE_DOSSIER, active, legislation, log, requests):
    def get_dossier_info(rmlId: str) -> dict:
        """Fetch registration metadata of the first dossier listed for ``rmlId``.

        The dossier list is requested with the active-registration and
        legislation filters appended to the URL.

        Args:
            rmlId: Substance identifier appended to the dossier-list URL.

        Returns:
            A dict with ``lastUpdatedDate``, ``registrationStatus``,
            ``registrationStatusChangedDate``, ``registrationRole``,
            ``assetExternalId`` and ``rootKey`` taken from the first listed
            dossier, or ``{}`` when the HTTP status is not 200 or the response
            reports zero items.
        """
        reply = requests.get(BASE_DOSSIER + rmlId + active + legislation)
        if reply.status_code != 200:
            log.error(f"Network error: {reply.status_code}")
            return {}

        payload = reply.json()
        if payload['totalItems'] == 0:
            log.info(f"No dossier found for RML ID {rmlId}")
            return {}

        # Only the first listed dossier is used.
        first = payload['items'][0]
        return {
            "lastUpdatedDate": first['lastUpdatedDate'],
            "registrationStatus": first['registrationStatus'],
            "registrationStatusChangedDate": first['registrationStatusChangedDate'],
            "registrationRole": first['reachDossierInfo']['registrationRole'],
            "assetExternalId": first['assetExternalId'],
            "rootKey": first['rootKey'],
        }
    return


@app.cell
def _(BeautifulSoup, log, requests):
    def get_substance_index(assetExternalId: str) -> dict:
        """Collect document links from a dossier's ``index.html`` page.

        Downloads the index page for ``assetExternalId`` and, for each section
        of interest, takes the ``href`` of the first ``a.das-leaf`` anchor
        inside the section's ``div`` and turns it into a document URL.

        Args:
            assetExternalId: Dossier asset identifier used in the page URL.

        Returns:
            A dict that may contain ``toxicological_information_link``,
            ``repeated_dose_toxicity_link`` and ``acute_toxicity_link``. A
            section missing from the page is logged and left out of the
            result. ``{}`` is returned when the HTTP status is not 200.
        """
        INDEX = "https://chem.echa.europa.eu/html-pages-prod/" + assetExternalId
        LINK_DOSSIER = INDEX + "/documents/"

        # (result key, id of the section <div> in index.html)
        sections = (
            ("toxicological_information_link", "id_7_Toxicologicalinformation"),
            ("repeated_dose_toxicity_link", "id_75_Repeateddosetoxicity"),
            ("acute_toxicity_link", "id_72_AcuteToxicity"),
        )

        response = requests.get(INDEX + "/index.html")
        if response.status_code != 200:
            log.error(f"Network error: {response.status_code}")
            return {}

        soup = BeautifulSoup(response.content, 'html.parser')
        index_data = {}

        for key, div_id in sections:
            # find() returns None when nothing matches, so each step is
            # guarded instead of dereferencing the result blindly.
            section_div = soup.find('div', id=div_id)
            leaf = None
            if section_div is not None:
                leaf = section_div.find('a', class_='das-leaf')
            href = leaf.get('href') if leaf is not None else None
            if not href:
                log.error(f"Section {div_id} not found in index of {assetExternalId}")
                continue
            index_data[key] = LINK_DOSSIER + href + '.html'

        return index_data

    get_substance_index("e4c88c6e-06c7-4daa-b0fb-1a55459ac22f")
    return


@app.cell(hide_code=True)
def _(mo):
    # Render the notebook's remaining-work list as Markdown (the text is in
    # Italian): a search orchestrator with MongoDB caching behind a single
    # search method, validation of stored JSON including its date, extraction
    # of the HTML pages into JSON, tests and documentation for each function.
    mo.md(
        r"""
    # Cosa manca da fare

    1. Creare un nuovo orchestratore per la parte search, caching in mongodb e creare un metodo unico per la ricerca
    2. Metodo per validare i json salvati nel database, verificare la data
    3. Creare i metodi per astrarre gli html in json
    4. Creare i test per ciascuna funzione
    5. Creare la documentazione per ciascuna funzione
    """
    )
    return


# Run the notebook app when this file is executed as a script.
if __name__ == "__main__":
    app.run()