270 lines
8.4 KiB
Python
270 lines
8.4 KiB
Python
import marimo
|
|
|
|
# Version tag stored in the notebook header.
__generated_with = "0.16.5"
# Notebook application object; each cell below registers on it through @app.cell.
app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    # Standard library first, third-party second.
    import json
    import re as standardre
    import urllib.parse

    import marimo as mo
    import requests
    from bs4 import BeautifulSoup

    # json and standardre are imported here but are not part of the returned names.
    return BeautifulSoup, mo, requests, urllib
|
|
|
|
|
|
@app.cell
def _():
    from pif_compiler.services.common_log import get_logger

    # Logger handle exported to the other cells under the name `log`.
    log = get_logger()
    return (log,)
|
|
|
|
|
|
@app.cell
def _(log):
    # Emit one INFO record to check that the logger works from inside the notebook.
    log.info("testing with marimo")
    return
|
|
|
|
|
|
@app.cell
def _():
    # Sample CAS number that the search and cache cells take as input.
    cas_test = "100-41-4"
    return (cas_test,)
|
|
|
|
|
|
@app.cell
def _(cas_test, urllib):
    # Cell output: the URL-quoted form of the sample CAS number.
    urllib.parse.quote(cas_test)
    return
|
|
|
|
|
|
@app.cell
def _():
    # Paged list endpoints: the caller appends the search text / rmlId value.
    BASE_SEARCH = "https://chem.echa.europa.eu/api-substance/v1/substance?pageIndex=1&pageSize=100&searchText="
    BASE_DOSSIER_LIST = "https://chem.echa.europa.eu/api-dossier-list/v1/dossier?pageIndex=1&pageSize=100&rmlId="
    # The dossier cells in this notebook request this endpoint under the name
    # BASE_DOSSIER, which no cell defined; expose it as an alias so they resolve.
    BASE_DOSSIER = BASE_DOSSIER_LIST

    SUBSTANCE_SUMMARY = "https://chem.echa.europa.eu/api-substance/v1/substance/"  # + substance id

    # Fixed sample URLs kept for reference.
    CLASSIFICATION_ID = "https://chem.echa.europa.eu/api-cnl-inventory/prominent/overview/classifications/harmonised/459160"
    # NOTE(review): path appears to be built from the asset external id plus a root key — confirm.
    TOXICOLOGICAL_INFO = "https://chem.echa.europa.eu/html-pages-prod/e4c88c6e-06c7-4daa-b0fb-1a55459ac22f/documents/IUC5-5f55d8ec-7a71-4e2c-9955-8469ead9fe84_0035f3f8-7467-4944-9028-1db2e9c99565.html"
    REPEATED_DOSE = "https://chem.echa.europa.eu/html-pages-prod/e4c88c6e-06c7-4daa-b0fb-1a55459ac22f/documents/IUC5-82402b09-8d8f-495c-b673-95b205be60e0_0035f3f8-7467-4944-9028-1db2e9c99565.html"

    # Query-string fragments appended after the rmlId value. The parameter name
    # must start with a literal "&reg..."; it had been mis-decoded into the
    # registered-trademark sign, which produced an invalid query parameter.
    active = "&registrationStatuses=Active"
    inactive = "&registrationStatuses=Inactive"
    legislation = "&legislation=REACH"
    return BASE_DOSSIER, BASE_SEARCH, active, legislation
|
|
|
|
|
|
@app.cell
def _(BASE_SEARCH, cas_test, requests):
    # Search endpoint with the sample CAS number appended as the search text.
    _search_url = BASE_SEARCH + cas_test
    test_search_request = requests.get(_search_url)
    return (test_search_request,)
|
|
|
|
|
|
@app.cell
def _(test_search_request):
    # Parsed JSON body of the search response, exported for the cells below.
    response = test_search_request.json()
    return (response,)
|
|
|
|
|
|
@app.cell
def _(test_search_request):
    # Cell output: the parsed search response, shown for inspection only.
    test_search_request.json()
    return
|
|
|
|
|
|
@app.cell
def _(cas_test, response):
    # Fields copied from a matching hit's "substanceIndex" entry, in insertion order.
    # "rmlId" used to be assigned twice; it is listed once here.
    _index_fields = ("rmlCas", "rmlId", "rmlEc", "rmlName")

    substance = {}
    for result in response['items']:
        _index = result["substanceIndex"]
        # Keep only hits whose CAS number equals the query exactly; when several
        # hits match, the last one wins because each match overwrites the fields.
        if _index["rmlCas"] == cas_test:
            for _field in _index_fields:
                substance[_field] = _index[_field]
    return (substance,)
|
|
|
|
|
|
@app.cell
def _(substance):
    # Cell output: the substance record collected so far.
    substance
    return
|
|
|
|
|
|
@app.cell
def _(BASE_DOSSIER, active, substance):
    # Dossier-list URL for the substance, restricted to active registrations.
    url = "".join((BASE_DOSSIER, substance['rmlId'], active))
    # Cell output: the assembled URL, for a visual check.
    url
    return
|
|
|
|
|
|
@app.cell
def _(BASE_DOSSIER, active, legislation, requests, substance):
    # Same URL as the preview cell, with the legislation filter appended.
    _dossier_url = "".join((BASE_DOSSIER, substance['rmlId'], active, legislation))
    response_dossier = requests.get(_dossier_url)
    return (response_dossier,)
|
|
|
|
|
|
@app.cell
def _(response_dossier):
    # Parse the dossier-list response once and export it.
    response_dossier_json = response_dossier.json()
    # Cell output: the parsed payload, for inspection.
    response_dossier_json
    return (response_dossier_json,)
|
|
|
|
|
|
@app.cell
def _(response_dossier_json, substance):
    # Only the first dossier of the list is used.
    _first = response_dossier_json['items'][0]

    # Copy the dossier fields onto the substance record in place, keeping the
    # original assignment order.
    for _key in ('lastUpdatedDate', 'registrationStatus', 'registrationStatusChangedDate'):
        substance[_key] = _first[_key]
    # registrationRole is nested one level deeper than the other fields.
    substance['registrationRole'] = _first['reachDossierInfo']['registrationRole']
    for _key in ('assetExternalId', 'rootKey'):
        substance[_key] = _first[_key]

    # Cell output: the enriched record.
    substance
    return
|
|
|
|
|
|
@app.cell
def _():
    from pif_compiler.services.mongo_conn import get_client

    # Client from the project helper, then the "toxinfo" database handle
    # exported to the cells below.
    client = get_client()
    db = client.get_database(name="toxinfo")
    return (db,)
|
|
|
|
|
|
@app.cell
def _(db):
    # Collection handle exported to the caching cell.
    collection = db.get_collection("substance_index")

    # Print the collection names of the database as a sanity check. The
    # value is held in a cell-local (underscore-prefixed) name rather than
    # `list`, which shadowed the builtin.
    _collection_names = db.list_collection_names()
    print(_collection_names)
    return (collection,)
|
|
|
|
|
|
@app.cell
def _(cas_test, collection, substance):
    # Look the substance up by CAS number; insert the record only when the
    # lookup returns nothing.
    _query = {"rmlCas": cas_test}
    sub = collection.find_one(_query)
    if not sub:
        collection.insert_one(substance)
    return
|
|
|
|
|
|
@app.cell
def _(assetExternalId):
    # NOTE(review): no cell visible in this file exports `assetExternalId`;
    # the value is only stored as substance['assetExternalId'] — confirm the intended source.
    # URL of a dossier's index page under the "html-pages" path.
    INDEX_HTML = "".join(
        ("https://chem.echa.europa.eu/html-pages/", assetExternalId, "/index.html")
    )
    return
|
|
|
|
|
|
@app.cell
def _(log, test_search_request):
    def search_substance(cas: str) -> dict:
        """Return the substance index entry whose CAS number equals *cas*.

        No new HTTP request is issued: the lookup filters the response already
        held in the notebook's ``test_search_request``.

        Args:
            cas: CAS number to match exactly against each hit's ``rmlCas``.

        Returns:
            A dict with the keys ``rmlCas``, ``rmlId``, ``rmlEc`` and
            ``rmlName`` for the first matching hit, or an empty dict when the
            HTTP status is not 200, the search returned no items, or no hit
            matches *cas*.
        """
        # The status code lives on the Response object, not on the decoded
        # JSON dict, so check it before parsing the body.
        if test_search_request.status_code != 200:
            log.error(f"Network error: {test_search_request.status_code}")
            return {}

        payload = test_search_request.json()
        if payload['totalItems'] == 0:
            log.info(f"No substance found for CAS {cas}")
            return {}

        for result in payload['items']:
            index = result["substanceIndex"]
            if index["rmlCas"] == cas:
                return {
                    "rmlCas": index["rmlCas"],
                    "rmlId": index["rmlId"],
                    "rmlEc": index["rmlEc"],
                    "rmlName": index["rmlName"],
                }

        # Items were returned but none carries exactly this CAS number.
        log.error("Something went wrong")
        return {}
    return
|
|
|
|
|
|
@app.cell
def _(BASE_DOSSIER, active, legislation, log, requests):
    def get_dossier_info(rmlId: str) -> dict:
        """Fetch the dossier list for *rmlId* and summarise its first entry.

        The request URL is ``BASE_DOSSIER + rmlId + active + legislation``.

        Args:
            rmlId: Identifier appended to the dossier-list URL.

        Returns:
            A dict with ``lastUpdatedDate``, ``registrationStatus``,
            ``registrationStatusChangedDate``, ``registrationRole``,
            ``assetExternalId`` and ``rootKey`` taken from the first item, or
            an empty dict when the HTTP status is not 200 or the list is empty.
        """
        reply = requests.get(BASE_DOSSIER + rmlId + active + legislation)
        if reply.status_code != 200:
            log.error(f"Network error: {reply.status_code}")
            return {}

        payload = reply.json()
        if payload['totalItems'] == 0:
            log.info(f"No dossier found for RML ID {rmlId}")
            return {}

        # Only the first dossier in the list is summarised.
        first = payload['items'][0]
        return {
            "lastUpdatedDate": first['lastUpdatedDate'],
            "registrationStatus": first['registrationStatus'],
            "registrationStatusChangedDate": first['registrationStatusChangedDate'],
            "registrationRole": first['reachDossierInfo']['registrationRole'],
            "assetExternalId": first['assetExternalId'],
            "rootKey": first['rootKey'],
        }
    return
|
|
|
|
|
|
@app.cell
def _(BeautifulSoup, log, requests):
    def get_substance_index(assetExternalId: str) -> dict:
        """Collect links to selected toxicology documents from a dossier index page.

        Downloads ``<assetExternalId>/index.html`` and, for each section of
        interest, takes the ``href`` of the first ``a.das-leaf`` link inside
        the section's ``div``.

        Args:
            assetExternalId: Identifier used as the dossier's path segment.

        Returns:
            A dict mapping ``toxicological_information_link``,
            ``repeated_dose_toxicity_link`` and ``acute_toxicity_link`` to
            document URLs. A section missing from the page is logged and left
            out of the dict. An empty dict is returned when the HTTP status
            is not 200.
        """
        INDEX = "https://chem.echa.europa.eu/html-pages-prod/" + assetExternalId
        LINK_DOSSIER = INDEX + "/documents/"

        response = requests.get(INDEX + "/index.html")
        if response.status_code != 200:
            log.error(f"Network error: {response.status_code}")
            return {}

        soup = BeautifulSoup(response.content, 'html.parser')

        # (result key, id of the section's <div> on the index page)
        sections = (
            ('toxicological_information_link', 'id_7_Toxicologicalinformation'),
            ('repeated_dose_toxicity_link', 'id_75_Repeateddosetoxicity'),
            ('acute_toxicity_link', 'id_72_AcuteToxicity'),
        )

        index_data = {}
        for key, div_id in sections:
            # find() returns None when nothing matches; guard both lookups so
            # that one absent section does not abort the whole extraction.
            section_div = soup.find('div', id=div_id)
            leaf_link = None
            if section_div is not None:
                leaf_link = section_div.find('a', class_='das-leaf')
            href = leaf_link.get('href') if leaf_link is not None else None
            if not href:
                log.error(f"Section {div_id} not found in index of {assetExternalId}")
                continue
            index_data[key] = LINK_DOSSIER + href + '.html'

        return index_data

    # Cell output: links extracted for a sample dossier.
    get_substance_index("e4c88c6e-06c7-4daa-b0fb-1a55459ac22f")
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Remaining-work checklist (Italian), rendered as Markdown; the text is
    # runtime content and is left untranslated.
    _todo_markdown = r"""
    # Cosa manca da fare

    1. Creare un nuovo orchestratore per la parte search, caching in mongodb e creare un metodo unico per la ricerca
    2. Metodo per validare i json salvati nel database, verificare la data
    3. Creare i metodi per astrarre gli html in json
    4. Creare i test per ciascuna funzione
    5. Creare la documentazione per ciascuna funzione
    """
    mo.md(_todo_markdown)
    return
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the notebook app when this file is executed as a script.
    app.run()
|