cosmoguard-bd/old/_old/echaFind.py
2025-11-15 16:02:37 +01:00

245 lines
9.6 KiB
Python

import requests
import urllib.parse
import re as standardre
import logging
import json
from bs4 import BeautifulSoup
# Logging setup: records at INFO level and above are appended to "echa.log"
# (UTF-8), formatted with "{"-style placeholders and minute-resolution
# timestamps.
logging.basicConfig(
    # Destination
    filename="echa.log",
    filemode="a",
    encoding="utf-8",
    # Threshold
    level=logging.INFO,
    # Record layout
    style="{",
    format="{asctime} - {levelname} - {message}",
    datefmt="%Y-%m-%d %H:%M",
)
# Marked as a "useless function" by the original author; kept for compatibility.
def getCas(substance, timeout=None):
    """Return the identifiers of the first ECHA substance-search hit.

    Args:
        substance: Search text; it is URL-quoted before being sent.
        timeout: Forwarded to ``requests.get``. ``None`` (the default) keeps
            the previous behaviour of waiting without a time limit.

    Returns:
        A dict with the keys ``"rmlId"``, ``"rmlName"`` and ``"rmlCas"`` taken
        from the first item of the response, or ``False`` when the response
        has no usable first item.
    """
    response = requests.get(
        "https://chem.echa.europa.eu/api-substance/v1/substance?pageIndex=1&pageSize=100&searchText="
        + urllib.parse.quote(substance),
        timeout=timeout,
    )
    payload = response.json()
    try:
        substance_index = payload["items"][0]["substanceIndex"]
        return {
            key: substance_index[key] for key in ("rmlId", "rmlName", "rmlCas")
        }
    except (KeyError, IndexError, TypeError):
        # Empty result list, missing field, or a null where a container was
        # expected: report "not found" instead of masking unrelated errors.
        return False
def _first_href(markup):
    """Return the value of the first ``href="..."`` found in *markup*, or None."""
    match = standardre.search(r'href="([^"]+)"', markup)
    return match.group(1) if match else None


def _fetch_dossiers(rml_id, status, timeout):
    """Return the "items" list of the dossier-list API for one registration status."""
    response = requests.get(
        "https://chem.echa.europa.eu/api-dossier-list/v1/dossier?pageIndex=1&pageSize=100&rmlId="
        f"{rml_id}&registrationStatuses={status}",
        timeout=timeout,
    )
    return response.json()["items"]


def _section_document_id(index, div_id, label, substance):
    """Return the document id linked from the index divs with id *div_id*, or None.

    The first anchor of every matching div is inspected; when several divs
    yield a link, the last one wins. A div without a usable link is logged.
    """
    document_id = None
    for section in index.find_all("div", id=[div_id]):
        anchors = section.find_all("a", href=True)
        # The id is read from the serialized anchor, not from the attribute.
        href = _first_href(str(anchors[0])) if anchors else None
        if href is None:
            logging.info(
                f"echaFind.search_dossier(). No {label} found from index for {substance}"
            )
        else:
            document_id = href
    return document_id


def _document_links(rml_id, asset_id, document_id):
    """Return the (static HTML, JS dossier-view) URLs of one dossier document."""
    return (
        f"https://chem.echa.europa.eu/html-pages/{asset_id}/documents/{document_id}.html",
        f"https://chem.echa.europa.eu/{rml_id}/dossier-view/{asset_id}/{document_id}",
    )


def search_dossier(substance, input_type='rmlCas', timeout=None):
    """Search ECHA for a substance and collect links to its registration dossier.

    The search text may be a CAS number, a substance name or an EC number;
    ``input_type`` names the field of the first search hit that must be equal
    to ``substance`` for the hit to be accepted.

    Args:
        substance: Text to search for (URL-quoted before being sent).
        input_type: Key of the result field compared with ``substance``:
            ``"rmlCas"`` (default), ``"rmlName"``, ``"rmlEc"`` or ``"rmlId"``.
        timeout: Forwarded to every ``requests.get`` call. ``None`` (the
            default) keeps the previous behaviour of waiting without a limit.

    Returns:
        * ``False`` when the search has no usable first hit, or when neither
          active nor inactive dossiers are listed for it.
        * A string starting with ``"search_error."`` when the first hit's
          ``input_type`` field differs from ``substance``.
        * Otherwise a dict with ``search_response``, ``rmlId``, ``rmlName``,
          ``rmlCas``, ``rmlEc``, ``dossierType``, ``index`` and ``index_js``,
          plus ``lastUpdateDate`` when the dossier carries a parseable date,
          and the pairs ``AcuteToxicity``/``AcuteToxicity_js``,
          ``ToxSummary``/``ToxSummary_js`` and
          ``RepeatedDose``/``RepeatedDose_js`` for each section that is
          linked from the dossier index page.

    Raises:
        KeyError: If ``input_type`` is not one of the stored result keys.
    """
    # Imported here because the module's import block does not provide it.
    from datetime import datetime

    # Part 1: search by text and read the identifiers of the first hit.
    search_url = (
        "https://chem.echa.europa.eu/api-substance/v1/substance?pageIndex=1&pageSize=100&searchText="
        + urllib.parse.quote(substance)
    )
    search_response = requests.get(search_url, timeout=timeout)
    logging.info(f'echaFind.search_dossier(). searching "{substance}"')
    search_json = search_response.json()
    try:
        substance_index = search_json["items"][0]["substanceIndex"]
        rmlId = substance_index["rmlId"]
        rmlName = substance_index["rmlName"]
        rmlCas = substance_index["rmlCas"]
        rmlEc = substance_index["rmlEc"]
    except (KeyError, IndexError, TypeError):
        logging.info(
            f"echaFind.search_dossier(). could not find substance for '{substance}'"
        )
        return False
    # The dict that is returned at the end.
    results = {
        "search_response": search_url,
        "rmlId": rmlId,
        "rmlName": rmlName,
        "rmlCas": rmlCas,
        "rmlEc": rmlEc,
    }
    logging.info(
        f"echaFind.search_dossier(). found substance on ECHA. rmlId: '{rmlId}', rmlName: '{rmlName}', rmlCas: '{rmlCas}'"
    )

    # A CAS given as input could match a substance whose EC number equals it,
    # so the chosen field of the hit must be exactly the searched text.
    found_value = results[input_type]
    if found_value != substance:
        logging.error(
            f'echa.echaFind.search_dossier(): results[{input_type}] "{found_value}" is not equal to "{substance}". '
        )
        return f'search_error. results[{input_type}] ("{found_value}") is not equal to "{substance}". Maybe you specified the wrong input_type. Check the results here: {search_url}'

    # Part 2: list the dossiers for that id, active ones first, then inactive.
    dossiers = _fetch_dossiers(rmlId, "Active", timeout)
    if dossiers:
        logging.info(
            f"echaFind.search_dossier(). found active dossiers for '{substance}'"
        )
        results["dossierType"] = "Active"
    else:
        logging.info(
            f"echaFind.search_dossier(). could not find active dossier for '{substance}'. Proceeding to search in the unactive ones."
        )
        dossiers = _fetch_dossiers(rmlId, "Inactive", timeout)
        if not dossiers:
            # Neither active nor inactive dossiers were found.
            logging.info(
                f"echaFind.search_dossier(). could not find unactive dossiers for '{substance}'"
            )
            return False
        logging.info(
            f"echaFind.search_dossier(). found unactive dossiers for '{rmlName}'"
        )
        results["dossierType"] = "Inactive"

    dossier = dossiers[0]
    assetExternalId = dossier["assetExternalId"]

    # Date of the last modification (best effort: a missing or malformed
    # value is logged and the key is simply left out).
    try:
        # 'Z' is rewritten because older Python versions do not accept it.
        last_updated = datetime.fromisoformat(
            dossier["lastUpdatedDate"].replace('Z', '+00:00')
        )
        results['lastUpdateDate'] = last_updated.date().isoformat()
    except (KeyError, TypeError, ValueError, AttributeError):
        logging.error("echa.echaFind(). Could not find lastUpdateDate for the dossier")

    # Part 3: the assetExternalId leads to the dossier's main (index) page.
    index_url = f"https://chem.echa.europa.eu/html-pages/{assetExternalId}/index.html"
    results["index"] = index_url
    results["index_js"] = (
        f"https://chem.echa.europa.eu/{rmlId}/dossier-view/{assetExternalId}"
    )
    index_page = requests.get(index_url, timeout=timeout)
    index = BeautifulSoup(index_page.text, "html.parser")

    # Part 4: read the document ids out of the index page. For the
    # toxicological summary only the text up to the first closing </div> of
    # the matching divs is searched, combining BeautifulSoup with a regex
    # (the original author notes BeautifulSoup struggles with deeply nested divs).
    tox_markup = str(index.find_all("div", id=["id_7_Toxicologicalinformation"]))
    UIC = _first_href(tox_markup.split("</div>")[0])
    if UIC is None:
        logging.info(
            "echaFind.search_dossier(). Could not find 'id_7_Toxicologicalinformation' in the body"
        )
    acute_toxicity_id = _section_document_id(
        index, "id_72_AcuteToxicity", "acute_toxicity_id", substance
    )
    repeated_dose_id = _section_document_id(
        index, "id_75_Repeateddosetoxicity", "repeated_dose_id", substance
    )

    # Part 5: build the links for every section that was found.
    if acute_toxicity_id is not None:
        results["AcuteToxicity"], results["AcuteToxicity_js"] = _document_links(
            rmlId, assetExternalId, acute_toxicity_id
        )
    if UIC is not None:
        # UIC is the id of the toxicological summary document.
        results["ToxSummary"], results["ToxSummary_js"] = _document_links(
            rmlId, assetExternalId, UIC
        )
    if repeated_dose_id is not None:
        results["RepeatedDose"], results["RepeatedDose_js"] = _document_links(
            rmlId, assetExternalId, repeated_dose_id
        )

    json_formatted_str = json.dumps(results)
    logging.info(f"echaFind.search_dossier() OK. output: {json_formatted_str}")
    return results