223 lines
No EOL
9.3 KiB
Python
223 lines
No EOL
9.3 KiB
Python
import requests
|
|
import urllib.parse
|
|
import re as standardre
|
|
import json
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
from pif_compiler.functions.common_log import get_logger
|
|
|
|
# Module-level logger, obtained from the project's common_log helper.
logger = get_logger()
|
|
|
|
# Look up the ECHA dossier for a substance, given its CAS number, its name, or its EC number.
|
|
# Base URL of the ECHA CHEM site; every endpoint used below hangs off it.
_ECHA_BASE_URL = "https://chem.echa.europa.eu"

# Seconds to wait for ECHA before giving up, so a stalled connection cannot
# hang the caller forever.
_REQUEST_TIMEOUT = 30

# Captures the value of the first href="..." attribute in a chunk of markup.
_HREF_PATTERN = standardre.compile(r'href="([^"]+)"')


def _first_href(markup):
    """Return the value of the first ``href="..."`` found in *markup*, or None."""
    match = _HREF_PATTERN.search(markup)
    return match.group(1) if match else None


def _fetch_dossiers(rml_id, status):
    """Return the ``items`` list of the dossier-list API for *rml_id* and *status*."""
    url = (
        f"{_ECHA_BASE_URL}/api-dossier-list/v1/dossier"
        f"?pageIndex=1&pageSize=100&rmlId={rml_id}&registrationStatuses={status}"
    )
    return requests.get(url, timeout=_REQUEST_TIMEOUT).json()["items"]


def _section_document_id(index, div_id, id_label, substance):
    """Return the document id linked from the index section *div_id*, or None.

    Every ``<div>`` with the given id is inspected; the href of its first
    ``<a>`` is taken as the document id. When several divs match, the last
    one that yields an href wins. A div without a usable link is logged
    (using *id_label* and *substance* in the message) and skipped.
    """
    document_id = None
    for section in index.find_all("div", id=div_id):
        anchor = section.find("a", href=True)
        href = _first_href(str(anchor)) if anchor is not None else None
        if href is None:
            logger.info(
                f"echaFind.search_dossier(). No {id_label} found from index for {substance}"
            )
        else:
            document_id = href
    return document_id


def search_dossier(substance, input_type='rmlCas'):
    """Find the ECHA dossier of a substance and collect the links to its pages.

    The lookup runs in steps:

    1. Query the substance search API with *substance* and take the first hit.
    2. Check that the field named by *input_type* on that hit equals
       *substance* (a CAS query can otherwise match a different substance).
    3. Fetch the dossier list for the hit, first with status ``Active`` and,
       if that is empty, with status ``Inactive``; use the first dossier.
    4. Download the dossier's ``index.html`` and scrape the document ids of
       the toxicological summary, acute toxicity and repeated dose sections.

    Args:
        substance: The text to search for (CAS number, name or EC number).
        input_type: Key of the result field that must equal *substance*:
            ``'rmlCas'`` (default), ``'rmlName'``, ``'rmlEc'`` or ``'rmlId'``.

    Returns:
        * ``False`` when the search has no usable hit or no dossier exists.
        * A string starting with ``'search_error.'`` when the first hit does
          not match *substance* on the *input_type* field.
        * Otherwise a dict with ``search_response``, ``rmlId``, ``rmlName``,
          ``rmlCas``, ``rmlEc``, ``dossierType``, ``index``, ``index_js``
          and, when available, ``lastUpdateDate`` plus the
          ``AcuteToxicity``, ``ToxSummary`` and ``RepeatedDose`` links, each
          with a ``*_js`` twin. The plain link is the raw HTML page; the
          ``*_js`` link is the rendered dossier-view page.

    Raises:
        KeyError: If *input_type* is not one of the keys listed above.
        Network and JSON decoding errors raised by ``requests`` are not
        caught here and propagate to the caller.
    """
    search_url = (
        f"{_ECHA_BASE_URL}/api-substance/v1/substance"
        f"?pageIndex=1&pageSize=100&searchText={urllib.parse.quote(substance)}"
    )

    # Step 1: search by the URL-quoted text and keep the first hit.
    search_response = requests.get(search_url, timeout=_REQUEST_TIMEOUT)
    logger.info(f'echaFind.search_dossier(). searching "{substance}"')
    search_json = search_response.json()

    try:
        substance_index = search_json["items"][0]["substanceIndex"]
        rml_id = substance_index["rmlId"]
        rml_name = substance_index["rmlName"]
        rml_cas = substance_index["rmlCas"]
        rml_ec = substance_index["rmlEc"]
    except (KeyError, IndexError, TypeError):
        # No hit, or the payload does not have the expected shape.
        logger.info(
            f"echaFind.search_dossier(). could not find substance for '{substance}'"
        )
        return False

    # The dict returned at the end; key order is kept stable for the log line.
    results = {
        "search_response": search_url,
        "rmlId": rml_id,
        "rmlName": rml_name,
        "rmlCas": rml_cas,
        "rmlEc": rml_ec,
    }
    logger.info(
        f"echaFind.search_dossier(). found substance on ECHA. rmlId: '{rml_id}', rmlName: '{rml_name}', rmlCas: '{rml_cas}'"
    )

    # Step 2: guard against the search returning a different substance than
    # the one asked for (e.g. an EC number that looks like the CAS given).
    found_value = results[input_type]
    if found_value != substance:
        logger.error(
            f'echa.echaFind.search_dossier(): results[{input_type}] "{found_value}" is not equal to "{substance}". '
        )
        return (
            f'search_error. results[{input_type}] ("{found_value}") is not equal to "{substance}". '
            f'Maybe you specified the wrong input_type. Check the results here: {search_url}'
        )

    # Step 3: prefer active dossiers, fall back to inactive ones.
    dossiers = _fetch_dossiers(rml_id, "Active")
    if dossiers:
        logger.info(
            f"echaFind.search_dossier(). found active dossiers for '{substance}'"
        )
        results["dossierType"] = "Active"
    else:
        logger.info(
            f"echaFind.search_dossier(). could not find active dossier for '{substance}'. Proceeding to search in the unactive ones."
        )
        dossiers = _fetch_dossiers(rml_id, "Inactive")
        if not dossiers:
            # Neither active nor inactive dossiers exist.
            logger.info(
                f"echaFind.search_dossier(). could not find unactive dossiers for '{substance}'"
            )
            return False
        logger.info(
            f"echaFind.search_dossier(). found unactive dossiers for '{rml_name}'"
        )
        results["dossierType"] = "Inactive"

    dossier = dossiers[0]
    asset_id = dossier["assetExternalId"]

    # The last-update date lets callers decide whether locally stored files
    # are stale. It is best-effort: a missing or malformed value is logged.
    try:
        # Replace a trailing 'Z' because older Python versions of
        # datetime.fromisoformat() do not accept it.
        last_updated = datetime.fromisoformat(
            dossier["lastUpdatedDate"].replace("Z", "+00:00")
        )
        results["lastUpdateDate"] = last_updated.date().isoformat()
    except (KeyError, AttributeError, TypeError, ValueError):
        logger.error("echa.echaFind(). Could not find lastUpdateDate for the dossier")

    # Step 4: the asset id leads to the dossier's main page.
    html_root = f"{_ECHA_BASE_URL}/html-pages/{asset_id}"
    js_root = f"{_ECHA_BASE_URL}/{rml_id}/dossier-view/{asset_id}"
    results["index"] = f"{html_root}/index.html"
    results["index_js"] = js_root

    index_response = requests.get(results["index"], timeout=_REQUEST_TIMEOUT)
    index = BeautifulSoup(index_response.text, "html.parser")

    # Toxicological summary: only the markup up to the first closing </div>
    # of the matching section is searched, then the first href in it is taken.
    tox_sections = index.find_all("div", id="id_7_Toxicologicalinformation")
    tox_summary_id = _first_href(str(tox_sections).split("</div>")[0])
    if tox_summary_id is None:
        logger.info(
            "echaFind.search_dossier(). Could not find 'id_7_Toxicologicalinformation' in the body"
        )

    acute_toxicity_id = _section_document_id(
        index, "id_72_AcuteToxicity", "acute_toxicity_id", substance
    )
    repeated_dose_id = _section_document_id(
        index, "id_75_Repeateddosetoxicity", "repeated_dose_id", substance
    )

    # Publish a raw-HTML link and a dossier-view link for each section found.
    sections = (
        ("AcuteToxicity", acute_toxicity_id),
        ("ToxSummary", tox_summary_id),
        ("RepeatedDose", repeated_dose_id),
    )
    for key, document_id in sections:
        if document_id is not None:
            results[key] = f"{html_root}/documents/{document_id}.html"
            results[f"{key}_js"] = f"{js_root}/{document_id}"

    logger.info(f"echaFind.search_dossier() OK. output: {json.dumps(results)}")
    return results
|
|
|
|
if __name__ == "__main__":
    # Manual run when the module is executed directly: look up a sample CAS number.
    search_dossier(substance="100-41-4", input_type="rmlCas")