This commit is contained in:
adish-rmr 2025-11-11 22:12:25 +01:00
parent fcc8123966
commit 4cabf6fa11
4 changed files with 44 additions and 52 deletions

0
README.md Normal file
View file

View file

@ -14,30 +14,6 @@ Modules:
"""
# ECHA Services
from pif_compiler.services.echa_find import (
search_dossier,
)
from pif_compiler.services.echa_process import (
echaExtract,
echaExtract_multi,
echaExtract_specific,
echaExtract_local,
echa_noael_ld50,
echa_noael_ld50_multi,
echaPage_to_md,
openEchaPage,
markdown_to_json_raw,
clean_json,
json_to_dataframe,
filter_dataframe_by_dict,
)
from pif_compiler.services.echa_pdf import (
generate_pdf_with_header_and_cleanup,
search_generate_pdfs,
svg_to_data_uri,
)
# COSING Service
from pif_compiler.services.cosing_service import (
@ -62,25 +38,6 @@ from pif_compiler.services.db_utils import get_client
__all__ = [
# ECHA Find
"search_dossier",
# ECHA Process
"echaExtract",
"echaExtract_multi",
"echaExtract_specific",
"echaExtract_local",
"echa_noael_ld50",
"echa_noael_ld50_multi",
"echaPage_to_md",
"openEchaPage",
"markdown_to_json_raw",
"clean_json",
"json_to_dataframe",
"filter_dataframe_by_dict",
# ECHA PDF
"generate_pdf_with_header_and_cleanup",
"search_generate_pdfs",
"svg_to_data_uri",
# COSING Service
"cosing_search",
"clean_cosing",

View file

@ -1,4 +1,5 @@
import os
from urllib.parse import quote_plus
from dotenv import load_dotenv
from pymongo import MongoClient
@ -15,6 +16,9 @@ def get_client():
MONGO_HOST = os.getenv("MONGO_HOST")
MONGO_PORT = os.getenv("MONGO_PORT")
MONGO_PORT = MONGO_PORT
ADMIN_PASSWORD = quote_plus(ADMIN_PASSWORD)
client = MongoClient(
f"mongodb://{ADMIN_USER}:{ADMIN_PASSWORD}@{MONGO_HOST}:{MONGO_PORT}/?authSource=admin",
serverSelectionTimeoutMS=5000
@ -34,4 +38,11 @@ def db_connect(db_name : str = 'toxinfo', collection_name : str = 'substance_ind
logger.error(f"Error connecting to MongoDB: {e}")
return None
return client, db, collection
return collection
if __name__ == "__main__":
    # Manual smoke test: connect using db_connect()'s default arguments and
    # log whether a collection handle came back.
    coll = db_connect()
    # db_connect() returns None on failure. PyMongo Collection objects do not
    # support truth-value testing (bool(collection) raises
    # NotImplementedError), so compare against None explicitly.
    if coll is not None:
        logger.info("Database connection successful.")
    else:
        logger.error("Database connection failed.")

View file

@ -46,8 +46,9 @@ def search_substance(cas : str) -> dict:
"rmlName": result["substanceIndex"]["rmlName"],
"rmlId": result["substanceIndex"]["rmlId"]
}
log.info(f"Substance found for CAS {cas}: {substance['rmlName']}")
return substance
log.error(f"Something went wrong")
log.error(f"Something went wrong searching the substance for CAS {cas}")
return {}
@ -69,6 +70,7 @@ def get_dossier_info(rmlId: str) -> dict:
"assetExternalId": response_dossier_json['items'][0]['assetExternalId'],
"rootKey": response_dossier_json['items'][0]['rootKey']
}
log.info(f"Dossier info retrieved for RML ID {rmlId}")
return dossier_info
@ -105,6 +107,8 @@ def get_substance_index(assetExternalId : str) -> dict:
at_href = at_link['href']
index_data['acute_toxicity_link'] = LINK_DOSSIER + at_href + '.html'
log.info(f"Substance index retrieved for Asset External ID {assetExternalId}")
return index_data
#endregion
@ -303,6 +307,12 @@ def generate_pdf_from_toxicology_info(index: dict):
page.goto(index['toxicological_information_link'])
page.pdf(path=f'pdfs/{index["substance"]["rmlCas"]}.pdf')
browser.close()
if os.path.exists(f'pdfs/{index["substance"]["rmlCas"]}.pdf'):
log.info(f"PDF generated for CAS {index['substance']['rmlCas']}")
return True
else:
log.error(f"PDF generation failed for CAS {index['substance']['rmlCas']}")
return False
#endregion
@ -326,6 +336,8 @@ def echa_flow(cas) -> dict:
"repeated_dose_toxicity": {}
}
log.debug(f"ECHA flow intermediate result")
# Fetch and parse toxicological information
txi_link = index.get('toxicological_information_link')
if txi_link:
@ -349,7 +361,9 @@ def echa_flow(cas) -> dict:
for key, value in result.items():
if value is None or value == "" or value == [] or value == {}:
return False
log.warning(f"Missing data for key: {key} in CAS {cas}")
else:
log.info(f"Data retrieved for key: {key} in CAS {cas}")
return result
def cas_validation(cas: str) -> str:
@ -367,9 +381,9 @@ def cas_validation(cas: str) -> str:
return None
def check_local(cas: str) -> bool:
client, db, collection = db_connect()
collection = db_connect()
if not collection:
if collection is None:
log.error("No MongoDB collection available.")
return None
@ -383,9 +397,9 @@ def check_local(cas: str) -> bool:
return None
def add_to_local(data: dict) -> bool:
client, db, collection = db_connect()
collection = db_connect()
if not collection:
if collection is None:
log.error("No MongoDB collection available.")
return False
@ -397,17 +411,22 @@ def add_to_local(data: dict) -> bool:
log.error(f"Error inserting data into MongoDB: {e}")
return False
def search_substance(cas: str) -> dict:
def orchestrator(cas: str) -> dict:
log.debug(f"Initiating search for CAS {cas} in ECHA service.")
cas_validated = cas_validation(cas)
if not cas_validated:
return None
else:
log.info(f"CAS {cas} validated successfully.")
local_record = check_local(cas_validated)
if local_record:
log.info(f"Returning local record for CAS {cas}.")
return local_record
else:
log.info(f"No local record, starting echa flow")
echa_data = echa_flow(cas_validated)
if echa_data:
log.info(f"Echa flow successful")
add_to_local(echa_data)
return echa_data
else:
@ -417,3 +436,8 @@ def search_substance(cas: str) -> dict:
# TODO: check if document is complete
# TODO: check lastupdate
#endregion
if __name__ == "__main__":
    # Ad-hoc manual run: send one sample CAS number through orchestrator()
    # and print whatever it returns.
    sample_cas = "50-00-0"
    print(orchestrator(sample_cas))