From bba5c11bc93e5d3ae92b13dc3f4fdb0185bb0a49 Mon Sep 17 00:00:00 2001 From: adish-rmr Date: Thu, 26 Feb 2026 16:44:45 +0100 Subject: [PATCH] scadenza, caching e log --- scripts/create_mock_order.py | 246 ---------------------- src/pif_compiler/classes/models.py | 31 ++- src/pif_compiler/functions/common_func.py | 5 +- src/pif_compiler/services/srv_echa.py | 126 ++++------- 4 files changed, 75 insertions(+), 333 deletions(-) delete mode 100644 scripts/create_mock_order.py diff --git a/scripts/create_mock_order.py b/scripts/create_mock_order.py deleted file mode 100644 index 67adaa4..0000000 --- a/scripts/create_mock_order.py +++ /dev/null @@ -1,246 +0,0 @@ -""" -Script per creare un ordine mock con 4 ingredienti per testare la UI. -Inserisce direttamente nei database senza passare dalla pipeline (no scraping). - -Uso: uv run python scripts/create_mock_order.py -""" - -import sys -import os - -# Aggiungi il path del progetto -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) - -from pif_compiler.functions.db_utils import ( - db_connect, upsert_cliente, insert_ordine, aggiorna_stato_ordine, - update_ordine_cliente, upsert_ingrediente -) -from pif_compiler.classes.models import ( - StatoOrdine, Ingredient, DapInfo, CosingInfo, ToxIndicator, Toxicity, Esposition -) -from pif_compiler.classes.main_workflow import Project, ProjectIngredient - - -def ensure_preset_exists(preset_name="Test Preset"): - """Verifica che il preset esista, altrimenti lo crea.""" - preset = Esposition.get_by_name(preset_name) - if preset: - print(f"Preset '{preset_name}' già esistente") - return preset - - print(f"Creazione preset '{preset_name}'...") - preset = Esposition( - preset_name=preset_name, - tipo_prodotto="Crema corpo", - luogo_applicazione="Corpo", - esp_normali=["Dermal"], - esp_secondarie=["Oral"], - esp_nano=[], - sup_esposta=15670, - freq_applicazione=1, - qta_giornaliera=7.82, - ritenzione=1.0 - ) - result = preset.save_to_postgres() - if result: - print(f"Preset creato con id_preset={result}") - else: - print("ERRORE: impossibile creare il preset") - sys.exit(1) - return preset - - -def create_mock_ingredients(): - """Crea ingredienti mock con dati finti di tossicologia e DAP.""" - - # GLYCERIN (56-81-5) — con NOAEL - glycerin = Ingredient( - cas="56-81-5", - inci=["GLYCERIN"], - dap_info=DapInfo( - cas="56-81-5", - molecular_weight=92.09, - log_pow=-1.76, - tpsa=60.69, - melting_point=18.0 - ), - cosing_info=[CosingInfo( - cas=["56-81-5"], - common_names=["Glycerol"], - inci=["GLYCERIN"], - annex=[], - functionName=["Humectant", "Solvent", "Skin conditioning"], - otherRestrictions=[], - cosmeticRestriction=None - )], - toxicity=Toxicity( - cas="56-81-5", - indicators=[ - ToxIndicator( - indicator="NOAEL", value=1000, unit="mg/kg bw/day", - route="oral", toxicity_type="repeated_dose_toxicity", - ref="https://chem.echa.europa.eu/100.003.264" - ), - ToxIndicator( - indicator="LD50", value=12600, unit="mg/kg bw", - route="oral", toxicity_type="acute_toxicity", - ref="https://chem.echa.europa.eu/100.003.264" - ) - ] - ) - ) - - # CETYL ALCOHOL (36653-82-4) — con NOAEL - cetyl = Ingredient( - cas="36653-82-4", - inci=["CETYL ALCOHOL"], - dap_info=DapInfo( - cas="36653-82-4", - molecular_weight=242.44, - log_pow=6.83, - tpsa=20.23, - melting_point=49.0 - ), - cosing_info=[CosingInfo( - cas=["36653-82-4"], - common_names=["Cetyl alcohol", "1-Hexadecanol"], - inci=["CETYL ALCOHOL"], - annex=[], - functionName=["Emollient", "Emulsifying", "Opacifying"], - otherRestrictions=[], - cosmeticRestriction=None - )], - toxicity=Toxicity( - cas="36653-82-4", - indicators=[ - ToxIndicator( - indicator="NOAEL", value=1000, unit="mg/kg bw/day", - route="oral", toxicity_type="repeated_dose_toxicity", - ref="https://chem.echa.europa.eu/100.004.098" - ) - ] - ) - ) - - # TOCOPHEROL (59-02-9) — con LOAEL - tocopherol = Ingredient( - cas="59-02-9", - inci=["TOCOPHEROL"], - dap_info=DapInfo( - cas="59-02-9", - molecular_weight=430.71, - log_pow=10.51, - tpsa=29.46, - melting_point=3.0 - ), - cosing_info=[CosingInfo( - cas=["59-02-9"], - common_names=["alpha-Tocopherol"], - inci=["TOCOPHEROL"], - annex=[], - functionName=["Antioxidant", "Skin conditioning"], - otherRestrictions=[], - cosmeticRestriction=None - )], - toxicity=Toxicity( - cas="59-02-9", - indicators=[ - ToxIndicator( - indicator="LOAEL", value=500, unit="mg/kg bw/day", - route="oral", toxicity_type="repeated_dose_toxicity", - ref="https://chem.echa.europa.eu/100.000.375" - ) - ] - ) - ) - - # Salva ogni ingrediente su MongoDB + PostgreSQL - for ing in [glycerin, cetyl, tocopherol]: - mongo_id = ing.save() - print(f"Ingrediente {ing.cas} ({ing.inci[0]}) salvato (mongo_id={mongo_id})") - - return glycerin, cetyl, tocopherol - - -def create_mock_order(preset, glycerin, cetyl, tocopherol): - """Crea un ordine mock completo.""" - - # 1. Upsert cliente - client_name = "Cosmetica Test Srl" - id_cliente = upsert_cliente(client_name) - print(f"Cliente '{client_name}' → id_cliente={id_cliente}") - - # 2. JSON ordine grezzo - raw_json = { - "client_name": client_name, - "product_name": "Crema Idratante Test", - "preset_esposizione": preset.preset_name, - "ingredients": [ - {"inci": "AQUA", "cas": "", "percentage": 70.0, "is_colorante": False, "skip_tox": True}, - {"inci": "GLYCERIN", "cas": "56-81-5", "percentage": 15.0, "is_colorante": False, "skip_tox": False}, - {"inci": "CETYL ALCOHOL", "cas": "36653-82-4", "percentage": 10.0, "is_colorante": False, "skip_tox": False}, - {"inci": "TOCOPHEROL", "cas": "59-02-9", "percentage": 5.0, "is_colorante": False, "skip_tox": False}, - ] - } - - # 3. Salva su MongoDB orders - orders_col = db_connect(collection_name='orders') - result = orders_col.insert_one(raw_json.copy()) - uuid_ordine = str(result.inserted_id) - print(f"Ordine salvato su MongoDB: uuid_ordine={uuid_ordine}") - - # 4. Inserisci in PostgreSQL ordini - id_ordine = insert_ordine(uuid_ordine, id_cliente) - print(f"Ordine inserito in PostgreSQL: id_ordine={id_ordine}") - - # 5. Aggiorna stato a ARRICCHITO - update_ordine_cliente(id_ordine, id_cliente) - aggiorna_stato_ordine(id_ordine, int(StatoOrdine.ARRICCHITO)) - print(f"Stato ordine aggiornato a ARRICCHITO ({StatoOrdine.ARRICCHITO})") - - # 6. Crea progetto con ingredienti arricchiti - project = Project( - order_id=id_ordine, - product_name="Crema Idratante Test", - client_name=client_name, - esposition=preset, - ingredients=[ - ProjectIngredient(cas=None, inci="AQUA", percentage=70.0, skip_tox=True), - ProjectIngredient(cas="56-81-5", inci="GLYCERIN", percentage=15.0, ingredient=glycerin), - ProjectIngredient(cas="36653-82-4", inci="CETYL ALCOHOL", percentage=10.0, ingredient=cetyl), - ProjectIngredient(cas="59-02-9", inci="TOCOPHEROL", percentage=5.0, ingredient=tocopherol), - ] - ) - - # 7. Salva il progetto (MongoDB + PostgreSQL) - uuid_progetto = project.save() - print(f"Progetto salvato: uuid_progetto={uuid_progetto}") - - print("\n" + "=" * 60) - print("MOCK ORDER CREATO CON SUCCESSO") - print("=" * 60) - print(f" id_ordine: {id_ordine}") - print(f" uuid_ordine: {uuid_ordine}") - print(f" uuid_progetto: {uuid_progetto}") - print(f" cliente: {client_name}") - print(f" prodotto: Crema Idratante Test") - print(f" preset: {preset.preset_name}") - print(f" ingredienti: 4 (AQUA, GLYCERIN, CETYL ALCOHOL, TOCOPHEROL)") - print(f" stato: ARRICCHITO ({StatoOrdine.ARRICCHITO})") - print("=" * 60) - - return id_ordine - - -if __name__ == "__main__": - print("Creazione ordine mock...") - print() - - # 1. Assicura che il preset esista - preset = ensure_preset_exists() - - # 2. Crea ingredienti mock - glycerin, cetyl, tocopherol = create_mock_ingredients() - - # 3. Crea l'ordine - create_mock_order(preset, glycerin, cetyl, tocopherol) diff --git a/src/pif_compiler/classes/models.py b/src/pif_compiler/classes/models.py index 4fdc5d9..17a44bd 100644 --- a/src/pif_compiler/classes/models.py +++ b/src/pif_compiler/classes/models.py @@ -127,6 +127,7 @@ class CosingInfo(BaseModel): otherRestrictions : List[str] = Field(default_factory=list) cosmeticRestriction : Optional[str] = None reference : Optional[str] = None + substanceId : Optional[str] = None sccsOpinionUrls : List[str] = Field(default_factory=list) @classmethod @@ -140,6 +141,7 @@ class CosingInfo(BaseModel): 'otherRestrictions', 'cosmeticRestriction', 'reference', + 'substanceId', 'inciName', 'sccsOpinionUrls' ] @@ -185,6 +187,8 @@ class CosingInfo(BaseModel): cosing_dict['cosmeticRestriction'] = cosing_data[k] if k == 'reference': cosing_dict['reference'] = cosing_data[k] + if k == 'substanceId': + cosing_dict['substanceId'] = cosing_data[k] if k == 'sccsOpinionUrls': urls = [] for url in cosing_data[k]: @@ -213,6 +217,7 @@ class ToxIndicator(BaseModel): toxicity_type : Optional[str] = None ref : Optional[str] = None source : Optional[str] = None + is_custom : bool = False @property def priority_rank(self): @@ -392,7 +397,10 @@ class Ingredient(BaseModel): @classmethod def get_or_create(cls, cas: str, inci: Optional[List[str]] = None, force: bool = False): """Restituisce l'ingrediente dalla cache se esiste e non è vecchio, altrimenti lo ricrea. - Se force=True, ignora la cache e riesegue lo scraping aggiornando il documento.""" + Se force=True, ignora la cache e riesegue lo scraping aggiornando il documento. + Al re-scraping, i campi che risultano None vengono sostituiti con il valore cached + per evitare regressioni di dati in caso di fallimenti temporanei delle fonti esterne.""" + cached = None if not force: cached = cls.from_cas(cas) if cached and not cached.is_old(): @@ -405,6 +413,26 @@ class Ingredient(BaseModel): logger.info(f"get_or_create CAS={cas}: force refresh") ingredient = cls.ingredient_builder(cas, inci=inci) + + if cached: + if ingredient.dap_info is None and cached.dap_info is not None: + logger.warning(f"get_or_create CAS={cas}: dap_info non ottenuto, mantengo dati cached") + ingredient.dap_info = cached.dap_info + if ingredient.cosing_info is None and cached.cosing_info is not None: + logger.warning(f"get_or_create CAS={cas}: cosing_info non ottenuto, mantengo dati cached") + ingredient.cosing_info = cached.cosing_info + if ingredient.toxicity is None and cached.toxicity is not None: + logger.warning(f"get_or_create CAS={cas}: toxicity non ottenuta, mantengo dati cached") + ingredient.toxicity = cached.toxicity + elif ingredient.toxicity is not None and cached.toxicity is not None: + custom_indicators = [i for i in cached.toxicity.indicators if i.is_custom] + if custom_indicators: + logger.info(f"get_or_create CAS={cas}: preservo {len(custom_indicators)} indicatori custom nel re-scraping") + ingredient.toxicity = Toxicity( + cas=ingredient.toxicity.cas, + indicators=ingredient.toxicity.indicators + custom_indicators + ) + ingredient.save() return ingredient @@ -452,6 +480,7 @@ class Ingredient(BaseModel): def add_tox_indicator(self, indicator: ToxIndicator): """Aggiunge un indicatore tossicologico custom e ricalcola il best_case.""" + indicator.is_custom = True if self.toxicity is None: self.toxicity = Toxicity(cas=self.cas, indicators=[indicator]) else: diff --git a/src/pif_compiler/functions/common_func.py b/src/pif_compiler/functions/common_func.py index 7627949..333dac4 100644 --- a/src/pif_compiler/functions/common_func.py +++ b/src/pif_compiler/functions/common_func.py @@ -120,7 +120,8 @@ async def generate_project_source_pdfs(project, output_dir: str = "pdfs") -> lis # --- Tox best_case PDF --- best = ing.toxicity.best_case if ing.toxicity else None if best and best.ref: - pdf_name = f"{pi.cas}_{best.source}" if best.source else pi.cas + source_label = best.source or best.toxicity_type or "tox" + pdf_name = f"{pi.cas}_{source_label}" log.info(f"Generazione PDF tox: {pdf_name} da {best.ref}") success = await generate_pdf(best.ref, pdf_name) if success: @@ -143,7 +144,7 @@ async def generate_project_source_pdfs(project, output_dir: str = "pdfs") -> lis generated.append(pdf_path) continue - log.info(f"Download COSING PDF: {pdf_name} (ref={cosing.reference})") + log.info(f"Download COSING PDF: {pdf_name} (reference={cosing.reference})") content = cosing_download(cosing.reference) if isinstance(content, bytes): with open(pdf_path, 'wb') as f: diff --git a/src/pif_compiler/services/srv_echa.py b/src/pif_compiler/services/srv_echa.py index 9262956..2aa0039 100644 --- a/src/pif_compiler/services/srv_echa.py +++ b/src/pif_compiler/services/srv_echa.py @@ -9,7 +9,7 @@ from playwright.sync_api import sync_playwright from typing import Callable, Any from pif_compiler.functions.common_log import get_logger -from pif_compiler.functions.db_utils import db_connect, log_ricerche +from pif_compiler.functions.db_utils import log_ricerche log = get_logger() load_dotenv() @@ -30,12 +30,12 @@ legislation = "&legislation=REACH" def search_substance(cas : str) -> dict: response = requests.get(BASE_SEARCH + cas) if response.status_code != 200: - log.error(f"Network error: {response.status_code}") + log.error(f"search_substance CAS={cas}: HTTP {response.status_code}") return {} else: response = response.json() if response['state']['totalItems'] == 0: - log.info(f"No substance found for CAS {cas}") + log.warning(f"search_substance CAS={cas}: nessuna sostanza trovata su ECHA") return {} else: for result in response['items']: @@ -47,9 +47,9 @@ def search_substance(cas : str) -> dict: "rmlName": result["substanceIndex"]["rmlName"], "rmlId": result["substanceIndex"]["rmlId"] } - log.info(f"Substance found for CAS {cas}: {substance['rmlName']}") + log.debug(f"search_substance CAS={cas}: trovata '{substance['rmlName']}'") return substance - log.error(f"Something went wrong searching the substance for CAS {cas}") + log.warning(f"search_substance CAS={cas}: {response['state']['totalItems']} risultati ma nessun match esatto sul CAS") return {} @@ -57,14 +57,16 @@ def get_dossier_info(rmlId: str, type = active) -> dict: url = BASE_DOSSIER + rmlId + type + legislation response_dossier = requests.get(url) if response_dossier.status_code != 200: - log.error(f"Network error: {response_dossier.status_code}") + log.error(f"get_dossier_info rmlId={rmlId}: HTTP {response_dossier.status_code}") return {} response_dossier_json = response_dossier.json() if response_dossier_json['state']['totalItems'] == 0: - log.info(f"No dossier found for RML ID {rmlId}") if type == active: + log.debug(f"get_dossier_info rmlId={rmlId}: nessun dossier attivo, provo inattivi") return get_dossier_info(rmlId, inactive) + log.warning(f"get_dossier_info rmlId={rmlId}: nessun dossier trovato (né attivo né inattivo)") return {} + dossier_info = {} for dossier in response_dossier_json['items']: if dossier['reachDossierInfo']['dossierSubtype'] == "Article 10 - full" and dossier['reachDossierInfo']['registrationRole'] == "Lead (joint submission)": dossier_info = { @@ -75,7 +77,8 @@ def get_dossier_info(rmlId: str, type = active) -> dict: "assetExternalId": dossier['assetExternalId'], "rootKey": dossier['rootKey'] } - log.info(f"Dossier info retrieved for RML ID {rmlId}") + if not dossier_info: + log.warning(f"get_dossier_info rmlId={rmlId}: nessun dossier 'Article 10 - full / Lead' tra i {response_dossier_json['state']['totalItems']} trovati") return dossier_info @@ -85,7 +88,7 @@ def get_substance_index(assetExternalId : str) -> dict: response = requests.get(INDEX + "/index.html") if response.status_code != 200: - log.error(f"Network error: {response.status_code}") + log.error(f"get_substance_index {assetExternalId}: HTTP {response.status_code}") return {} soup = BeautifulSoup(response.content, 'html.parser') @@ -98,9 +101,9 @@ def get_substance_index(assetExternalId : str) -> dict: txi_href = txi_link['href'] index_data['toxicological_information_link'] = LINK_DOSSIER + txi_href + '.html' except Exception as e: - log.error(f"Error retrieving toxicological information link: {e}") + log.warning(f"get_substance_index: link tossicologia non trovato — {e}") index_data['toxicological_information_link'] = None - + # Repeated dose toxicity : rdt try: rdt_div = soup.find('div', id='id_75_Repeateddosetoxicity') @@ -108,7 +111,7 @@ def get_substance_index(assetExternalId : str) -> dict: rdt_href = rdt_link['href'] index_data['repeated_dose_toxicity_link'] = LINK_DOSSIER + rdt_href + '.html' except Exception as e: - log.error(f"Error retrieving repeated dose toxicity link: {e}") + log.warning(f"get_substance_index: link repeated dose non trovato — {e}") index_data['repeated_dose_toxicity_link'] = None # Acute toxicity : at @@ -118,11 +121,9 @@ def get_substance_index(assetExternalId : str) -> dict: at_href = at_link['href'] index_data['acute_toxicity_link'] = LINK_DOSSIER + at_href + '.html' except Exception as e: - log.error(f"Error retrieving acute toxicity link: {e}") + log.warning(f"get_substance_index: link acute toxicity non trovato — {e}") index_data['acute_toxicity_link'] = None - log.info(f"Substance index retrieved for Asset External ID {assetExternalId}") - return index_data @@ -429,8 +430,8 @@ def echa_flow(cas) -> dict: substance = search_substance(cas) dossier_info = get_dossier_info(substance['rmlId']) index = get_substance_index(dossier_info['assetExternalId']) - except Exception as e: - log.error(f"Error in ECHA flow for CAS {cas}: {e}") + except KeyError as e: + log.error(f"echa_flow CAS={cas}: chiave mancante nella risposta ECHA — {e}") return {} result = { @@ -442,14 +443,14 @@ def echa_flow(cas) -> dict: "repeated_dose_toxicity": {} } - log.debug(f"ECHA flow intermediate result") - # Fetch and parse toxicological information txi_link = index.get('toxicological_information_link') if txi_link: response_summary = requests.get(txi_link) if response_summary.status_code == 200: result['toxicological_information'] = parse_toxicology_html(response_summary.content) + else: + log.warning(f"echa_flow CAS={cas}: tossicologia HTTP {response_summary.status_code}") # Fetch and parse acute toxicity at_link = index.get('acute_toxicity_link') @@ -457,6 +458,8 @@ def echa_flow(cas) -> dict: response_acute = requests.get(at_link) if response_acute.status_code == 200: result['acute_toxicity'] = parse_toxicology_html(response_acute.content) + else: + log.warning(f"echa_flow CAS={cas}: acute toxicity HTTP {response_acute.status_code}") # Fetch and parse repeated dose toxicity rdt_link = index.get('repeated_dose_toxicity_link') @@ -464,86 +467,41 @@ def echa_flow(cas) -> dict: response_repeated = requests.get(rdt_link) if response_repeated.status_code == 200: result['repeated_dose_toxicity'] = parse_toxicology_html(response_repeated.content) - - for key, value in result.items(): - if value is None or value == "" or value == [] or value == {}: - log.warning(f"Missing data for key: {key} in CAS {cas}") else: - log.info(f"Data retrieved for key: {key} in CAS {cas}") + log.warning(f"echa_flow CAS={cas}: repeated dose HTTP {response_repeated.status_code}") + + txi_ok = bool(result['toxicological_information']) + at_ok = bool(result['acute_toxicity']) + rdt_ok = bool(result['repeated_dose_toxicity']) + log.info(f"echa_flow CAS={cas}: txi={'OK' if txi_ok else '-'}, acute={'OK' if at_ok else '-'}, rdt={'OK' if rdt_ok else '-'}") + return result def cas_validation(cas: str) -> str: - log.info(f"Starting ECHA data extraction for CAS: {cas}") if cas is None or cas.strip() == "": - log.error("No CAS number provided.") + log.error("cas_validation: CAS vuoto o None") return None - cas_stripped = cas.replace("-", "") if cas_stripped.isdigit() and len(cas_stripped) <= 12: - log.info(f"CAS number {cas} maybe is valid.") return cas.strip() - else: - log.error(f"CAS number {cas} is not valid.") - return None + log.error(f"cas_validation: CAS '{cas}' non valido (formato non riconosciuto)") + return None -def check_local(cas: str) -> bool: - collection = db_connect() - - if collection is None: - log.error("No MongoDB collection available.") - return None - - record = collection.find_one({"substance.rmlCas": cas}) - - if record: - log.info(f"Record for CAS {cas} found in local database.") - return record - else: - log.info(f"No record for CAS {cas} found in local database.") - return None - -def add_to_local(data: dict) -> bool: - collection = db_connect() - - if collection is None: - log.error("No MongoDB collection available.") - return False - - try: - collection.insert_one(data) - log.info(f"Data for CAS {data['substance']['rmlCas']} added to local database.") - return True - except Exception as e: - log.error(f"Error inserting data into MongoDB: {e}") - return False - def orchestrator(cas: str) -> dict: - log.debug(f"Initiating search for CAS {cas} in ECHA service.") + log.debug(f"ECHA orchestrator CAS={cas}") cas_validated = cas_validation(cas) if not cas_validated: return None - else: - log.info(f"CAS {cas} validated successfully.") - local_record = check_local(cas_validated) - if local_record: - log.info(f"Returning local record for CAS {cas}.") - log_ricerche(cas, 'ECHA', True) - return local_record - else: - log.info(f"No local record, starting echa flow") - echa_data = echa_flow(cas_validated) - if echa_data: - log.info(f"Echa flow successful") - log_ricerche(cas, 'ECHA', True) - add_to_local(echa_data) - return echa_data - else: - log.error(f"Failed to retrieve ECHA data for CAS {cas}.") - log_ricerche(cas, 'ECHA', False) - return None -# to do: check if document is complete -# to do: check lastupdate + echa_data = echa_flow(cas_validated) + if echa_data: + log.info(f"ECHA CAS={cas}: completato") + log_ricerche(cas, 'ECHA', True) + return echa_data + else: + log.error(f"ECHA CAS={cas}: nessun dato recuperato") + log_ricerche(cas, 'ECHA', False) + return None #endregion if __name__ == "__main__":