diff --git a/src/pif_compiler/api/routes/cosing.py b/src/pif_compiler/api/routes/api_cosing.py similarity index 100% rename from src/pif_compiler/api/routes/cosing.py rename to src/pif_compiler/api/routes/api_cosing.py diff --git a/src/pif_compiler/api/routes/common.py b/src/pif_compiler/api/routes/common.py new file mode 100644 index 0000000..b3b100f --- /dev/null +++ b/src/pif_compiler/api/routes/common.py @@ -0,0 +1,222 @@ +from fastapi import APIRouter, HTTPException, status +from fastapi.responses import FileResponse +from pydantic import BaseModel, Field, HttpUrl +from typing import Optional, Dict, Any +import os + +from pif_compiler.functions.common_func import generate_pdf +from pif_compiler.services.srv_pubchem import pubchem_dap +from pif_compiler.functions.common_log import get_logger + +logger = get_logger() + +router = APIRouter() + + +class GeneratePdfRequest(BaseModel): + link: str = Field(..., description="URL of the page to convert to PDF") + name: str = Field(..., description="Name for the generated PDF file (without extension)") + + class Config: + json_schema_extra = { + "example": { + "link": "https://example.com/page", + "name": "my_document" + } + } + + +class GeneratePdfResponse(BaseModel): + success: bool + name: str + message: str + file_path: Optional[str] = None + + +@router.post("/common/generate-pdf", response_model=GeneratePdfResponse, tags=["Common"]) +async def generate_pdf_endpoint(request: GeneratePdfRequest): + """ + Generate a PDF from a web page URL. + + This endpoint uses Playwright to: + 1. Navigate to the provided URL + 2. Render the page + 3. Generate a PDF file + 4. Save it in the 'pdfs/' directory + + If a PDF with the same name already exists, it will skip generation + and return success immediately. + + Args: + request: GeneratePdfRequest with the URL and desired PDF name + + Returns: + GeneratePdfResponse with success status and file information + """ + logger.info(f"API request received to generate PDF: name='{request.name}', link='{request.link}'") + + try: + result = generate_pdf(request.link, request.name) + + if result: + file_path = f"pdfs/{request.name}.pdf" + + # Check if file was already existing or newly created + if os.path.exists(file_path): + logger.info(f"PDF available for '{request.name}'") + return GeneratePdfResponse( + success=True, + name=request.name, + message=f"PDF generated successfully or already exists", + file_path=file_path + ) + else: + logger.error(f"PDF file not found after generation for '{request.name}'") + return GeneratePdfResponse( + success=False, + name=request.name, + message="PDF generation completed but file not found", + file_path=None + ) + else: + logger.error(f"PDF generation failed for '{request.name}'") + return GeneratePdfResponse( + success=False, + name=request.name, + message="PDF generation failed", + file_path=None + ) + + except Exception as e: + logger.error(f"Error generating PDF for '{request.name}': {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Internal error while generating PDF: {str(e)}" + ) + + +@router.get("/common/download-pdf/{name}", response_class=FileResponse, tags=["Common"]) +async def download_pdf(name: str): + """ + Download a previously generated PDF file. + + Args: + name: Name of the PDF file (without extension) + + Returns: + FileResponse with the PDF file for download + """ + logger.info(f"API request received to download PDF: name='{name}'") + + file_path = f"pdfs/{name}.pdf" + + if not os.path.exists(file_path): + logger.warning(f"PDF file not found: {file_path}") + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"PDF file '{name}' not found. Please generate it first using /common/generate-pdf" + ) + + logger.info(f"Serving PDF file: {file_path}") + return FileResponse( + path=file_path, + media_type="application/pdf", + filename=f"{name}.pdf" + ) + + +class PubchemRequest(BaseModel): + cas: str = Field(..., description="CAS number of the substance to search for in PubChem") + + class Config: + json_schema_extra = { + "example": { + "cas": "64-17-5" + } + } + + +class PubchemResponse(BaseModel): + success: bool + cas: str + data: Optional[Dict[str, Any]] = None + error: Optional[str] = None + + +@router.post("/common/pubchem", response_model=PubchemResponse, tags=["Common"]) +async def search_pubchem(request: PubchemRequest): + """ + Search for substance information in PubChem database. + + This endpoint retrieves comprehensive substance data from PubChem including: + - **Basic info**: CID, CAS, first PubChem name, PubChem link + - **First level properties**: XLogP, molecular weight, TPSA, exact mass + - **Second level properties**: Melting Point, Dissociation Constants, pH + + The data is automatically cleaned and formatted for easier consumption. + + Args: + request: PubchemRequest containing the CAS number + + Returns: + PubchemResponse with the substance data or error information + """ + logger.info(f"API request received for PubChem search: CAS={request.cas}") + + try: + result = pubchem_dap(request.cas) + + # Check if result is None (error occurred) + if result is None: + logger.error(f"PubChem search returned None for CAS: {request.cas}") + return PubchemResponse( + success=False, + cas=request.cas, + data=None, + error="An error occurred while searching PubChem. Please check the logs for details." + ) + + # Check if result is a string (no results found) + if isinstance(result, str): + logger.warning(f"No results found in PubChem for CAS: {request.cas}") + return PubchemResponse( + success=False, + cas=request.cas, + data=None, + error=result + ) + + # Successful result + logger.info(f"Successfully retrieved PubChem data for CAS: {request.cas}") + return PubchemResponse( + success=True, + cas=request.cas, + data=result, + error=None + ) + + except Exception as e: + logger.error(f"Error processing PubChem request for CAS {request.cas}: {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Internal error while processing PubChem request: {str(e)}" + ) + + +@router.get("/common/health", tags=["Common"]) +async def common_health_check(): + """ + Health check endpoint for common functions service. + + Returns the status of the common functions components. + """ + return { + "status": "healthy", + "service": "common-functions", + "components": { + "api": "operational", + "logging": "operational", + "utilities": "operational", + "pubchem": "operational" + } + } diff --git a/src/pif_compiler/functions/common_func.py b/src/pif_compiler/functions/common_func.py new file mode 100644 index 0000000..692c1e5 --- /dev/null +++ b/src/pif_compiler/functions/common_func.py @@ -0,0 +1,25 @@ +from playwright.sync_api import sync_playwright +import os + +from pif_compiler.functions.common_log import get_logger + +log = get_logger() + +def generate_pdf(link : str, name : str): + if os.path.exists(f'pdfs/{name}.pdf'): + log.info(f"PDF already exists for {name}, skipping generation.") + return True + else: + log.info(f"Generating PDF for {name} from link: {link}") + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + page.goto(link) + page.pdf(path=f'pdfs/{name}.pdf') + browser.close() + if os.path.exists(f'pdfs/{name}.pdf'): + log.info(f"PDF generated for {name}") + return True + else: + log.error(f"PDF generation failed for {name}") + return False \ No newline at end of file diff --git a/src/pif_compiler/main.py b/src/pif_compiler/main.py index 92489f5..ffd2ef1 100644 --- a/src/pif_compiler/main.py +++ b/src/pif_compiler/main.py @@ -8,7 +8,7 @@ import time from pif_compiler.functions.common_log import get_logger # Import dei tuoi router -from pif_compiler.api.routes import api_echa +from pif_compiler.api.routes import api_echa, api_cosing, common # Configurazione logging logger = get_logger() @@ -123,6 +123,17 @@ app.include_router( tags=["ECHA"] ) +app.include_router( + api_cosing.router, + prefix="/api/v1", + tags=["COSING"] +) + +app.include_router( + common.router, + prefix="/api/v1", + tags=["Common"] +) # ==================== ROOT ENDPOINTS ==================== diff --git a/src/pif_compiler/services/__init__.py b/src/pif_compiler/services/__init__.py index ba869a3..e1171a6 100644 --- a/src/pif_compiler/services/__init__.py +++ b/src/pif_compiler/services/__init__.py @@ -24,7 +24,7 @@ from pif_compiler.services.srv_cosing import ( ) # PubChem Service -from pif_compiler.services.pubchem_service import ( +from pif_compiler.services.srv_pubchem import ( pubchem_dap, clean_property_data, ) diff --git a/src/pif_compiler/services/pubchem_service.py b/src/pif_compiler/services/pubchem_service.py deleted file mode 100644 index f20f5a7..0000000 --- a/src/pif_compiler/services/pubchem_service.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -from contextlib import contextmanager -import pubchempy as pcp -from pubchemprops.pubchemprops import get_second_layer_props - -from pif_compiler.functions.common_log import get_logger - -logger = get_logger() - -@contextmanager -def temporary_certificate(cert_path): - """ - Context manager to temporarily change the certificate used for requests. - - Args: - cert_path (str): Path to the certificate file to use temporarily - - Example: - # Regular request uses default certificates - requests.get('https://api.example.com') - - # Use custom certificate only within this block - with temporary_certificate('custom-cert.pem'): - requests.get('https://api.requiring.custom.cert.com') - - # Back to default certificates - requests.get('https://api.example.com') - """ - # Store original environment variables - original_ca_bundle = os.environ.get('REQUESTS_CA_BUNDLE') - original_ssl_cert = os.environ.get('SSL_CERT_FILE') - - try: - # Set new certificate - os.environ['REQUESTS_CA_BUNDLE'] = cert_path - os.environ['SSL_CERT_FILE'] = cert_path - yield - finally: - # Restore original environment variables - if original_ca_bundle is not None: - os.environ['REQUESTS_CA_BUNDLE'] = original_ca_bundle - else: - os.environ.pop('REQUESTS_CA_BUNDLE', None) - - if original_ssl_cert is not None: - os.environ['SSL_CERT_FILE'] = original_ssl_cert - else: - os.environ.pop('SSL_CERT_FILE', None) - -def clean_property_data(api_response): - """ - Simplifies the API response data by flattening nested structures. - - Args: - api_response (dict): Raw API response containing property data - - Returns: - dict: Cleaned data with simplified structure - """ - cleaned_data = {} - - for property_name, measurements in api_response.items(): - cleaned_measurements = [] - - for measurement in measurements: - cleaned_measurement = { - 'ReferenceNumber': measurement.get('ReferenceNumber'), - 'Description': measurement.get('Description', ''), - } - - # Handle Reference field - if 'Reference' in measurement: - # Check if Reference is a list or string - ref = measurement['Reference'] - cleaned_measurement['Reference'] = ref[0] if isinstance(ref, list) else ref - - # Handle Value field - value = measurement.get('Value', {}) - if isinstance(value, dict) and 'StringWithMarkup' in value: - cleaned_measurement['Value'] = value['StringWithMarkup'][0]['String'] - else: - cleaned_measurement['Value'] = str(value) - - # Remove empty values - cleaned_measurement = {k: v for k, v in cleaned_measurement.items() if v} - - cleaned_measurements.append(cleaned_measurement) - - cleaned_data[property_name] = cleaned_measurements - - return cleaned_data - -def pubchem_dap(cas): - ''' - Data un CAS in input ricerca le informazioni per la scheda di sicurezza su PubChem. - Per estrarre le proprietà di 1o (sinonimi, cid, logP, MolecularWeight, ExactMass, TPSA) livello uso Pubchempy. - Per quelle di 2o livello uso pubchemprops (Melting point) - - args: - cas : string - - ''' - with temporary_certificate('src/data/ncbi-nlm-nih-gov-catena.pem'): - try: - # Ricerca iniziale - out = pcp.get_synonyms(cas, 'name') - if out: - out = out[0] - output = {'CID' : out['CID'], - 'CAS' : cas, - 'first_pubchem_name' : out['Synonym'][0], - 'pubchem_link' : f"https://pubchem.ncbi.nlm.nih.gov/compound/{out['CID']}"} - else: - return f'No results on PubChem for {cas}' - - except Exception as E: - logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem search for {cas}', exc_info=True) - - try: - # Ricerca delle proprietà - properties = pcp.get_properties(['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'], identifier = out['CID'], namespace='cid', searchtype=None, as_dataframe=False) - if properties: - output = {**output, **properties[0]} - else: - return output - except Exception as E: - logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem first level properties extraction for {cas}', exc_info=True) - - try: - # Ricerca del Melting Point - second_layer_props = get_second_layer_props(output['first_pubchem_name'], ['Melting Point', 'Dissociation Constants', 'pH']) - if second_layer_props: - second_layer_props = clean_property_data(second_layer_props) - output = {**output, **second_layer_props} - except Exception as E: - logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem second level properties extraction (Melting Point) for {cas}', exc_info=True) - - return output diff --git a/src/pif_compiler/services/srv_echa.py b/src/pif_compiler/services/srv_echa.py index 25094c1..07b2957 100644 --- a/src/pif_compiler/services/srv_echa.py +++ b/src/pif_compiler/services/srv_echa.py @@ -300,24 +300,6 @@ def parse_toxicology_html(html_content): #endregion -#region PDF extraction functions - -def generate_pdf_from_toxicology_info(index: dict): - with sync_playwright() as p: - browser = p.chromium.launch() - page = browser.new_page() - page.goto(index['toxicological_information_link']) - page.pdf(path=f'pdfs/{index["substance"]["rmlCas"]}.pdf') - browser.close() - if os.path.exists(f'pdfs/{index["substance"]["rmlCas"]}.pdf'): - log.info(f"PDF generated for CAS {index['substance']['rmlCas']}") - return True - else: - log.error(f"PDF generation failed for CAS {index['substance']['rmlCas']}") - return False - -#endregion - #region Orchestrator functions def echa_flow(cas) -> dict: diff --git a/src/pif_compiler/services/srv_pubchem.py b/src/pif_compiler/services/srv_pubchem.py new file mode 100644 index 0000000..115983c --- /dev/null +++ b/src/pif_compiler/services/srv_pubchem.py @@ -0,0 +1,110 @@ +import pubchempy as pcp +from pubchemprops.pubchemprops import get_second_layer_props + +from pif_compiler.functions.common_log import get_logger + +logger = get_logger() + +def clean_property_data(api_response): + """ + Simplifies the API response data by flattening nested structures. + + Args: + api_response (dict): Raw API response containing property data + + Returns: + dict: Cleaned data with simplified structure + """ + cleaned_data = {} + + for property_name, measurements in api_response.items(): + cleaned_measurements = [] + + for measurement in measurements: + cleaned_measurement = { + 'ReferenceNumber': measurement.get('ReferenceNumber'), + 'Description': measurement.get('Description', ''), + } + + # Handle Reference field + if 'Reference' in measurement: + # Check if Reference is a list or string + ref = measurement['Reference'] + cleaned_measurement['Reference'] = ref[0] if isinstance(ref, list) else ref + + # Handle Value field + value = measurement.get('Value', {}) + if isinstance(value, dict) and 'StringWithMarkup' in value: + cleaned_measurement['Value'] = value['StringWithMarkup'][0]['String'] + else: + cleaned_measurement['Value'] = str(value) + + # Remove empty values + cleaned_measurement = {k: v for k, v in cleaned_measurement.items() if v} + + cleaned_measurements.append(cleaned_measurement) + + cleaned_data[property_name] = cleaned_measurements + + return cleaned_data + +def pubchem_dap(cas): + ''' + Data un CAS in input ricerca le informazioni per la scheda di sicurezza su PubChem. + Per estrarre le proprietà di 1o (sinonimi, cid, logP, MolecularWeight, ExactMass, TPSA) livello uso Pubchempy. + Per quelle di 2o livello uso pubchemprops (Melting point) + + args: + cas : string + + ''' + try: + # Ricerca iniziale + logger.info(f"Searching PubChem for CAS: {cas}") + out = pcp.get_synonyms(cas, 'name') + if out: + out = out[0] + output = {'CID' : out['CID'], + 'CAS' : cas, + 'first_pubchem_name' : out['Synonym'][0], + 'pubchem_link' : f"https://pubchem.ncbi.nlm.nih.gov/compound/{out['CID']}"} + logger.info(f"Found PubChem entry for {cas}: CID {out['CID']}") + else: + logger.warning(f"No results on PubChem for {cas}") + return f'No results on PubChem for {cas}' + + except Exception as E: + logger.error(f'Error during pubchem search for {cas}', exc_info=True) + return None + + try: + # Ricerca delle proprietà + logger.debug(f"Fetching first level properties for CID {output['CID']}") + properties = pcp.get_properties(['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'], identifier = out['CID'], namespace='cid', searchtype=None, as_dataframe=False) + if properties: + output = {**output, **properties[0]} + logger.debug(f"Successfully retrieved first level properties for {cas}") + else: + logger.warning(f"No first level properties found for {cas}") + return output + except Exception as E: + logger.error(f'Error during pubchem first level properties extraction for {cas}', exc_info=True) + + try: + # Ricerca del Melting Point + logger.debug(f"Fetching second level properties for {output['first_pubchem_name']}") + second_layer_props = get_second_layer_props(output['first_pubchem_name'], ['Melting Point', 'Dissociation Constants', 'pH']) + if second_layer_props: + second_layer_props = clean_property_data(second_layer_props) + output = {**output, **second_layer_props} + logger.debug(f"Successfully retrieved second level properties for {cas}") + except Exception as E: + logger.error(f'Error during pubchem second level properties extraction for {cas}', exc_info=True) + + return output + +if __name__ == "__main__": + # Esempio di utilizzo + cas_number = "64-17-5" # CAS per l'etanolo + result = pubchem_dap(cas_number) + print(result) \ No newline at end of file