update: new endpoint for the api (cosing, pubchem, download)

2025-12-08 10:02:44 +01:00 · 2025-12-08 10:02:44 +01:00 · f04d4f8b3e
commit f04d4f8b3e
parent 5fd12cb7a7
8 changed files with 370 additions and 158 deletions
--- a/src/pif_compiler/api/routes/api_cosing.py
+++ b/src/pif_compiler/api/routes/api_cosing.py
--- a/src/pif_compiler/api/routes/common.py
+++ b/src/pif_compiler/api/routes/common.py
@ -0,0 +1,222 @@
+from fastapi import APIRouter, HTTPException, status
+from fastapi.responses import FileResponse
+from pydantic import BaseModel, Field, HttpUrl
+from typing import Optional, Dict, Any
+import os
+
+from pif_compiler.functions.common_func import generate_pdf
+from pif_compiler.services.srv_pubchem import pubchem_dap
+from pif_compiler.functions.common_log import get_logger
+
+# Module-level logger obtained from the project's common_log helper.
+logger = get_logger()
+
+# Router collecting the "common" endpoints declared in this module.
+router = APIRouter()
+
+
+# Request body for POST /common/generate-pdf.
+# Both fields are required (`...` means "no default").
+class GeneratePdfRequest(BaseModel):
+    # Forwarded unchanged to generate_pdf() as the page to render.
+    link: str = Field(..., description="URL of the page to convert to PDF")
+    # Interpolated by the endpoint into the on-disk path "pdfs/{name}.pdf".
+    name: str = Field(..., description="Name for the generated PDF file (without extension)")
+
+    class Config:
+        # Example payload supplied through json_schema_extra.
+        json_schema_extra = {
+            "example": {
+                "link": "https://example.com/page",
+                "name": "my_document"
+            }
+        }
+
+
+# Response body returned by POST /common/generate-pdf.
+class GeneratePdfResponse(BaseModel):
+    success: bool  # True only when the PDF file exists on disk after the call
+    name: str  # echoes the PDF name from the request
+    message: str  # human-readable description of the outcome
+    file_path: Optional[str] = None  # "pdfs/{name}.pdf" on success, otherwise None
+
+
+@router.post("/common/generate-pdf", response_model=GeneratePdfResponse, tags=["Common"])
+async def generate_pdf_endpoint(request: GeneratePdfRequest):
+    """
+    Generate a PDF from a web page URL.
+
+    This endpoint uses Playwright (through `generate_pdf`) to:
+    1. Navigate to the provided URL
+    2. Render the page
+    3. Generate a PDF file
+    4. Save it in the 'pdfs/' directory
+
+    If a PDF with the same name already exists, it will skip generation
+    and return success immediately.
+
+    Args:
+        request: GeneratePdfRequest with the URL and desired PDF name
+
+    Returns:
+        GeneratePdfResponse with success status and file information
+
+    Raises:
+        HTTPException: 400 when `name` is not a plain file name or `link`
+            is not an http(s) URL; 500 when PDF generation raises.
+    """
+    # Function-scope imports keep this block self-contained.
+    from urllib.parse import urlsplit
+    from fastapi.concurrency import run_in_threadpool
+
+    logger.info(f"API request received to generate PDF: name='{request.name}', link='{request.link}'")
+
+    # Input validation is done BEFORE the try block: the generic
+    # `except Exception` below would otherwise turn these 400s into 500s.
+
+    # `name` is interpolated into a filesystem path, so it must be a plain
+    # file name (no separators, no NUL byte, not a directory reference).
+    if request.name in ("", ".", "..") or any(ch in request.name for ch in ("/", "\\", "\x00")):
+        logger.warning(f"Rejected unsafe PDF name: {request.name!r}")
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Invalid PDF name: it must be a plain file name without path separators"
+        )
+
+    # Only web pages may be rendered; other schemes (e.g. file://) would let
+    # a caller turn local files into downloadable PDFs.
+    try:
+        scheme = urlsplit(request.link).scheme.lower()
+    except ValueError:
+        scheme = ""
+    if scheme not in ("http", "https"):
+        logger.warning(f"Rejected link with unsupported scheme: {request.link!r}")
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Invalid link: only http and https URLs are supported"
+        )
+
+    try:
+        # generate_pdf() relies on Playwright's sync API, which refuses to
+        # start in a thread that has a running asyncio loop. Running it in a
+        # worker thread makes it usable from this coroutine and keeps the
+        # event loop free while the page renders.
+        result = await run_in_threadpool(generate_pdf, request.link, request.name)
+
+        if not result:
+            logger.error(f"PDF generation failed for '{request.name}'")
+            return GeneratePdfResponse(
+                success=False,
+                name=request.name,
+                message="PDF generation failed",
+                file_path=None
+            )
+
+        file_path = f"pdfs/{request.name}.pdf"
+
+        # generate_pdf() reported success: double-check the file is on disk.
+        if not os.path.exists(file_path):
+            logger.error(f"PDF file not found after generation for '{request.name}'")
+            return GeneratePdfResponse(
+                success=False,
+                name=request.name,
+                message="PDF generation completed but file not found",
+                file_path=None
+            )
+
+        logger.info(f"PDF available for '{request.name}'")
+        return GeneratePdfResponse(
+            success=True,
+            name=request.name,
+            message="PDF generated successfully or already exists",
+            file_path=file_path
+        )
+
+    except Exception as e:
+        logger.error(f"Error generating PDF for '{request.name}': {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Internal error while generating PDF: {str(e)}"
+        )
+
+
+@router.get("/common/download-pdf/{name}", response_class=FileResponse, tags=["Common"])
+async def download_pdf(name: str):
+    """
+    Download a previously generated PDF file.
+
+    Args:
+        name: Name of the PDF file (without extension)
+
+    Returns:
+        FileResponse with the PDF file for download
+
+    Raises:
+        HTTPException: 400 when `name` is not a plain file name;
+            404 when 'pdfs/<name>.pdf' is not an existing file.
+    """
+    logger.info(f"API request received to download PDF: name='{name}'")
+
+    # `name` ends up in a filesystem path: refuse anything that is not a
+    # plain file name so the lookup cannot leave the 'pdfs/' directory
+    # (e.g. backslash-separated segments on Windows).
+    if name in ("", ".", "..") or any(ch in name for ch in ("/", "\\", "\x00")):
+        logger.warning(f"Rejected unsafe PDF name: {name!r}")
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Invalid PDF name: it must be a plain file name without path separators"
+        )
+
+    file_path = f"pdfs/{name}.pdf"
+
+    # isfile() rather than exists(): a directory with that name must not be
+    # handed to FileResponse.
+    if not os.path.isfile(file_path):
+        logger.warning(f"PDF file not found: {file_path}")
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=f"PDF file '{name}' not found. Please generate it first using /common/generate-pdf"
+        )
+
+    logger.info(f"Serving PDF file: {file_path}")
+    return FileResponse(
+        path=file_path,
+        media_type="application/pdf",
+        filename=f"{name}.pdf"
+    )
+
+
+# Request body for POST /common/pubchem.
+class PubchemRequest(BaseModel):
+    # Required; passed as-is to pubchem_dap() by the endpoint.
+    cas: str = Field(..., description="CAS number of the substance to search for in PubChem")
+
+    class Config:
+        # Example payload supplied through json_schema_extra.
+        json_schema_extra = {
+            "example": {
+                "cas": "64-17-5"
+            }
+        }
+
+
+# Response body returned by POST /common/pubchem.
+class PubchemResponse(BaseModel):
+    success: bool  # True only when pubchem_dap() returned a data dict
+    cas: str  # echoes the CAS number from the request
+    data: Optional[Dict[str, Any]] = None  # substance data on success, otherwise None
+    error: Optional[str] = None  # failure message; None on success
+
+
+@router.post("/common/pubchem", response_model=PubchemResponse, tags=["Common"])
+async def search_pubchem(request: PubchemRequest):
+    """
+    Search for substance information in PubChem database.
+
+    This endpoint retrieves comprehensive substance data from PubChem including:
+    - **Basic info**: CID, CAS, first PubChem name, PubChem link
+    - **First level properties**: XLogP, molecular weight, TPSA, exact mass
+    - **Second level properties**: Melting Point, Dissociation Constants, pH
+
+    The data is automatically cleaned and formatted for easier consumption.
+
+    Args:
+        request: PubchemRequest containing the CAS number
+
+    Returns:
+        PubchemResponse with the substance data or error information
+
+    Raises:
+        HTTPException: 500 when the lookup raises an unexpected exception.
+    """
+    # Function-scope import keeps this block self-contained.
+    from fastapi.concurrency import run_in_threadpool
+
+    logger.info(f"API request received for PubChem search: CAS={request.cas}")
+
+    try:
+        # pubchem_dap() performs blocking network calls; running it in a
+        # worker thread keeps the event loop free to serve other requests.
+        result = await run_in_threadpool(pubchem_dap, request.cas)
+
+        # None means the initial PubChem search failed (details are logged
+        # by pubchem_dap itself).
+        if result is None:
+            logger.error(f"PubChem search returned None for CAS: {request.cas}")
+            return PubchemResponse(
+                success=False,
+                cas=request.cas,
+                data=None,
+                error="An error occurred while searching PubChem. Please check the logs for details."
+            )
+
+        # A string result is the "no results" message produced by pubchem_dap.
+        if isinstance(result, str):
+            logger.warning(f"No results found in PubChem for CAS: {request.cas}")
+            return PubchemResponse(
+                success=False,
+                cas=request.cas,
+                data=None,
+                error=result
+            )
+
+        # Anything else is the dict of substance data.
+        logger.info(f"Successfully retrieved PubChem data for CAS: {request.cas}")
+        return PubchemResponse(
+            success=True,
+            cas=request.cas,
+            data=result,
+            error=None
+        )
+
+    except Exception as e:
+        logger.error(f"Error processing PubChem request for CAS {request.cas}: {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Internal error while processing PubChem request: {str(e)}"
+        )
+
+
+@router.get("/common/health", tags=["Common"])
+async def common_health_check():
+    """
+    Report the health of the common-functions service.
+
+    The payload is static: every listed component is reported as
+    "operational" and the overall status as "healthy".
+    """
+    component_names = ("api", "logging", "utilities", "pubchem")
+    components = {component: "operational" for component in component_names}
+
+    return {
+        "status": "healthy",
+        "service": "common-functions",
+        "components": components,
+    }
--- a/src/pif_compiler/functions/common_func.py
+++ b/src/pif_compiler/functions/common_func.py
@ -0,0 +1,25 @@
+from playwright.sync_api import sync_playwright
+import os
+
+from pif_compiler.functions.common_log import get_logger
+
+# Module-level logger obtained from the project's common_log helper.
+log = get_logger()
+
+def generate_pdf(link : str, name : str) -> bool:
+    """
+    Render a web page to 'pdfs/<name>.pdf' with headless Chromium (Playwright).
+
+    If the target file already exists nothing is rendered and True is
+    returned. Uses Playwright's sync API, so it must not be called from a
+    thread that is running an asyncio event loop.
+
+    Args:
+        link: URL of the page to render.
+        name: File name of the PDF, without extension. Must be a plain file
+            name (no path separators).
+
+    Returns:
+        True if the PDF exists on disk when the function returns, False if
+        `name` is rejected or the file is missing after rendering.
+
+    Raises:
+        Any exception raised by Playwright while launching the browser,
+        navigating or printing is propagated to the caller.
+    """
+    # `name` becomes part of a filesystem path: reject anything that could
+    # make the PDF land outside the 'pdfs/' directory.
+    if name in ("", ".", "..") or any(ch in name for ch in ("/", "\\", "\x00")):
+        log.error(f"Refusing to generate PDF: invalid name {name!r}")
+        return False
+
+    pdf_path = f'pdfs/{name}.pdf'
+
+    if os.path.exists(pdf_path):
+        log.info(f"PDF already exists for {name}, skipping generation.")
+        return True
+
+    log.info(f"Generating PDF for {name} from link: {link}")
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        try:
+            page = browser.new_page()
+            page.goto(link)
+            page.pdf(path=pdf_path)
+        finally:
+            # Close the browser even when navigation or printing fails.
+            browser.close()
+
+    if os.path.exists(pdf_path):
+        log.info(f"PDF generated for {name}")
+        return True
+
+    log.error(f"PDF generation failed for {name}")
+    return False
--- a/src/pif_compiler/main.py
+++ b/src/pif_compiler/main.py
@ -8,7 +8,7 @@ import time
 from pif_compiler.functions.common_log import get_logger

 # Import dei tuoi router
-from pif_compiler.api.routes import api_echa
+from pif_compiler.api.routes import api_echa, api_cosing, common

 # Configurazione logging
 logger = get_logger()
@ -123,6 +123,17 @@ app.include_router(
    tags=["ECHA"]
 )

+# Mount the COSING routes under the versioned /api/v1 prefix.
+app.include_router(
+    api_cosing.router,
+    prefix="/api/v1",
+    tags=["COSING"]
+)
+
+# Mount the shared "common" routes (PDF generation/download, PubChem lookup,
+# health check) under the same /api/v1 prefix.
+app.include_router(
+    common.router,
+    prefix="/api/v1",
+    tags=["Common"]
+)

 # ==================== ROOT ENDPOINTS ====================

--- a/src/pif_compiler/services/init.py
+++ b/src/pif_compiler/services/init.py
@ -24,7 +24,7 @@ from pif_compiler.services.srv_cosing import (
 )

 # PubChem Service
-from pif_compiler.services.pubchem_service import (
+from pif_compiler.services.srv_pubchem import (
    pubchem_dap,
    clean_property_data,
 )
--- a/src/pif_compiler/services/pubchem_service.py
+++ b/src/pif_compiler/services/pubchem_service.py
@ -1,138 +0,0 @@
-import os
-from contextlib import contextmanager
-import pubchempy as pcp
-from pubchemprops.pubchemprops import get_second_layer_props
-
-from pif_compiler.functions.common_log import get_logger
-
-logger = get_logger()
-
-@contextmanager
-def temporary_certificate(cert_path):
-    """
-    Context manager to temporarily change the certificate used for requests.
-    
-    Args:
-        cert_path (str): Path to the certificate file to use temporarily
-        
-    Example:
-        # Regular request uses default certificates
-        requests.get('https://api.example.com')
-        
-        # Use custom certificate only within this block
-        with temporary_certificate('custom-cert.pem'):
-            requests.get('https://api.requiring.custom.cert.com')
-            
-        # Back to default certificates
-        requests.get('https://api.example.com')
-    """
-    # Store original environment variables
-    original_ca_bundle = os.environ.get('REQUESTS_CA_BUNDLE')
-    original_ssl_cert = os.environ.get('SSL_CERT_FILE')
-    
-    try:
-        # Set new certificate
-        os.environ['REQUESTS_CA_BUNDLE'] = cert_path
-        os.environ['SSL_CERT_FILE'] = cert_path
-        yield
-    finally:
-        # Restore original environment variables
-        if original_ca_bundle is not None:
-            os.environ['REQUESTS_CA_BUNDLE'] = original_ca_bundle
-        else:
-            os.environ.pop('REQUESTS_CA_BUNDLE', None)
-            
-        if original_ssl_cert is not None:
-            os.environ['SSL_CERT_FILE'] = original_ssl_cert
-        else:
-            os.environ.pop('SSL_CERT_FILE', None)
-
-def clean_property_data(api_response):
-    """
-    Simplifies the API response data by flattening nested structures.
-    
-    Args:
-        api_response (dict): Raw API response containing property data
-        
-    Returns:
-        dict: Cleaned data with simplified structure
-    """
-    cleaned_data = {}
-    
-    for property_name, measurements in api_response.items():
-        cleaned_measurements = []
-        
-        for measurement in measurements:
-            cleaned_measurement = {
-                'ReferenceNumber': measurement.get('ReferenceNumber'),
-                'Description': measurement.get('Description', ''),
-            }
-            
-            # Handle Reference field
-            if 'Reference' in measurement:
-                # Check if Reference is a list or string
-                ref = measurement['Reference']
-                cleaned_measurement['Reference'] = ref[0] if isinstance(ref, list) else ref
-            
-            # Handle Value field
-            value = measurement.get('Value', {})
-            if isinstance(value, dict) and 'StringWithMarkup' in value:
-                cleaned_measurement['Value'] = value['StringWithMarkup'][0]['String']
-            else:
-                cleaned_measurement['Value'] = str(value)
-                
-            # Remove empty values
-            cleaned_measurement = {k: v for k, v in cleaned_measurement.items() if v}
-            
-            cleaned_measurements.append(cleaned_measurement)
-            
-        cleaned_data[property_name] = cleaned_measurements
-    
-    return cleaned_data
-
-def pubchem_dap(cas):
-    '''
-    Data un CAS in input ricerca le informazioni per la scheda di sicurezza su PubChem.
-    Per estrarre le proprietà di 1o (sinonimi, cid, logP, MolecularWeight, ExactMass, TPSA) livello uso Pubchempy. 
-    Per quelle di 2o livello uso pubchemprops (Melting point)
-    
-    args:
-    cas : string 
-
-    '''
-    with temporary_certificate('src/data/ncbi-nlm-nih-gov-catena.pem'):
-        try:
-            # Ricerca iniziale
-            out = pcp.get_synonyms(cas, 'name')
-            if out:
-                out = out[0]
-                output = {'CID' : out['CID'],
-                        'CAS' : cas,
-                        'first_pubchem_name' : out['Synonym'][0],
-                        'pubchem_link' : f"https://pubchem.ncbi.nlm.nih.gov/compound/{out['CID']}"}
-            else:
-                return f'No results on PubChem for {cas}'
-
-        except Exception as E:
-                logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem search for {cas}', exc_info=True)
-
-        try:
-            # Ricerca delle proprietà
-            properties = pcp.get_properties(['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'], identifier = out['CID'], namespace='cid', searchtype=None, as_dataframe=False)
-            if properties:
-                output = {**output, **properties[0]}
-            else:
-                return output
-        except Exception as E:
-            logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem first level properties extraction for {cas}', exc_info=True)
-        
-        try:
-            # Ricerca del Melting Point
-            second_layer_props = get_second_layer_props(output['first_pubchem_name'], ['Melting Point', 'Dissociation Constants', 'pH'])
-            if second_layer_props:
-                second_layer_props = clean_property_data(second_layer_props)
-                output = {**output, **second_layer_props}
-        except Exception as E:
-            logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem second level properties extraction (Melting Point) for {cas}', exc_info=True)
-        
-        return output
--- a/src/pif_compiler/services/srv_echa.py
+++ b/src/pif_compiler/services/srv_echa.py
@ -300,24 +300,6 @@ def parse_toxicology_html(html_content):

 #endregion

-#region PDF extraction functions
-
-def generate_pdf_from_toxicology_info(index: dict):
-    with sync_playwright() as p:
-        browser = p.chromium.launch()
-        page = browser.new_page()
-        page.goto(index['toxicological_information_link'])
-        page.pdf(path=f'pdfs/{index["substance"]["rmlCas"]}.pdf')
-        browser.close()
-    if os.path.exists(f'pdfs/{index["substance"]["rmlCas"]}.pdf'):
-        log.info(f"PDF generated for CAS {index['substance']['rmlCas']}")
-        return True
-    else:
-        log.error(f"PDF generation failed for CAS {index['substance']['rmlCas']}")
-        return False
-        
-#endregion
-
 #region Orchestrator functions

 def echa_flow(cas) -> dict:
--- a/src/pif_compiler/services/srv_pubchem.py
+++ b/src/pif_compiler/services/srv_pubchem.py
@ -0,0 +1,110 @@
+import pubchempy as pcp
+from pubchemprops.pubchemprops import get_second_layer_props
+
+from pif_compiler.functions.common_log import get_logger
+
+# Module-level logger obtained from the project's common_log helper.
+logger = get_logger()
+
+def clean_property_data(api_response):
+    """
+    Simplify the API response data by flattening nested structures.
+
+    For every measurement only 'ReferenceNumber', 'Description',
+    'Reference' (first entry when it is a list) and 'Value' (the first
+    'StringWithMarkup' string, or the stringified raw value) are kept;
+    empty or missing entries are dropped.
+
+    Args:
+        api_response (dict): Raw API response mapping each property name to
+            a list of measurement dicts. None or an empty dict yields {}.
+
+    Returns:
+        dict: Property name -> list of cleaned measurement dicts.
+    """
+    if not api_response:
+        return {}
+
+    cleaned_data = {}
+
+    for property_name, measurements in api_response.items():
+        cleaned_measurements = []
+
+        for measurement in measurements:
+            cleaned_measurement = {
+                'ReferenceNumber': measurement.get('ReferenceNumber'),
+                'Description': measurement.get('Description', ''),
+            }
+
+            # Reference may be a list or a plain string; an empty list
+            # simply yields no reference instead of raising IndexError.
+            if 'Reference' in measurement:
+                ref = measurement['Reference']
+                if isinstance(ref, list):
+                    ref = ref[0] if ref else None
+                cleaned_measurement['Reference'] = ref
+
+            # Value is either a {'StringWithMarkup': [{'String': ...}]}
+            # structure or some other raw value that is stringified. A
+            # missing or empty value is skipped so it does not show up as
+            # the literal string '{}'.
+            value = measurement.get('Value')
+            if isinstance(value, dict) and 'StringWithMarkup' in value:
+                markup = value['StringWithMarkup']
+                cleaned_measurement['Value'] = markup[0].get('String') if markup else None
+            elif value is not None and value != {}:
+                cleaned_measurement['Value'] = str(value)
+
+            # Remove empty values
+            cleaned_measurement = {k: v for k, v in cleaned_measurement.items() if v}
+
+            cleaned_measurements.append(cleaned_measurement)
+
+        cleaned_data[property_name] = cleaned_measurements
+
+    return cleaned_data
+
+def pubchem_dap(cas):
+    '''
+    Given a CAS number, look up on PubChem the information needed for the
+    safety data sheet.
+
+    First-level properties (synonyms, CID, logP, molecular weight, exact
+    mass, TPSA) are fetched with pubchempy; second-level properties
+    (melting point, dissociation constants, pH) with pubchemprops.
+
+    args:
+    cas : string
+
+    returns:
+    dict with the collected data; a string message when PubChem has no
+    match for the CAS; None when the initial search raises. Failures while
+    fetching first- or second-level properties are logged and the data
+    gathered so far is returned.
+    '''
+    try:
+        # Initial search
+        logger.info(f"Searching PubChem for CAS: {cas}")
+        matches = pcp.get_synonyms(cas, 'name')
+        if not matches:
+            logger.warning(f"No results on PubChem for {cas}")
+            return f'No results on PubChem for {cas}'
+
+        first_match = matches[0]
+        cid = first_match['CID']
+        output = {
+            'CID': cid,
+            'CAS': cas,
+            'first_pubchem_name': first_match['Synonym'][0],
+            'pubchem_link': f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}",
+        }
+        logger.info(f"Found PubChem entry for {cas}: CID {cid}")
+    except Exception:
+        logger.error(f'Error during pubchem search for {cas}', exc_info=True)
+        return None
+
+    try:
+        # First-level properties
+        logger.debug(f"Fetching first level properties for CID {cid}")
+        first_level = pcp.get_properties(
+            ['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'],
+            identifier=cid,
+            namespace='cid',
+            searchtype=None,
+            as_dataframe=False,
+        )
+        if not first_level:
+            # Without first-level properties the basic info is returned as is.
+            logger.warning(f"No first level properties found for {cas}")
+            return output
+
+        output = {**output, **first_level[0]}
+        logger.debug(f"Successfully retrieved first level properties for {cas}")
+    except Exception:
+        logger.error(f'Error during pubchem first level properties extraction for {cas}', exc_info=True)
+
+    try:
+        # Second-level properties (melting point, dissociation constants, pH)
+        compound_name = output['first_pubchem_name']
+        logger.debug(f"Fetching second level properties for {compound_name}")
+        second_level = get_second_layer_props(
+            compound_name,
+            ['Melting Point', 'Dissociation Constants', 'pH'],
+        )
+        if second_level:
+            output = {**output, **clean_property_data(second_level)}
+            logger.debug(f"Successfully retrieved second level properties for {cas}")
+    except Exception:
+        logger.error(f'Error during pubchem second level properties extraction for {cas}', exc_info=True)
+
+    return output
+
+if __name__ == "__main__":
+    # Usage example: look up ethanol (CAS 64-17-5) and print the result.
+    print(pubchem_dap("64-17-5"))