update: new endpoint for the api (cosing, pubchem, download)

2025-12-08 10:02:44 +01:00 · 2025-12-08 10:02:44 +01:00 · f04d4f8b3e
commit f04d4f8b3e
parent 5fd12cb7a7
8 changed files with 370 additions and 158 deletions
--- a/src/pif_compiler/api/routes/api_cosing.py
+++ b/src/pif_compiler/api/routes/api_cosing.py
--- a/src/pif_compiler/api/routes/common.py
+++ b/src/pif_compiler/api/routes/common.py
@ -0,0 +1,222 @@
 from fastapi import APIRouter, HTTPException, status
 from fastapi.responses import FileResponse
 from pydantic import BaseModel, Field, HttpUrl
 from typing import Optional, Dict, Any
 import os
 from pif_compiler.functions.common_func import generate_pdf
 from pif_compiler.services.srv_pubchem import pubchem_dap
 from pif_compiler.functions.common_log import get_logger
 logger = get_logger()
 router = APIRouter()
 class GeneratePdfRequest(BaseModel):
     """Request body for the PDF generation endpoint."""

     # Address of the page to render; passed to the PDF generator unchanged.
     link: str = Field(..., description="URL of the page to convert to PDF")
     # Base file name; the server appends the ".pdf" extension itself.
     name: str = Field(..., description="Name for the generated PDF file (without extension)")

     # Pydantic v2 style configuration. The nested ``class Config`` form is
     # deprecated in v2 and emits a warning at import time; ``json_schema_extra``
     # is already the v2 key name, so only the container changes here.
     model_config = {
         "json_schema_extra": {
             "example": {
                 "link": "https://example.com/page",
                 "name": "my_document"
             }
         }
     }
 class GeneratePdfResponse(BaseModel):
     """Outcome of a PDF generation request."""

     # True when the PDF is available on disk after the call.
     success: bool
     # Echo of the requested PDF name (without extension).
     name: str
     # Human-readable description of the outcome.
     message: str
     # Relative path of the PDF; left as None when generation did not succeed.
     file_path: Optional[str] = Field(default=None)
@router.post("/common/generate-pdf", response_model=GeneratePdfResponse, tags=["Common"])
async def generate_pdf_endpoint(request: GeneratePdfRequest):
    """
    Generate a PDF from a web page URL.

    The request is validated first:
    - ``name`` must be a plain file name (no path separators), because it is
      interpolated into ``pdfs/{name}.pdf``;
    - ``link`` must be an absolute ``http``/``https`` URL, so local resources
      such as ``file://`` paths cannot be rendered into a downloadable PDF.

    The actual rendering is delegated to ``generate_pdf``, which skips the
    work and reports success when a PDF with the same name already exists.

    Args:
        request: GeneratePdfRequest with the URL and desired PDF name

    Returns:
        GeneratePdfResponse with success status and file information

    Raises:
        HTTPException: 400 for an unsafe name or unsupported URL,
            500 when PDF generation raises an unexpected error.
    """
    # Local imports keep this handler self-contained; both come from the
    # standard library or from FastAPI, which this module already depends on.
    from urllib.parse import urlparse
    from fastapi.concurrency import run_in_threadpool

    logger.info(f"API request received to generate PDF: name='{request.name}', link='{request.link}'")

    name = request.name
    # The name ends up inside a filesystem path: refuse anything that could
    # escape the 'pdfs/' directory.
    if not name or any(ch in name for ch in ("/", "\\", "\x00")):
        logger.warning(f"Rejected PDF generation with unsafe name: {name!r}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid PDF name: it must be a non-empty file name without path separators"
        )

    parsed_link = urlparse(request.link)
    # NOTE(review): this only restricts the scheme; internal hosts are still
    # reachable. Add an allow-list if the service is exposed to untrusted callers.
    if parsed_link.scheme not in ("http", "https") or not parsed_link.netloc:
        logger.warning(f"Rejected PDF generation with unsupported link: {request.link!r}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid link: only absolute http(s) URLs are supported"
        )

    try:
        # generate_pdf drives Playwright's synchronous API, which must not run
        # on the event-loop thread: execute it in the worker thread pool.
        result = await run_in_threadpool(generate_pdf, request.link, name)

        if not result:
            logger.error(f"PDF generation failed for '{request.name}'")
            return GeneratePdfResponse(
                success=False,
                name=request.name,
                message="PDF generation failed",
                file_path=None
            )

        file_path = f"pdfs/{name}.pdf"
        # Double-check the file is really on disk before advertising its path.
        if not os.path.exists(file_path):
            logger.error(f"PDF file not found after generation for '{request.name}'")
            return GeneratePdfResponse(
                success=False,
                name=request.name,
                message="PDF generation completed but file not found",
                file_path=None
            )

        logger.info(f"PDF available for '{request.name}'")
        return GeneratePdfResponse(
            success=True,
            name=request.name,
            message="PDF generated successfully or already exists",
            file_path=file_path
        )
    except Exception as e:
        logger.error(f"Error generating PDF for '{request.name}': {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal error while generating PDF: {str(e)}"
        ) from e
@router.get("/common/download-pdf/{name}", response_class=FileResponse, tags=["Common"])
async def download_pdf(name: str):
    """
    Download a previously generated PDF file.

    Args:
        name: Name of the PDF file (without extension). It must be a plain
            file name: values containing path separators are rejected because
            the name is interpolated into ``pdfs/{name}.pdf``.

    Returns:
        FileResponse with the PDF file for download

    Raises:
        HTTPException: 400 when the name is unsafe, 404 when no such PDF exists.
    """
    logger.info(f"API request received to download PDF: name='{name}'")

    # Never let a caller-controlled value walk out of the 'pdfs/' directory.
    if not name or any(ch in name for ch in ("/", "\\", "\x00")):
        logger.warning(f"Rejected PDF download with unsafe name: {name!r}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid PDF name: it must be a non-empty file name without path separators"
        )

    file_path = f"pdfs/{name}.pdf"
    # isfile (not exists): a directory that happens to match must yield 404
    # instead of failing later while the response is being streamed.
    if not os.path.isfile(file_path):
        logger.warning(f"PDF file not found: {file_path}")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"PDF file '{name}' not found. Please generate it first using /common/generate-pdf"
        )

    logger.info(f"Serving PDF file: {file_path}")
    return FileResponse(
        path=file_path,
        media_type="application/pdf",
        filename=f"{name}.pdf"
    )
 class PubchemRequest(BaseModel):
     """Request body for the PubChem lookup endpoint."""

     # CAS registry number used as the search key.
     cas: str = Field(..., description="CAS number of the substance to search for in PubChem")

     # Pydantic v2 style configuration. The nested ``class Config`` form is
     # deprecated in v2 and emits a warning at import time; ``json_schema_extra``
     # is already the v2 key name, so only the container changes here.
     model_config = {
         "json_schema_extra": {
             "example": {
                 "cas": "64-17-5"
             }
         }
     }
 class PubchemResponse(BaseModel):
     """Outcome of a PubChem lookup."""

     # True only when substance data was retrieved.
     success: bool
     # Echo of the CAS number that was searched.
     cas: str
     # Substance data on success; None otherwise.
     data: Optional[Dict[str, Any]] = Field(default=None)
     # Error or "no results" message on failure; None on success.
     error: Optional[str] = Field(default=None)
@router.post("/common/pubchem", response_model=PubchemResponse, tags=["Common"])
async def search_pubchem(request: PubchemRequest):
    """
    Search for substance information in PubChem database.

    This endpoint retrieves comprehensive substance data from PubChem including:
    - **Basic info**: CID, CAS, first PubChem name, PubChem link
    - **First level properties**: XLogP, molecular weight, TPSA, exact mass
    - **Second level properties**: Melting Point, Dissociation Constants, pH

    The lookup itself is delegated to ``pubchem_dap``; its three possible
    results are mapped as follows:
    - ``None``  -> ``success=False`` with a generic error message,
    - ``str``   -> ``success=False`` with that string as the error ("no results"),
    - anything else -> ``success=True`` with the value as ``data``.

    Args:
        request: PubchemRequest containing the CAS number

    Returns:
        PubchemResponse with the substance data or error information

    Raises:
        HTTPException: 500 when the lookup raises an unexpected error.
    """
    # Local import: FastAPI is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    logger.info(f"API request received for PubChem search: CAS={request.cas}")
    try:
        # pubchem_dap is a synchronous function that performs remote lookups;
        # run it in the worker thread pool so the event loop stays responsive.
        result = await run_in_threadpool(pubchem_dap, request.cas)

        # None signals that the lookup failed (details are in the service logs).
        if result is None:
            logger.error(f"PubChem search returned None for CAS: {request.cas}")
            return PubchemResponse(
                success=False,
                cas=request.cas,
                data=None,
                error="An error occurred while searching PubChem. Please check the logs for details."
            )

        # A plain string is the "no results" message produced by the service.
        if isinstance(result, str):
            logger.warning(f"No results found in PubChem for CAS: {request.cas}")
            return PubchemResponse(
                success=False,
                cas=request.cas,
                data=None,
                error=result
            )

        logger.info(f"Successfully retrieved PubChem data for CAS: {request.cas}")
        return PubchemResponse(
            success=True,
            cas=request.cas,
            data=result,
            error=None
        )
    except Exception as e:
        logger.error(f"Error processing PubChem request for CAS {request.cas}: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal error while processing PubChem request: {str(e)}"
        ) from e
@router.get("/common/health", tags=["Common"])
async def common_health_check():
    """
    Health check endpoint for common functions service.

    Returns a fixed payload: no component is actually probed, every entry
    is reported as "operational".
    """
    component_names = ("api", "logging", "utilities", "pubchem")
    components = {component: "operational" for component in component_names}
    return {
        "status": "healthy",
        "service": "common-functions",
        "components": components,
    }
--- a/src/pif_compiler/functions/common_func.py
+++ b/src/pif_compiler/functions/common_func.py
@ -0,0 +1,25 @@
 from playwright.sync_api import sync_playwright
 import os
 from pif_compiler.functions.common_log import get_logger
 log = get_logger()
 def generate_pdf(link : str, name : str) -> bool:
     """
     Render a web page to ``pdfs/{name}.pdf`` using headless Chromium.

     Generation is skipped when the target file already exists.

     Args:
         link: URL of the page to render.
         name: Base name of the PDF file (without extension).

     Returns:
         True when the PDF exists on disk after the call (either pre-existing
         or freshly generated), False when it is missing after generation.

     Raises:
         Whatever Playwright raises while launching the browser, loading the
         page or writing the PDF; errors are not swallowed here.
     """
     pdf_path = f'pdfs/{name}.pdf'

     if os.path.exists(pdf_path):
         log.info(f"PDF already exists for {name}, skipping generation.")
         return True

     log.info(f"Generating PDF for {name} from link: {link}")
     # Make sure the output directory exists before the PDF is written.
     os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
     with sync_playwright() as p:
         browser = p.chromium.launch()
         try:
             page = browser.new_page()
             page.goto(link)
             page.pdf(path=pdf_path)
         finally:
             # Close the browser even when navigation or rendering fails.
             browser.close()

     if os.path.exists(pdf_path):
         log.info(f"PDF generated for {name}")
         return True

     log.error(f"PDF generation failed for {name}")
     return False
--- a/src/pif_compiler/main.py
+++ b/src/pif_compiler/main.py
@ -8,7 +8,7 @@ import time
 from pif_compiler.functions.common_log import get_logger
 # Import your routers
-from pif_compiler.api.routes import api_echa
+from pif_compiler.api.routes import api_echa, api_cosing, common
 # Logging configuration
 logger = get_logger()
@ -123,6 +123,17 @@ app.include_router(
    tags=["ECHA"]
 )
 # Mount the COSING router under the versioned API prefix.
 app.include_router(
    api_cosing.router,
    prefix="/api/v1",
    tags=["COSING"]
 )
 # Mount the shared "common" router under the same prefix.
 # NOTE(review): the routes in routes/common.py already declare tags=["Common"]
 # themselves; confirm the tag is not listed twice in the generated OpenAPI schema.
 app.include_router(
    common.router,
    prefix="/api/v1",
    tags=["Common"]
 )
 # ==================== ROOT ENDPOINTS ====================
--- a/src/pif_compiler/services/init.py
+++ b/src/pif_compiler/services/init.py
@ -24,7 +24,7 @@ from pif_compiler.services.srv_cosing import (
 )
 # PubChem Service
-from pif_compiler.services.pubchem_service import (
+from pif_compiler.services.srv_pubchem import (
    pubchem_dap,
    clean_property_data,
 )
--- a/src/pif_compiler/services/pubchem_service.py
+++ b/src/pif_compiler/services/pubchem_service.py
@ -1,138 +0,0 @@
 import os
 from contextlib import contextmanager
 import pubchempy as pcp
 from pubchemprops.pubchemprops import get_second_layer_props
 from pif_compiler.functions.common_log import get_logger
 logger = get_logger()
@contextmanager
def temporary_certificate(cert_path):
    """
    Context manager that points the certificate environment variables
    (``REQUESTS_CA_BUNDLE`` and ``SSL_CERT_FILE``) at ``cert_path`` for the
    duration of the ``with`` block, then puts back whatever was there before.

    Args:
        cert_path (str): Path to the certificate file to use temporarily

    Example:
        # Regular request uses default certificates
        requests.get('https://api.example.com')

        # Use custom certificate only within this block
        with temporary_certificate('custom-cert.pem'):
            requests.get('https://api.requiring.custom.cert.com')

        # Back to default certificates
        requests.get('https://api.example.com')
    """
    env_names = ('REQUESTS_CA_BUNDLE', 'SSL_CERT_FILE')
    # Snapshot the current values; None marks a variable that was not set.
    saved = {env_name: os.environ.get(env_name) for env_name in env_names}
    try:
        for env_name in env_names:
            os.environ[env_name] = cert_path
        yield
    finally:
        # Restore the snapshot: re-set previous values, remove variables
        # that did not exist before entering the block.
        for env_name, previous in saved.items():
            if previous is None:
                os.environ.pop(env_name, None)
            else:
                os.environ[env_name] = previous
 def clean_property_data(api_response):
     """
     Simplifies the API response data by flattening nested structures.

     For every measurement the result keeps ``ReferenceNumber``,
     ``Description``, ``Reference`` (first element when it is a list) and
     ``Value`` (the first ``StringWithMarkup`` string when present, otherwise
     the stringified value). Entries that are missing or empty are omitted,
     so a measurement without a value no longer yields the literal ``'{}'``
     and empty ``Reference`` / ``StringWithMarkup`` lists no longer raise.

     Args:
         api_response (dict): Raw API response mapping each property name to
             a list of measurement dicts

     Returns:
         dict: Cleaned data with simplified structure
     """
     cleaned_data = {}

     for property_name, measurements in api_response.items():
         cleaned_measurements = []

         for measurement in measurements:
             entry = {
                 'ReferenceNumber': measurement.get('ReferenceNumber'),
                 'Description': measurement.get('Description', ''),
             }

             # Reference may be a list (keep the first item) or a plain string.
             reference = measurement.get('Reference')
             if isinstance(reference, list):
                 reference = reference[0] if reference else None
             entry['Reference'] = reference

             value = measurement.get('Value')
             if isinstance(value, dict):
                 markup = value.get('StringWithMarkup')
                 if markup:
                     entry['Value'] = markup[0]['String']
                 elif value and 'StringWithMarkup' not in value:
                     # Other value shapes are kept in stringified form.
                     entry['Value'] = str(value)
             elif value is not None:
                 entry['Value'] = str(value)

             # Drop missing/empty fields so consumers only see real data.
             cleaned_measurements.append({k: v for k, v in entry.items() if v})

         cleaned_data[property_name] = cleaned_measurements

     return cleaned_data
 def pubchem_dap(cas):
     '''
     Given a CAS number, look up on PubChem the information needed for the safety data sheet.

     First level properties (synonyms, CID, logP, MolecularWeight, ExactMass, TPSA)
     are fetched with pubchempy; second level properties (Melting Point,
     Dissociation Constants, pH) are fetched with pubchemprops.

     All requests are made while the bundled NCBI certificate chain is set as
     the active certificate via ``temporary_certificate``.

     args:
     cas : string

     returns:
     dict with the collected data, a "No results" message string when the
     CAS is unknown to PubChem, or None when the initial search fails.
     '''
     with temporary_certificate('src/data/ncbi-nlm-nih-gov-catena.pem'):
         try:
             # Initial search
             out = pcp.get_synonyms(cas, 'name')
             if out:
                 out = out[0]
                 output = {'CID' : out['CID'],
                         'CAS' : cas,
                         'first_pubchem_name' : out['Synonym'][0],
                         'pubchem_link' : f"https://pubchem.ncbi.nlm.nih.gov/compound/{out['CID']}"}
             else:
                 return f'No results on PubChem for {cas}'
         except Exception:
             logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem search for {cas}', exc_info=True)
             # Without the base record nothing below can work: stop here
             # instead of falling through to an UnboundLocalError on `output`.
             return None

         try:
             # First level properties
             properties = pcp.get_properties(['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'], identifier = out['CID'], namespace='cid', searchtype=None, as_dataframe=False)
             if properties:
                 output = {**output, **properties[0]}
             else:
                 return output
         except Exception:
             # Best effort: keep what was collected so far.
             logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem first level properties extraction for {cas}', exc_info=True)

         try:
             # Second level properties (Melting Point, Dissociation Constants, pH)
             second_layer_props = get_second_layer_props(output['first_pubchem_name'], ['Melting Point', 'Dissociation Constants', 'pH'])
             if second_layer_props:
                 second_layer_props = clean_property_data(second_layer_props)
                 output = {**output, **second_layer_props}
         except Exception:
             # Best effort: keep what was collected so far.
             logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem second level properties extraction (Melting Point) for {cas}', exc_info=True)

         return output
--- a/src/pif_compiler/services/srv_echa.py
+++ b/src/pif_compiler/services/srv_echa.py
@ -300,24 +300,6 @@ def parse_toxicology_html(html_content):
 #endregion
 #region PDF extraction functions
 def generate_pdf_from_toxicology_info(index: dict):
     """
     Render the toxicological information page of a substance to a PDF.

     Args:
         index: dict providing ``toxicological_information_link`` (page URL)
             and ``substance.rmlCas`` (used as the PDF file name).

     Returns:
         True when ``pdfs/{rmlCas}.pdf`` exists after rendering, False otherwise.

     Raises:
         KeyError when a required key is missing, plus whatever Playwright
         raises while loading the page or writing the PDF.
     """
     cas = index["substance"]["rmlCas"]
     pdf_path = f'pdfs/{cas}.pdf'

     with sync_playwright() as p:
         browser = p.chromium.launch()
         try:
             page = browser.new_page()
             page.goto(index['toxicological_information_link'])
             page.pdf(path=pdf_path)
         finally:
             # Close the browser even when navigation or rendering fails.
             browser.close()

     if os.path.exists(pdf_path):
         log.info(f"PDF generated for CAS {cas}")
         return True

     log.error(f"PDF generation failed for CAS {cas}")
     return False
 #endregion
 #region Orchestrator functions
 def echa_flow(cas) -> dict:
--- a/src/pif_compiler/services/srv_pubchem.py
+++ b/src/pif_compiler/services/srv_pubchem.py
@ -0,0 +1,110 @@
 import pubchempy as pcp
 from pubchemprops.pubchemprops import get_second_layer_props
 from pif_compiler.functions.common_log import get_logger
 logger = get_logger()
 def clean_property_data(api_response):
     """
     Simplifies the API response data by flattening nested structures.

     For every measurement the result keeps ``ReferenceNumber``,
     ``Description``, ``Reference`` (first element when it is a list) and
     ``Value`` (the first ``StringWithMarkup`` string when present, otherwise
     the stringified value). Entries that are missing or empty are omitted,
     so a measurement without a value no longer yields the literal ``'{}'``
     and empty ``Reference`` / ``StringWithMarkup`` lists no longer raise.

     Args:
         api_response (dict): Raw API response mapping each property name to
             a list of measurement dicts

     Returns:
         dict: Cleaned data with simplified structure
     """
     cleaned_data = {}

     for property_name, measurements in api_response.items():
         cleaned_measurements = []

         for measurement in measurements:
             entry = {
                 'ReferenceNumber': measurement.get('ReferenceNumber'),
                 'Description': measurement.get('Description', ''),
             }

             # Reference may be a list (keep the first item) or a plain string.
             reference = measurement.get('Reference')
             if isinstance(reference, list):
                 reference = reference[0] if reference else None
             entry['Reference'] = reference

             value = measurement.get('Value')
             if isinstance(value, dict):
                 markup = value.get('StringWithMarkup')
                 if markup:
                     entry['Value'] = markup[0]['String']
                 elif value and 'StringWithMarkup' not in value:
                     # Other value shapes are kept in stringified form.
                     entry['Value'] = str(value)
             elif value is not None:
                 entry['Value'] = str(value)

             # Drop missing/empty fields so consumers only see real data.
             cleaned_measurements.append({k: v for k, v in entry.items() if v})

         cleaned_data[property_name] = cleaned_measurements

     return cleaned_data
 def pubchem_dap(cas):
     '''
     Given a CAS number, look up on PubChem the information needed for the safety data sheet.

     First level properties (synonyms, CID, logP, MolecularWeight, ExactMass, TPSA)
     are fetched with pubchempy; second level properties (Melting Point,
     Dissociation Constants, pH) are fetched with pubchemprops.

     args:
     cas : string

     returns:
     dict with the collected data, a "No results" message string when PubChem
     has no match, or None when the initial search raises.
     '''
     first_level_fields = ['xlogp', 'molecular_weight', 'tpsa', 'exact_mass']
     second_level_fields = ['Melting Point', 'Dissociation Constants', 'pH']

     # Step 1: resolve the CAS number to a PubChem record.
     try:
         logger.info(f"Searching PubChem for CAS: {cas}")
         matches = pcp.get_synonyms(cas, 'name')
         if not matches:
             logger.warning(f"No results on PubChem for {cas}")
             return f'No results on PubChem for {cas}'
         record = matches[0]
         output = {
             'CID': record['CID'],
             'CAS': cas,
             'first_pubchem_name': record['Synonym'][0],
             'pubchem_link': f"https://pubchem.ncbi.nlm.nih.gov/compound/{record['CID']}",
         }
         logger.info(f"Found PubChem entry for {cas}: CID {record['CID']}")
     except Exception:
         logger.error(f'Error during pubchem search for {cas}', exc_info=True)
         return None

     # Step 2: first level properties; a failure here is logged and skipped.
     try:
         logger.debug(f"Fetching first level properties for CID {output['CID']}")
         properties = pcp.get_properties(
             first_level_fields,
             identifier=record['CID'],
             namespace='cid',
             searchtype=None,
             as_dataframe=False,
         )
         if not properties:
             logger.warning(f"No first level properties found for {cas}")
             return output
         output = {**output, **properties[0]}
         logger.debug(f"Successfully retrieved first level properties for {cas}")
     except Exception:
         logger.error(f'Error during pubchem first level properties extraction for {cas}', exc_info=True)

     # Step 3: second level properties; a failure here is logged and skipped.
     try:
         logger.debug(f"Fetching second level properties for {output['first_pubchem_name']}")
         extra_props = get_second_layer_props(output['first_pubchem_name'], second_level_fields)
         if extra_props:
             output = {**output, **clean_property_data(extra_props)}
             logger.debug(f"Successfully retrieved second level properties for {cas}")
     except Exception:
         logger.error(f'Error during pubchem second level properties extraction for {cas}', exc_info=True)

     return output
 if __name__ == "__main__":
     # Usage example: look up ethanol by its CAS number and print the result.
     print(pubchem_dap("64-17-5"))