update: new endpoints for the API (cosing, pubchem, download)

This commit is contained in:
adish-rmr 2025-12-08 10:02:44 +01:00
parent 5fd12cb7a7
commit f04d4f8b3e
8 changed files with 370 additions and 158 deletions

View file

@ -0,0 +1,222 @@
from fastapi import APIRouter, HTTPException, status
from fastapi.responses import FileResponse
from pydantic import BaseModel, Field, HttpUrl
from typing import Optional, Dict, Any
import os
from pif_compiler.functions.common_func import generate_pdf
from pif_compiler.services.srv_pubchem import pubchem_dap
from pif_compiler.functions.common_log import get_logger
logger = get_logger()
router = APIRouter()
# Request body accepted by the generate-pdf endpoint.
class GeneratePdfRequest(BaseModel):
    # Page address; declared as a plain str, so no URL validation happens at the model level.
    link: str = Field(..., description="URL of the page to convert to PDF")
    # Base name of the output file; the endpoint builds "pdfs/<name>.pdf" from it.
    name: str = Field(..., description="Name for the generated PDF file (without extension)")

    # Extra JSON-schema metadata: a sample payload for this model.
    class Config:
        json_schema_extra = {
            "example": {
                "link": "https://example.com/page",
                "name": "my_document"
            }
        }
# Response body returned by the generate-pdf endpoint.
class GeneratePdfResponse(BaseModel):
    success: bool  # True only when the PDF file is present on disk after the call
    name: str  # echoes the PDF name from the request
    message: str  # human-readable description of the outcome
    file_path: Optional[str] = None  # "pdfs/<name>.pdf" on success, None on failure
@router.post("/common/generate-pdf", response_model=GeneratePdfResponse, tags=["Common"])
async def generate_pdf_endpoint(request: GeneratePdfRequest):
    """
    Generate a PDF from a web page URL.

    This endpoint uses Playwright to:
    1. Navigate to the provided URL
    2. Render the page
    3. Generate a PDF file
    4. Save it in the 'pdfs/' directory

    If a PDF with the same name already exists, it will skip generation
    and return success immediately.

    Args:
        request: GeneratePdfRequest with the URL and desired PDF name

    Returns:
        GeneratePdfResponse with success status and file information

    Raises:
        HTTPException: 400 when the name is not a plain file name or the link
            is not an http(s) URL; 500 when generation fails unexpectedly.
    """
    # Function-scope imports keep this block self-contained.
    from urllib.parse import urlparse
    from fastapi.concurrency import run_in_threadpool

    logger.info(f"API request received to generate PDF: name='{request.name}', link='{request.link}'")

    # The name is interpolated into a filesystem path, so it must not be able
    # to leave the 'pdfs/' directory.
    if not request.name or any(ch in request.name for ch in ("/", "\\", "\x00")):
        logger.warning(f"Rejected PDF name with path characters: '{request.name}'")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid PDF name: it must be a plain file name without path separators"
        )

    # The headless browser would happily render file:// or other local
    # schemes; only remote web pages are a legitimate input here.
    try:
        scheme = urlparse(request.link).scheme
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        logger.warning(f"Rejected link with unsupported scheme: '{request.link}'")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid link: only http and https URLs are supported"
        )

    file_path = f"pdfs/{request.name}.pdf"
    try:
        # generate_pdf drives Playwright's *sync* API, which raises when it is
        # started on a thread that already runs an asyncio loop. Running it in
        # the worker pool avoids that and keeps the event loop responsive.
        result = await run_in_threadpool(generate_pdf, request.link, request.name)

        if not result:
            logger.error(f"PDF generation failed for '{request.name}'")
            return GeneratePdfResponse(
                success=False,
                name=request.name,
                message="PDF generation failed",
                file_path=None
            )

        # Double-check the file is really there before reporting success.
        if not os.path.exists(file_path):
            logger.error(f"PDF file not found after generation for '{request.name}'")
            return GeneratePdfResponse(
                success=False,
                name=request.name,
                message="PDF generation completed but file not found",
                file_path=None
            )

        logger.info(f"PDF available for '{request.name}'")
        return GeneratePdfResponse(
            success=True,
            name=request.name,
            message="PDF generated successfully or already exists",
            file_path=file_path
        )
    except Exception as e:
        logger.error(f"Error generating PDF for '{request.name}': {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal error while generating PDF: {str(e)}"
        ) from e
@router.get("/common/download-pdf/{name}", response_class=FileResponse, tags=["Common"])
async def download_pdf(name: str):
    """
    Download a previously generated PDF file.

    Args:
        name: Name of the PDF file (without extension)

    Returns:
        FileResponse with the PDF file for download

    Raises:
        HTTPException: 400 when the name contains path characters,
            404 when no such PDF file exists.
    """
    logger.info(f"API request received to download PDF: name='{name}'")

    # The name ends up inside a filesystem path: refuse anything that could
    # address a file outside the 'pdfs/' directory.
    if not name or any(ch in name for ch in ("/", "\\", "\x00")):
        logger.warning(f"Rejected PDF name with path characters: '{name}'")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid PDF name: it must be a plain file name without path separators"
        )

    file_path = f"pdfs/{name}.pdf"

    # isfile (not exists) so that a directory with a matching name is reported
    # as missing instead of failing later while streaming the response.
    if not os.path.isfile(file_path):
        logger.warning(f"PDF file not found: {file_path}")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"PDF file '{name}' not found. Please generate it first using /common/generate-pdf"
        )

    logger.info(f"Serving PDF file: {file_path}")
    return FileResponse(
        path=file_path,
        media_type="application/pdf",
        filename=f"{name}.pdf"
    )
# Request body accepted by the PubChem search endpoint.
class PubchemRequest(BaseModel):
    # CAS registry number; the endpoint passes it unchanged to pubchem_dap().
    cas: str = Field(..., description="CAS number of the substance to search for in PubChem")

    # Extra JSON-schema metadata: a sample payload for this model.
    class Config:
        json_schema_extra = {
            "example": {
                "cas": "64-17-5"
            }
        }
# Response body returned by the PubChem search endpoint.
class PubchemResponse(BaseModel):
    success: bool  # True only when pubchem_dap() returned a data dictionary
    cas: str  # echoes the CAS number from the request
    data: Optional[Dict[str, Any]] = None  # substance data on success, None otherwise
    error: Optional[str] = None  # failure description (generic text or the message returned by pubchem_dap)
@router.post("/common/pubchem", response_model=PubchemResponse, tags=["Common"])
async def search_pubchem(request: PubchemRequest):
    """
    Search for substance information in PubChem database.

    This endpoint retrieves comprehensive substance data from PubChem including:
    - **Basic info**: CID, CAS, first PubChem name, PubChem link
    - **First level properties**: XLogP, molecular weight, TPSA, exact mass
    - **Second level properties**: Melting Point, Dissociation Constants, pH

    The data is automatically cleaned and formatted for easier consumption.

    Args:
        request: PubchemRequest containing the CAS number

    Returns:
        PubchemResponse with the substance data or error information

    Raises:
        HTTPException: 500 when the lookup fails unexpectedly.
    """
    # Function-scope import keeps this block self-contained.
    from fastapi.concurrency import run_in_threadpool

    logger.info(f"API request received for PubChem search: CAS={request.cas}")
    try:
        # pubchem_dap performs several blocking lookups; running it on the
        # event loop would stall every other request until it finishes, so it
        # is delegated to the worker thread pool.
        result = await run_in_threadpool(pubchem_dap, request.cas)

        # None signals that the lookup itself failed (details are in the logs).
        if result is None:
            logger.error(f"PubChem search returned None for CAS: {request.cas}")
            return PubchemResponse(
                success=False,
                cas=request.cas,
                data=None,
                error="An error occurred while searching PubChem. Please check the logs for details."
            )

        # A string result is the "no results" message produced by pubchem_dap.
        if isinstance(result, str):
            logger.warning(f"No results found in PubChem for CAS: {request.cas}")
            return PubchemResponse(
                success=False,
                cas=request.cas,
                data=None,
                error=result
            )

        # Anything else is the data dictionary.
        logger.info(f"Successfully retrieved PubChem data for CAS: {request.cas}")
        return PubchemResponse(
            success=True,
            cas=request.cas,
            data=result,
            error=None
        )
    except Exception as e:
        logger.error(f"Error processing PubChem request for CAS {request.cas}: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal error while processing PubChem request: {str(e)}"
        ) from e
@router.get("/common/health", tags=["Common"])
async def common_health_check():
    """
    Report the health of the common functions service.

    Returns:
        A static dictionary with the overall status, the service name and
        the state of each component.
    """
    component_names = ("api", "logging", "utilities", "pubchem")
    # Every component is reported with the same fixed state.
    component_states = dict.fromkeys(component_names, "operational")
    return {
        "status": "healthy",
        "service": "common-functions",
        "components": component_states,
    }

View file

@ -0,0 +1,25 @@
from playwright.sync_api import sync_playwright
import os
from pif_compiler.functions.common_log import get_logger
log = get_logger()
def generate_pdf(link: str, name: str):
    """
    Render a web page to 'pdfs/<name>.pdf' with headless Chromium.

    Generation is skipped when the target file already exists.

    Args:
        link: Address of the page to open.
        name: Base name of the PDF file (without extension).

    Returns:
        True when the PDF exists after the call, False otherwise.
    """
    pdf_path = f'pdfs/{name}.pdf'

    # Nothing to do when a previous run already produced the file.
    if os.path.exists(pdf_path):
        log.info(f"PDF already exists for {name}, skipping generation.")
        return True

    log.info(f"Generating PDF for {name} from link: {link}")
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        tab = chromium.new_page()
        tab.goto(link)
        tab.pdf(path=pdf_path)
        chromium.close()

    # The file on disk is the only evidence of success that is checked.
    created = os.path.exists(pdf_path)
    if created:
        log.info(f"PDF generated for {name}")
    else:
        log.error(f"PDF generation failed for {name}")
    return created

View file

@ -8,7 +8,7 @@ import time
from pif_compiler.functions.common_log import get_logger from pif_compiler.functions.common_log import get_logger
# Import dei tuoi router # Import dei tuoi router
from pif_compiler.api.routes import api_echa from pif_compiler.api.routes import api_echa, api_cosing, common
# Configurazione logging # Configurazione logging
logger = get_logger() logger = get_logger()
@ -123,6 +123,17 @@ app.include_router(
tags=["ECHA"] tags=["ECHA"]
) )
app.include_router(
api_cosing.router,
prefix="/api/v1",
tags=["COSING"]
)
app.include_router(
common.router,
prefix="/api/v1",
tags=["Common"]
)
# ==================== ROOT ENDPOINTS ==================== # ==================== ROOT ENDPOINTS ====================

View file

@ -24,7 +24,7 @@ from pif_compiler.services.srv_cosing import (
) )
# PubChem Service # PubChem Service
from pif_compiler.services.pubchem_service import ( from pif_compiler.services.srv_pubchem import (
pubchem_dap, pubchem_dap,
clean_property_data, clean_property_data,
) )

View file

@ -1,138 +0,0 @@
import os
from contextlib import contextmanager
import pubchempy as pcp
from pubchemprops.pubchemprops import get_second_layer_props
from pif_compiler.functions.common_log import get_logger
logger = get_logger()
@contextmanager
def temporary_certificate(cert_path):
    """
    Point the certificate environment variables at another file for the
    duration of a ``with`` block.

    REQUESTS_CA_BUNDLE and SSL_CERT_FILE are both set to ``cert_path`` on
    entry and put back to their previous state (value or absence) on exit,
    even when the block raises.

    Args:
        cert_path (str): Path of the certificate file to use temporarily

    Example:
        with temporary_certificate('custom-cert.pem'):
            requests.get('https://api.requiring.custom.cert.com')
        # outside the block the previous settings apply again
    """
    env_names = ('REQUESTS_CA_BUNDLE', 'SSL_CERT_FILE')
    # Snapshot first; None marks a variable that was not set at all.
    previous = {env_name: os.environ.get(env_name) for env_name in env_names}
    try:
        for env_name in env_names:
            os.environ[env_name] = cert_path
        yield
    finally:
        for env_name, old_value in previous.items():
            if old_value is None:
                os.environ.pop(env_name, None)
            else:
                os.environ[env_name] = old_value
def clean_property_data(api_response):
    """
    Flatten the nested property data into plain dictionaries.

    Args:
        api_response (dict): Mapping of property name to a list of raw
            measurement dictionaries

    Returns:
        dict: Same property names, each mapped to a list of simplified
            measurements from which falsy fields have been dropped
    """
    def _flatten(entry):
        flat = {
            'ReferenceNumber': entry.get('ReferenceNumber'),
            'Description': entry.get('Description', ''),
        }

        # Reference may be a list (first item is kept) or a single value.
        if 'Reference' in entry:
            reference = entry['Reference']
            flat['Reference'] = reference[0] if isinstance(reference, list) else reference

        # Value is either a markup wrapper or something stringified as-is.
        raw_value = entry.get('Value', {})
        has_markup = isinstance(raw_value, dict) and 'StringWithMarkup' in raw_value
        if has_markup:
            flat['Value'] = raw_value['StringWithMarkup'][0]['String']
        else:
            flat['Value'] = str(raw_value)

        # Drop fields with falsy content.
        return {field: content for field, content in flat.items() if content}

    return {
        prop_name: [_flatten(entry) for entry in entries]
        for prop_name, entries in api_response.items()
    }
def pubchem_dap(cas):
    '''
    Look up on PubChem the information needed for the safety data sheet of a CAS number.

    First level data (synonyms, CID, logP, MolecularWeight, ExactMass, TPSA)
    is fetched with pubchempy; second level properties (Melting Point,
    Dissociation Constants, pH) are fetched with pubchemprops.

    Args:
        cas (str): CAS number to search for

    Returns:
        dict with the collected data, a "No results" message string when
        PubChem has no entry for the CAS number, or None when the initial
        search itself fails.
    '''
    with temporary_certificate('src/data/ncbi-nlm-nih-gov-catena.pem'):
        try:
            # Initial search
            out = pcp.get_synonyms(cas, 'name')
            if out:
                out = out[0]
                output = {'CID' : out['CID'],
                          'CAS' : cas,
                          'first_pubchem_name' : out['Synonym'][0],
                          'pubchem_link' : f"https://pubchem.ncbi.nlm.nih.gov/compound/{out['CID']}"}
            else:
                return f'No results on PubChem for {cas}'
        except Exception:
            logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem search for {cas}', exc_info=True)
            # Nothing below can work without the base record; falling through
            # used to end in an UnboundLocalError on the final return.
            return None

        try:
            # First level properties
            properties = pcp.get_properties(['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'], identifier = out['CID'], namespace='cid', searchtype=None, as_dataframe=False)
            if properties:
                output = {**output, **properties[0]}
            else:
                return output
        except Exception:
            logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem first level properties extraction for {cas}', exc_info=True)

        try:
            # Second level properties (melting point and others)
            second_layer_props = get_second_layer_props(output['first_pubchem_name'], ['Melting Point', 'Dissociation Constants', 'pH'])
            if second_layer_props:
                second_layer_props = clean_property_data(second_layer_props)
                output = {**output, **second_layer_props}
        except Exception:
            logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem second level properties extraction (Melting Point) for {cas}', exc_info=True)

        return output

View file

@ -300,24 +300,6 @@ def parse_toxicology_html(html_content):
#endregion #endregion
#region PDF extraction functions
def generate_pdf_from_toxicology_info(index: dict):
    """
    Render the toxicological information page of a substance to a PDF.

    Args:
        index: Dictionary providing 'toxicological_information_link' and
            the CAS number under index['substance']['rmlCas'].

    Returns:
        True when 'pdfs/<rmlCas>.pdf' exists after rendering, False otherwise.
    """
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        tab = chromium.new_page()
        tab.goto(index['toxicological_information_link'])
        tab.pdf(path=f'pdfs/{index["substance"]["rmlCas"]}.pdf')
        chromium.close()

    # The file on disk is the only evidence of success that is checked.
    cas_number = index['substance']['rmlCas']
    if not os.path.exists(f'pdfs/{cas_number}.pdf'):
        log.error(f"PDF generation failed for CAS {cas_number}")
        return False

    log.info(f"PDF generated for CAS {cas_number}")
    return True
#endregion
#region Orchestrator functions #region Orchestrator functions
def echa_flow(cas) -> dict: def echa_flow(cas) -> dict:

View file

@ -0,0 +1,110 @@
import pubchempy as pcp
from pubchemprops.pubchemprops import get_second_layer_props
from pif_compiler.functions.common_log import get_logger
logger = get_logger()
def _extract_value_text(value):
    """Return the display text of a raw ``Value`` entry, or None when it carries no content."""
    if value is None:
        return None
    if not isinstance(value, dict):
        return str(value)
    if 'StringWithMarkup' in value:
        markup = value['StringWithMarkup']
        # An empty markup list has no text; indexing it would raise.
        return markup[0].get('String') if markup else None
    # Other non-empty shapes are stringified as-is; an empty dict carries nothing.
    return str(value) if value else None


def clean_property_data(api_response):
    """
    Simplifies the API response data by flattening nested structures.

    Measurements without a usable value no longer get a placeholder such as
    '{}', and empty 'Reference' / 'StringWithMarkup' lists are skipped
    instead of raising IndexError.

    Args:
        api_response (dict): Raw API response mapping each property name to
            a list of measurement dictionaries

    Returns:
        dict: Cleaned data with simplified structure; falsy fields are
            omitted from every measurement
    """
    cleaned_data = {}
    for property_name, measurements in api_response.items():
        cleaned_measurements = []
        for measurement in measurements:
            cleaned_measurement = {
                'ReferenceNumber': measurement.get('ReferenceNumber'),
                'Description': measurement.get('Description', ''),
            }

            # Reference may be a list (first item is kept) or a single value.
            if 'Reference' in measurement:
                ref = measurement['Reference']
                if isinstance(ref, list):
                    ref = ref[0] if ref else None
                cleaned_measurement['Reference'] = ref

            cleaned_measurement['Value'] = _extract_value_text(measurement.get('Value'))

            # Remove empty values
            cleaned_measurements.append(
                {k: v for k, v in cleaned_measurement.items() if v}
            )
        cleaned_data[property_name] = cleaned_measurements
    return cleaned_data
def pubchem_dap(cas):
    '''
    Look up on PubChem the information needed for the safety data sheet of a CAS number.

    First level data (synonyms, CID, logP, MolecularWeight, ExactMass, TPSA)
    is fetched with pubchempy; second level properties (Melting Point,
    Dissociation Constants, pH) are fetched with pubchemprops.

    Args:
        cas (str): CAS number to search for

    Returns:
        dict with the collected data, a "No results" message string when
        PubChem has no entry for the CAS number, or None when the initial
        search fails.
    '''
    try:
        # Initial search
        logger.info(f"Searching PubChem for CAS: {cas}")
        matches = pcp.get_synonyms(cas, 'name')
        if not matches:
            logger.warning(f"No results on PubChem for {cas}")
            return f'No results on PubChem for {cas}'

        hit = matches[0]
        cid = hit['CID']
        output = {
            'CID': cid,
            'CAS': cas,
            'first_pubchem_name': hit['Synonym'][0],
            'pubchem_link': f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}",
        }
        logger.info(f"Found PubChem entry for {cas}: CID {cid}")
    except Exception:
        logger.error(f'Error during pubchem search for {cas}', exc_info=True)
        return None

    try:
        # First level properties
        logger.debug(f"Fetching first level properties for CID {output['CID']}")
        first_level = pcp.get_properties(
            ['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'],
            identifier=cid,
            namespace='cid',
            searchtype=None,
            as_dataframe=False,
        )
        if not first_level:
            # No first level data: the base record is returned as it is.
            logger.warning(f"No first level properties found for {cas}")
            return output
        output = {**output, **first_level[0]}
        logger.debug(f"Successfully retrieved first level properties for {cas}")
    except Exception:
        logger.error(f'Error during pubchem first level properties extraction for {cas}', exc_info=True)

    try:
        # Second level properties (melting point and others)
        logger.debug(f"Fetching second level properties for {output['first_pubchem_name']}")
        wanted = ['Melting Point', 'Dissociation Constants', 'pH']
        second_level = get_second_layer_props(output['first_pubchem_name'], wanted)
        if second_level:
            output = {**output, **clean_property_data(second_level)}
            logger.debug(f"Successfully retrieved second level properties for {cas}")
    except Exception:
        logger.error(f'Error during pubchem second level properties extraction for {cas}', exc_info=True)

    return output
if __name__ == "__main__":
    # Usage example: look up ethanol by its CAS number and print the result.
    print(pubchem_dap("64-17-5"))