update: new endpoint for the api (cosing, pubchem, download)
This commit is contained in:
parent
5fd12cb7a7
commit
f04d4f8b3e
8 changed files with 370 additions and 158 deletions
222
src/pif_compiler/api/routes/common.py
Normal file
222
src/pif_compiler/api/routes/common.py
Normal file
|
|
@ -0,0 +1,222 @@
|
|||
from fastapi import APIRouter, HTTPException, status
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel, Field, HttpUrl
|
||||
from typing import Optional, Dict, Any
|
||||
import os
|
||||
|
||||
from pif_compiler.functions.common_func import generate_pdf
|
||||
from pif_compiler.services.srv_pubchem import pubchem_dap
|
||||
from pif_compiler.functions.common_log import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class GeneratePdfRequest(BaseModel):
    # Request body of the generate-pdf endpoint.
    # (Documented with comments rather than a docstring so the generated
    # schema description stays exactly as before.)

    # Address of the web page to render.
    link: str = Field(
        ...,
        description="URL of the page to convert to PDF",
    )
    # Base name of the output file; the endpoint appends ".pdf" itself.
    name: str = Field(
        ...,
        description="Name for the generated PDF file (without extension)",
    )

    class Config:
        # Sample payload attached to the model's JSON schema.
        json_schema_extra = {
            "example": {
                "link": "https://example.com/page",
                "name": "my_document",
            },
        }
|
||||
|
||||
|
||||
class GeneratePdfResponse(BaseModel):
    # Response body of the generate-pdf endpoint.

    # Whether a PDF for `name` is available once the request completes.
    success: bool
    # Echo of the requested PDF name (without extension).
    name: str
    # Human-readable outcome description.
    message: str
    # Relative path of the PDF; left as None when no file is available.
    file_path: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
@router.post("/common/generate-pdf", response_model=GeneratePdfResponse, tags=["Common"])
async def generate_pdf_endpoint(request: GeneratePdfRequest):
    """
    Generate a PDF from a web page URL.

    This endpoint uses Playwright to:
    1. Navigate to the provided URL
    2. Render the page
    3. Generate a PDF file
    4. Save it in the 'pdfs/' directory

    If a PDF with the same name already exists, it will skip generation
    and return success immediately.

    Args:
        request: GeneratePdfRequest with the URL and desired PDF name

    Returns:
        GeneratePdfResponse with success status and file information

    Raises:
        HTTPException: 400 if the name is empty or contains a path separator,
            500 if the generation raises an unexpected error.
    """
    # Imported locally so this handler does not depend on edits elsewhere in the file.
    from fastapi.concurrency import run_in_threadpool

    logger.info(f"API request received to generate PDF: name='{request.name}', link='{request.link}'")

    # The name is interpolated into a filesystem path: refuse anything that
    # could escape the 'pdfs/' directory. Done outside the try block so the
    # 400 is not converted into a 500 by the generic handler below.
    if not request.name or any(sep in request.name for sep in ("/", "\\", "\x00")):
        logger.warning(f"Rejected unsafe PDF name: '{request.name}'")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid PDF name: it must be a plain file name without path separators"
        )

    try:
        # generate_pdf drives Playwright's *sync* API, which refuses to run
        # inside a running asyncio loop; a worker thread has no running loop
        # and also keeps the event loop free while the page renders.
        result = await run_in_threadpool(generate_pdf, request.link, request.name)
    except Exception as e:
        logger.error(f"Error generating PDF for '{request.name}': {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal error while generating PDF: {str(e)}"
        ) from e

    if not result:
        logger.error(f"PDF generation failed for '{request.name}'")
        return GeneratePdfResponse(
            success=False,
            name=request.name,
            message="PDF generation failed",
            file_path=None
        )

    file_path = f"pdfs/{request.name}.pdf"

    # Double-check the file is really on disk before advertising its path.
    if not os.path.exists(file_path):
        logger.error(f"PDF file not found after generation for '{request.name}'")
        return GeneratePdfResponse(
            success=False,
            name=request.name,
            message="PDF generation completed but file not found",
            file_path=None
        )

    logger.info(f"PDF available for '{request.name}'")
    return GeneratePdfResponse(
        success=True,
        name=request.name,
        message="PDF generated successfully or already exists",
        file_path=file_path
    )
|
||||
|
||||
|
||||
@router.get("/common/download-pdf/{name}", response_class=FileResponse, tags=["Common"])
async def download_pdf(name: str):
    """
    Download a previously generated PDF file.

    Args:
        name: Name of the PDF file (without extension)

    Returns:
        FileResponse with the PDF file for download

    Raises:
        HTTPException: 400 if the name is empty or contains a path separator,
            404 if no PDF with that name exists in the 'pdfs/' directory.
    """
    logger.info(f"API request received to download PDF: name='{name}'")

    # The name ends up in a filesystem path: never let it point outside 'pdfs/'.
    if not name or any(sep in name for sep in ("/", "\\", "\x00")):
        logger.warning(f"Rejected unsafe PDF name: '{name}'")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid PDF name: it must be a plain file name without path separators"
        )

    file_path = f"pdfs/{name}.pdf"

    # isfile (not exists): a directory with a matching name cannot be served.
    if not os.path.isfile(file_path):
        logger.warning(f"PDF file not found: {file_path}")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"PDF file '{name}' not found. Please generate it first using /common/generate-pdf"
        )

    logger.info(f"Serving PDF file: {file_path}")
    return FileResponse(
        path=file_path,
        media_type="application/pdf",
        filename=f"{name}.pdf"
    )
|
||||
|
||||
|
||||
class PubchemRequest(BaseModel):
    # Request body of the PubChem search endpoint.

    # CAS registry number used as the search key.
    cas: str = Field(
        ...,
        description="CAS number of the substance to search for in PubChem",
    )

    class Config:
        # Sample payload attached to the model's JSON schema.
        json_schema_extra = {
            "example": {
                "cas": "64-17-5",
            },
        }
|
||||
|
||||
|
||||
class PubchemResponse(BaseModel):
    # Response body of the PubChem search endpoint.

    # True only when substance data was retrieved.
    success: bool
    # Echo of the CAS number that was searched.
    cas: str
    # Substance data on success, otherwise None.
    data: Optional[Dict[str, Any]] = Field(default=None)
    # Failure description on error, otherwise None.
    error: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
@router.post("/common/pubchem", response_model=PubchemResponse, tags=["Common"])
async def search_pubchem(request: PubchemRequest):
    """
    Search for substance information in PubChem database.

    This endpoint retrieves comprehensive substance data from PubChem including:
    - **Basic info**: CID, CAS, first PubChem name, PubChem link
    - **First level properties**: XLogP, molecular weight, TPSA, exact mass
    - **Second level properties**: Melting Point, Dissociation Constants, pH

    The data is automatically cleaned and formatted for easier consumption.

    Args:
        request: PubchemRequest containing the CAS number

    Returns:
        PubchemResponse with the substance data or error information

    Raises:
        HTTPException: 500 if the lookup raises an unexpected error.
    """
    # Imported locally so this handler does not depend on edits elsewhere in the file.
    from fastapi.concurrency import run_in_threadpool

    logger.info(f"API request received for PubChem search: CAS={request.cas}")

    try:
        # pubchem_dap performs several blocking network calls; running it in
        # the threadpool keeps the event loop free for other requests.
        result = await run_in_threadpool(pubchem_dap, request.cas)
    except Exception as e:
        logger.error(f"Error processing PubChem request for CAS {request.cas}: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal error while processing PubChem request: {str(e)}"
        ) from e

    # None signals that the service hit an error (already logged there).
    if result is None:
        logger.error(f"PubChem search returned None for CAS: {request.cas}")
        return PubchemResponse(
            success=False,
            cas=request.cas,
            data=None,
            error="An error occurred while searching PubChem. Please check the logs for details."
        )

    # A plain string is the service's "no results" message.
    if isinstance(result, str):
        logger.warning(f"No results found in PubChem for CAS: {request.cas}")
        return PubchemResponse(
            success=False,
            cas=request.cas,
            data=None,
            error=result
        )

    # Anything else is the substance data dictionary.
    logger.info(f"Successfully retrieved PubChem data for CAS: {request.cas}")
    return PubchemResponse(
        success=True,
        cas=request.cas,
        data=result,
        error=None
    )
|
||||
|
||||
|
||||
@router.get("/common/health", tags=["Common"])
async def common_health_check():
    """
    Health check endpoint for the common functions service.

    Returns a static payload that reports every component as operational;
    no component is actually probed.
    """
    component_names = ("api", "logging", "utilities", "pubchem")
    payload = {
        "status": "healthy",
        "service": "common-functions",
        "components": dict.fromkeys(component_names, "operational"),
    }
    return payload
|
||||
25
src/pif_compiler/functions/common_func.py
Normal file
25
src/pif_compiler/functions/common_func.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
from playwright.sync_api import sync_playwright
|
||||
import os
|
||||
|
||||
from pif_compiler.functions.common_log import get_logger
|
||||
|
||||
log = get_logger()
|
||||
|
||||
def generate_pdf(link: str, name: str):
    """
    Render a web page to 'pdfs/<name>.pdf' with headless Chromium.

    Generation is skipped when the target file is already present.

    Args:
        link: address of the page to render
        name: base name of the PDF file (without extension)

    Returns:
        True if the PDF exists on disk when the function ends, False otherwise.
    """
    pdf_path = f'pdfs/{name}.pdf'

    # Nothing to do when a previous run already produced the file.
    if os.path.exists(pdf_path):
        log.info(f"PDF already exists for {name}, skipping generation.")
        return True

    log.info(f"Generating PDF for {name} from link: {link}")
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        tab = chromium.new_page()
        tab.goto(link)
        tab.pdf(path=pdf_path)
        chromium.close()

    # Trust the filesystem, not the absence of an exception.
    created = os.path.exists(pdf_path)
    if created:
        log.info(f"PDF generated for {name}")
    else:
        log.error(f"PDF generation failed for {name}")
    return created
|
||||
|
|
@ -8,7 +8,7 @@ import time
|
|||
from pif_compiler.functions.common_log import get_logger
|
||||
|
||||
# Import dei tuoi router
|
||||
from pif_compiler.api.routes import api_echa
|
||||
from pif_compiler.api.routes import api_echa, api_cosing, common
|
||||
|
||||
# Configurazione logging
|
||||
logger = get_logger()
|
||||
|
|
@ -123,6 +123,17 @@ app.include_router(
|
|||
tags=["ECHA"]
|
||||
)
|
||||
|
||||
app.include_router(
|
||||
api_cosing.router,
|
||||
prefix="/api/v1",
|
||||
tags=["COSING"]
|
||||
)
|
||||
|
||||
app.include_router(
|
||||
common.router,
|
||||
prefix="/api/v1",
|
||||
tags=["Common"]
|
||||
)
|
||||
|
||||
# ==================== ROOT ENDPOINTS ====================
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ from pif_compiler.services.srv_cosing import (
|
|||
)
|
||||
|
||||
# PubChem Service
|
||||
from pif_compiler.services.pubchem_service import (
|
||||
from pif_compiler.services.srv_pubchem import (
|
||||
pubchem_dap,
|
||||
clean_property_data,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,138 +0,0 @@
|
|||
import os
|
||||
from contextlib import contextmanager
|
||||
import pubchempy as pcp
|
||||
from pubchemprops.pubchemprops import get_second_layer_props
|
||||
|
||||
from pif_compiler.functions.common_log import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
@contextmanager
def temporary_certificate(cert_path):
    """
    Context manager that points REQUESTS_CA_BUNDLE and SSL_CERT_FILE at
    `cert_path` for the duration of the block, then puts both environment
    variables back the way they were (removing them if they were unset).

    Args:
        cert_path (str): Path to the certificate file to use temporarily

    Example:
        # Regular request uses default certificates
        requests.get('https://api.example.com')

        # Use custom certificate only within this block
        with temporary_certificate('custom-cert.pem'):
            requests.get('https://api.requiring.custom.cert.com')

        # Back to default certificates
        requests.get('https://api.example.com')
    """
    env_names = ('REQUESTS_CA_BUNDLE', 'SSL_CERT_FILE')

    # Snapshot the current values; None marks "was not set".
    saved = {env_name: os.environ.get(env_name) for env_name in env_names}

    try:
        for env_name in env_names:
            os.environ[env_name] = cert_path
        yield
    finally:
        # Restore on every exit path, including exceptions.
        for env_name in env_names:
            previous = saved[env_name]
            if previous is None:
                os.environ.pop(env_name, None)
            else:
                os.environ[env_name] = previous
|
||||
|
||||
def clean_property_data(api_response):
    """
    Simplifies the API response data by flattening nested structures.

    For every measurement only 'ReferenceNumber', 'Description', 'Reference'
    (first entry when it is a list) and 'Value' (first 'StringWithMarkup'
    string, or the string form of any other non-empty value) are kept.
    Fields that are missing or empty are omitted from the result.

    Args:
        api_response (dict): Raw API response mapping each property name to
            a list of measurement dicts

    Returns:
        dict: Cleaned data with simplified structure
    """
    cleaned_data = {}

    for property_name, measurements in api_response.items():
        cleaned_measurements = []

        for measurement in measurements:
            cleaned_measurement = {
                'ReferenceNumber': measurement.get('ReferenceNumber'),
                'Description': measurement.get('Description', ''),
            }

            # Reference may be a list or a plain string; an empty list
            # simply means "no reference" instead of raising IndexError.
            ref = measurement.get('Reference')
            if isinstance(ref, list):
                ref = ref[0] if ref else None
            cleaned_measurement['Reference'] = ref

            value = measurement.get('Value')
            if isinstance(value, dict) and 'StringWithMarkup' in value:
                markup = value['StringWithMarkup']
                if markup:
                    cleaned_measurement['Value'] = markup[0].get('String', '')
            elif value or value == 0:
                # Stringify only real values: a missing/None/empty Value used
                # to become the literal text '{}' or 'None' and survived the
                # empty-value filter below. Numeric zero is still meaningful.
                cleaned_measurement['Value'] = str(value)

            # Remove empty values
            cleaned_measurements.append(
                {k: v for k, v in cleaned_measurement.items() if v}
            )

        cleaned_data[property_name] = cleaned_measurements

    return cleaned_data
|
||||
|
||||
def pubchem_dap(cas):
    '''
    Given a CAS number, look up on PubChem the information needed for the
    safety data sheet.

    First level properties (synonyms, cid, logP, MolecularWeight, ExactMass,
    TPSA) are extracted with Pubchempy; second level ones (Melting Point,
    Dissociation Constants, pH) with pubchemprops.

    args:
        cas : string

    returns:
        dict with the collected data, a "No results ..." string when PubChem
        has no entry for the CAS, or None when the initial search fails.
    '''
    with temporary_certificate('src/data/ncbi-nlm-nih-gov-catena.pem'):
        try:
            # Initial search
            out = pcp.get_synonyms(cas, 'name')
            if out:
                out = out[0]
                output = {'CID' : out['CID'],
                          'CAS' : cas,
                          'first_pubchem_name' : out['Synonym'][0],
                          'pubchem_link' : f"https://pubchem.ncbi.nlm.nih.gov/compound/{out['CID']}"}
            else:
                return f'No results on PubChem for {cas}'

        except Exception:
            logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem search for {cas}', exc_info=True)
            # Without the base record the steps below cannot work; falling
            # through used to end in an UnboundLocalError on `output`.
            return None

        try:
            # First level properties lookup
            properties = pcp.get_properties(['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'], identifier = out['CID'], namespace='cid', searchtype=None, as_dataframe=False)
            if properties:
                output = {**output, **properties[0]}
            else:
                return output
        except Exception:
            # Best effort: keep what was collected so far.
            logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem first level properties extraction for {cas}', exc_info=True)

        try:
            # Second level properties lookup (Melting Point and others)
            second_layer_props = get_second_layer_props(output['first_pubchem_name'], ['Melting Point', 'Dissociation Constants', 'pH'])
            if second_layer_props:
                second_layer_props = clean_property_data(second_layer_props)
                output = {**output, **second_layer_props}
        except Exception:
            # Best effort: keep what was collected so far.
            logger.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem second level properties extraction (Melting Point) for {cas}', exc_info=True)

        return output
|
||||
|
|
@ -300,24 +300,6 @@ def parse_toxicology_html(html_content):
|
|||
|
||||
#endregion
|
||||
|
||||
#region PDF extraction functions
|
||||
|
||||
def generate_pdf_from_toxicology_info(index: dict):
    """
    Render the toxicological information page of a substance to PDF.

    The page at index['toxicological_information_link'] is opened in headless
    Chromium and saved as 'pdfs/<rmlCas>.pdf', where rmlCas is read from
    index['substance'].

    Returns:
        True if the PDF exists on disk afterwards, False otherwise.
    """
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        tab = chromium.new_page()
        tab.goto(index['toxicological_information_link'])
        pdf_path = f'pdfs/{index["substance"]["rmlCas"]}.pdf'
        tab.pdf(path=pdf_path)
        chromium.close()

    # Trust the filesystem, not the absence of an exception.
    created = os.path.exists(pdf_path)
    if created:
        log.info(f"PDF generated for CAS {index['substance']['rmlCas']}")
    else:
        log.error(f"PDF generation failed for CAS {index['substance']['rmlCas']}")
    return created
|
||||
|
||||
#endregion
|
||||
|
||||
#region Orchestrator functions
|
||||
|
||||
def echa_flow(cas) -> dict:
|
||||
|
|
|
|||
110
src/pif_compiler/services/srv_pubchem.py
Normal file
110
src/pif_compiler/services/srv_pubchem.py
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
import pubchempy as pcp
|
||||
from pubchemprops.pubchemprops import get_second_layer_props
|
||||
|
||||
from pif_compiler.functions.common_log import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
def clean_property_data(api_response):
    """
    Simplifies the API response data by flattening nested structures.

    For every measurement only 'ReferenceNumber', 'Description', 'Reference'
    (first entry when it is a list) and 'Value' (first 'StringWithMarkup'
    string, or the string form of any other non-empty value) are kept.
    Fields that are missing or empty are omitted from the result.

    Args:
        api_response (dict): Raw API response mapping each property name to
            a list of measurement dicts

    Returns:
        dict: Cleaned data with simplified structure
    """
    cleaned_data = {}

    for property_name, measurements in api_response.items():
        cleaned_measurements = []

        for measurement in measurements:
            cleaned_measurement = {
                'ReferenceNumber': measurement.get('ReferenceNumber'),
                'Description': measurement.get('Description', ''),
            }

            # Reference may be a list or a plain string; an empty list
            # simply means "no reference" instead of raising IndexError.
            ref = measurement.get('Reference')
            if isinstance(ref, list):
                ref = ref[0] if ref else None
            cleaned_measurement['Reference'] = ref

            value = measurement.get('Value')
            if isinstance(value, dict) and 'StringWithMarkup' in value:
                markup = value['StringWithMarkup']
                if markup:
                    cleaned_measurement['Value'] = markup[0].get('String', '')
            elif value or value == 0:
                # Stringify only real values: a missing/None/empty Value used
                # to become the literal text '{}' or 'None' and survived the
                # empty-value filter below. Numeric zero is still meaningful.
                cleaned_measurement['Value'] = str(value)

            # Remove empty values
            cleaned_measurements.append(
                {k: v for k, v in cleaned_measurement.items() if v}
            )

        cleaned_data[property_name] = cleaned_measurements

    return cleaned_data
|
||||
|
||||
def pubchem_dap(cas):
    '''
    Given a CAS number, look up on PubChem the information needed for the
    safety data sheet.

    First level properties (synonyms, cid, logP, MolecularWeight, ExactMass,
    TPSA) are extracted with Pubchempy; second level ones (Melting Point,
    Dissociation Constants, pH) with pubchemprops.

    args:
        cas : string

    returns:
        dict with the collected data, a "No results ..." string when PubChem
        has no entry for the CAS, or None when the initial search fails.
    '''
    # Step 1: initial search; everything else depends on this record.
    try:
        logger.info(f"Searching PubChem for CAS: {cas}")
        matches = pcp.get_synonyms(cas, 'name')
        if not matches:
            logger.warning(f"No results on PubChem for {cas}")
            return f'No results on PubChem for {cas}'

        hit = matches[0]
        cid = hit['CID']
        record = {
            'CID': cid,
            'CAS': cas,
            'first_pubchem_name': hit['Synonym'][0],
            'pubchem_link': f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}",
        }
        logger.info(f"Found PubChem entry for {cas}: CID {cid}")
    except Exception:
        logger.error(f'Error during pubchem search for {cas}', exc_info=True)
        return None

    # Step 2: first level properties. An empty answer ends the lookup with
    # what was collected so far; an exception is logged and step 3 still runs.
    try:
        logger.debug(f"Fetching first level properties for CID {cid}")
        first_level = pcp.get_properties(
            ['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'],
            identifier=cid,
            namespace='cid',
            searchtype=None,
            as_dataframe=False,
        )
        if not first_level:
            logger.warning(f"No first level properties found for {cas}")
            return record
        record = {**record, **first_level[0]}
        logger.debug(f"Successfully retrieved first level properties for {cas}")
    except Exception:
        logger.error(f'Error during pubchem first level properties extraction for {cas}', exc_info=True)

    # Step 3: second level properties, looked up by name (best effort).
    try:
        compound_name = record['first_pubchem_name']
        logger.debug(f"Fetching second level properties for {compound_name}")
        second_level = get_second_layer_props(
            compound_name,
            ['Melting Point', 'Dissociation Constants', 'pH'],
        )
        if second_level:
            record = {**record, **clean_property_data(second_level)}
            logger.debug(f"Successfully retrieved second level properties for {cas}")
    except Exception:
        logger.error(f'Error during pubchem second level properties extraction for {cas}', exc_info=True)

    return record
|
||||
|
||||
if __name__ == "__main__":
    # Usage example: look up ethanol by its CAS number and print the result.
    print(pubchem_dap("64-17-5"))
|
||||
Loading…
Reference in a new issue