cosmoguard-bd/old/_old/pubchem.py
2025-11-15 16:02:37 +01:00

149 lines
5.6 KiB
Python

import os
from contextlib import contextmanager
import pubchempy as pcp
from pubchemprops.pubchemprops import get_second_layer_props
import logging
# Module-wide logging setup, applied at import time: records of level INFO
# and above are appended to "echa.log" as UTF-8, using "{"-style placeholders.
logging.basicConfig(
    level=logging.INFO,
    filename="echa.log",
    filemode="a",
    encoding="utf-8",
    style="{",
    format="{asctime} - {levelname} - {message}",
    datefmt="%Y-%m-%d %H:%M",
)
@contextmanager
def temporary_certificate(cert_path):
    """
    Context manager to temporarily change the certificate used for requests.

    Needed because calling the PubChem API requires temporarily switching the
    certificate the requests are made with.

    While the block is active, the ``REQUESTS_CA_BUNDLE`` and ``SSL_CERT_FILE``
    environment variables are both set to ``cert_path``. On exit (including
    when the block raises) each variable is restored to its previous value,
    or removed if it was not set before. ``os.environ`` is process-wide, so
    the change is visible to all code in the process while the block runs.

    Args:
        cert_path (str | os.PathLike): Path to the certificate file to use
            temporarily.

    Example:
        # Regular request uses default certificates
        requests.get('https://api.example.com')

        # Use custom certificate only within this block
        with temporary_certificate('custom-cert.pem'):
            requests.get('https://api.requiring.custom.cert.com')

        # Back to default certificates
        requests.get('https://api.example.com')
    """
    # Environment values must be str; normalise path-like objects up front so
    # a bad argument fails before any variable has been touched.
    cert_path = os.fspath(cert_path)

    variables = ('REQUESTS_CA_BUNDLE', 'SSL_CERT_FILE')
    # Remember the original values (None means "was not set").
    saved = {name: os.environ.get(name) for name in variables}
    try:
        for name in variables:
            os.environ[name] = cert_path
        yield
    finally:
        for name, previous in saved.items():
            if previous is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = previous
def clean_property_data(api_response):
    """
    Simplifies the API response data by flattening nested structures.

    For every measurement the result keeps ``ReferenceNumber``,
    ``Description``, ``Reference`` (first element when it is a list) and
    ``Value`` (the first ``StringWithMarkup`` string when present, otherwise
    ``str(value)``). Fields that end up empty or missing are omitted.

    Args:
        api_response (dict): Raw API response mapping each property name to
            a list of measurement dicts.

    Returns:
        dict: Property name -> list of cleaned measurement dicts.
    """
    cleaned_data = {}

    for property_name, measurements in api_response.items():
        cleaned_measurements = []

        # A property with no measurement list (None) yields an empty list.
        for measurement in measurements or []:
            cleaned = {
                'ReferenceNumber': measurement.get('ReferenceNumber'),
                'Description': measurement.get('Description', ''),
            }

            # Reference may be a list or a plain string; keep the first entry
            # of a list and tolerate an empty one.
            if 'Reference' in measurement:
                ref = measurement['Reference']
                if isinstance(ref, list):
                    ref = ref[0] if ref else None
                cleaned['Reference'] = ref

            value = measurement.get('Value')
            if isinstance(value, dict) and 'StringWithMarkup' in value:
                markup = value['StringWithMarkup']
                cleaned['Value'] = markup[0].get('String') if markup else None
            elif value not in (None, '', {}, []):
                # Only stringify real payloads: a missing Value must not turn
                # into the literal text '{}' or 'None'.
                cleaned['Value'] = str(value)

            # Remove empty values
            cleaned = {key: val for key, val in cleaned.items() if val}
            cleaned_measurements.append(cleaned)

        cleaned_data[property_name] = cleaned_measurements

    return cleaned_data
def pubchem_dap(cas):
    """
    Given a CAS number, look up on PubChem the information for the safety
    data sheet.

    First-level properties (synonyms, CID, logP, MolecularWeight, ExactMass,
    TPSA) are extracted with PubChemPy; second-level ones (Melting Point,
    Dissociation Constants, pH) with pubchemprops.

    The lookup is best-effort: a failure while fetching first- or
    second-level properties is logged and whatever was collected so far is
    returned.

    Args:
        cas (str): CAS number to search for.

    Returns:
        dict | str: A dict with ``CID``, ``CAS``, ``first_pubchem_name`` and
        ``pubchem_link`` plus any properties retrieved, or the string
        ``'No results on PubChem for <cas>'`` when the initial search finds
        nothing or fails.
    """
    no_results = f'No results on PubChem for {cas}'

    with temporary_certificate('src/data/ncbi-nlm-nih-gov-catena.pem'):
        try:
            # Initial search
            matches = pcp.get_synonyms(cas, 'name')
            if not matches:
                return no_results
            first = matches[0]
            output = {'CID': first['CID'],
                      'CAS': cas,
                      'first_pubchem_name': first['Synonym'][0],
                      'pubchem_link': f"https://pubchem.ncbi.nlm.nih.gov/compound/{first['CID']}"}
        except Exception:
            logging.exception(
                'various_utils.pubchem.pubchem_dap(). Some error during pubchem search for %s', cas)
            # Without a CID nothing else can be looked up; bail out here
            # instead of falling through to an unbound `output`.
            return no_results

        try:
            # First-level properties
            properties = pcp.get_properties(
                ['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'],
                identifier=output['CID'], namespace='cid', searchtype=None, as_dataframe=False)
            if not properties:
                return output
            output = {**output, **properties[0]}
        except Exception:
            logging.exception(
                'various_utils.pubchem.pubchem_dap(). Some error during pubchem first level properties extraction for %s', cas)

        try:
            # Second-level properties (Melting Point, Dissociation Constants, pH)
            second_layer_props = get_second_layer_props(
                output['first_pubchem_name'], ['Melting Point', 'Dissociation Constants', 'pH'])
            if second_layer_props:
                second_layer_props = clean_property_data(second_layer_props)
                output = {**output, **second_layer_props}
        except Exception:
            logging.exception(
                'various_utils.pubchem.pubchem_dap(). Some error during pubchem second level properties extraction (Melting Point) for %s', cas)

        return output