149 lines
5.6 KiB
Python
149 lines
5.6 KiB
Python
|
|
import os
|
|
from contextlib import contextmanager
|
|
import pubchempy as pcp
|
|
from pubchemprops.pubchemprops import get_second_layer_props
|
|
import logging
|
|
|
|
# Configure the root logger once at import time: INFO and above are appended
# to "echa.log" (UTF-8), using "{"-style format fields and a timestamp with
# minute resolution.
logging.basicConfig(
    level=logging.INFO,
    filename="echa.log",
    filemode="a",
    encoding="utf-8",
    style="{",
    format="{asctime} - {levelname} - {message}",
    datefmt="%Y-%m-%d %H:%M",
)
|
|
|
|
@contextmanager
def temporary_certificate(cert_path):
    """
    Context manager to temporarily change the certificate used for requests.

    Calling the PubChem API requires temporarily switching the certificate
    the requests are made with. This is done by pointing the
    ``REQUESTS_CA_BUNDLE`` and ``SSL_CERT_FILE`` environment variables at
    ``cert_path`` for the duration of the ``with`` block. On exit, including
    when the block raises, each variable is restored to its previous value,
    or removed if it was not set before.

    The change is made through ``os.environ``, so it is visible to the whole
    process while the block is active.

    Args:
        cert_path (str | os.PathLike): Path to the certificate file to use
            temporarily.

    Raises:
        TypeError: If ``cert_path`` is neither a string nor a path-like
            object.

    Example:
        # Regular request uses default certificates
        requests.get('https://api.example.com')

        # Use custom certificate only within this block
        with temporary_certificate('custom-cert.pem'):
            requests.get('https://api.requiring.custom.cert.com')

        # Back to default certificates
        requests.get('https://api.example.com')
    """
    # os.environ only accepts str values; normalise path-like objects
    # (e.g. pathlib.Path) up front so they are supported too.
    cert_value = os.fspath(cert_path)

    env_names = ('REQUESTS_CA_BUNDLE', 'SSL_CERT_FILE')

    # Remember the current values (None means "was not set").
    previous_values = {name: os.environ.get(name) for name in env_names}

    try:
        for name in env_names:
            os.environ[name] = cert_value
        yield
    finally:
        for name, previous in previous_values.items():
            if previous is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = previous
|
|
|
|
def clean_property_data(api_response):
    """
    Simplifies the API response data by flattening nested structures.

    For every measurement the following keys are produced when they carry a
    non-empty value:

    * ``ReferenceNumber`` and ``Description``: copied as-is.
    * ``Reference``: the first element when it is a list, otherwise the value
      itself.
    * ``Value``: the ``String`` of the first ``StringWithMarkup`` entry when
      present; otherwise ``str()`` of the raw value. A missing, ``None`` or
      empty-dict ``Value`` is omitted rather than turned into the literal
      text ``'{}'`` / ``'None'``.

    Empty lists for ``Reference`` or ``StringWithMarkup`` are treated as
    "no data" instead of raising ``IndexError``.

    Args:
        api_response (dict): Raw API response mapping each property name to
            a list of measurement dicts (``None`` is treated as an empty
            list).

    Returns:
        dict: Cleaned data with simplified structure, mapping each property
        name to a list of flattened measurement dicts.
    """
    cleaned_data = {}

    for property_name, measurements in api_response.items():
        cleaned_measurements = []

        for measurement in measurements or []:
            cleaned_measurement = {
                'ReferenceNumber': measurement.get('ReferenceNumber'),
                'Description': measurement.get('Description', ''),
            }

            # Handle Reference field: may be a list of strings or a string.
            if 'Reference' in measurement:
                ref = measurement['Reference']
                if isinstance(ref, list):
                    ref = ref[0] if ref else None
                cleaned_measurement['Reference'] = ref

            # Handle Value field.
            value = measurement.get('Value')
            if isinstance(value, dict) and 'StringWithMarkup' in value:
                markup = value['StringWithMarkup']
                first_entry = markup[0] if markup else None
                if isinstance(first_entry, dict):
                    cleaned_measurement['Value'] = first_entry.get('String')
            elif value is not None and value != {}:
                # Any other payload is kept in its string form.
                cleaned_measurement['Value'] = str(value)

            # Remove empty values
            cleaned_measurement = {k: v for k, v in cleaned_measurement.items() if v}

            cleaned_measurements.append(cleaned_measurement)

        cleaned_data[property_name] = cleaned_measurements

    return cleaned_data
|
|
|
|
def pubchem_dap(cas):
    """
    Given a CAS number, look up the safety-data-sheet information on PubChem.

    First-level properties (synonyms, CID, logP, MolecularWeight, ExactMass,
    TPSA) are retrieved with PubChemPy. Second-level properties
    ('Melting Point', 'Dissociation Constants', 'pH') are retrieved with
    pubchemprops and flattened with ``clean_property_data``.

    All requests are made inside ``temporary_certificate`` so the bundled
    NCBI certificate chain is used. Failures in the property-extraction
    steps are logged and skipped, so a partial result is still returned.

    Args:
        cas (str): CAS number to search for.

    Returns:
        dict | str: On success, a dict with ``CID``, ``CAS``,
        ``first_pubchem_name`` and ``pubchem_link``, merged with whatever
        properties the two extraction steps returned. If the search yields
        nothing, the string ``'No results on PubChem for <cas>'``. If the
        initial search itself fails, the error is logged and the string
        ``'Error during PubChem search for <cas>'`` is returned.
    """
    with temporary_certificate('src/data/ncbi-nlm-nih-gov-catena.pem'):
        try:
            # Initial search
            out = pcp.get_synonyms(cas, 'name')
            if not out:
                return f'No results on PubChem for {cas}'

            out = out[0]
            output = {'CID': out['CID'],
                      'CAS': cas,
                      'first_pubchem_name': out['Synonym'][0],
                      'pubchem_link': f"https://pubchem.ncbi.nlm.nih.gov/compound/{out['CID']}"}

        except Exception:
            logging.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem search for {cas}', exc_info=True)
            # Without a CID there is nothing to build on. Stop here instead of
            # falling through to the later steps, which would leave `output`
            # unbound and end in an UnboundLocalError at the final return.
            return f'Error during PubChem search for {cas}'

        try:
            # First-level properties lookup
            properties = pcp.get_properties(['xlogp', 'molecular_weight', 'tpsa', 'exact_mass'], identifier=out['CID'], namespace='cid', searchtype=None, as_dataframe=False)
            if properties:
                output = {**output, **properties[0]}
            else:
                # NOTE(review): an empty result skips the second-level lookup
                # entirely, whereas an exception here does not. Behaviour kept
                # as in the original; confirm this asymmetry is intended.
                return output
        except Exception:
            logging.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem first level properties extraction for {cas}', exc_info=True)

        try:
            # Second-level properties lookup (melting point and others)
            second_layer_props = get_second_layer_props(output['first_pubchem_name'], ['Melting Point', 'Dissociation Constants', 'pH'])
            if second_layer_props:
                second_layer_props = clean_property_data(second_layer_props)
                output = {**output, **second_layer_props}
        except Exception:
            logging.error(f'various_utils.pubchem.pubchem_dap(). Some error during pubchem second level properties extraction (Melting Point) for {cas}', exc_info=True)

        return output
|