# cosmoguard-bd/tests/test_echa_find.py
#
# 857 lines
# 29 KiB
# Python

"""
Tests for ECHA Find Service.

Test coverage:
- search_dossier: Complete workflow for searching ECHA dossiers
  - Substance search (by CAS, EC, rmlName)
  - Dossier retrieval (Active/Inactive)
  - HTML parsing for toxicology sections
  - Error handling and edge cases
"""
import pytest
from unittest.mock import Mock, patch, MagicMock
from datetime import datetime
from pif_compiler.services.echa_find import search_dossier
class TestSearchDossierSubstanceSearch:
    """Test the initial substance search phase of search_dossier.

    Every test patches ``requests.get`` as looked up from
    ``pif_compiler.services.echa_find`` and feeds it canned responses.
    The decorator-provided mock is configured directly; no second,
    nested ``patch`` of the same target is needed.
    """

    # Index page carrying all three toxicology section anchors.
    _FULL_INDEX_HTML = (
        "\n"
        "<html>\n"
        '<div id="id_7_Toxicologicalinformation">\n'
        '<a href="tox_summary_001"></a>\n'
        "</div>\n"
        '<div id="id_72_AcuteToxicity">\n'
        '<a href="acute_tox_001"></a>\n'
        "</div>\n"
        '<div id="id_75_Repeateddosetoxicity">\n'
        '<a href="repeated_dose_001"></a>\n'
        "</div>\n"
        "</html>\n"
    )

    # Index page carrying only the toxicological-information anchor.
    _SUMMARY_INDEX_HTML = (
        "<html><div id='id_7_Toxicologicalinformation'>"
        "<a href='tox_001'></a></div></html>"
    )

    @staticmethod
    def _json_response(payload):
        """Build a mock response whose ``json()`` returns ``payload``."""
        response = Mock()
        response.json.return_value = payload
        return response

    def _workflow_responses(self, index_html, rml_name="Test Substance"):
        """Return the three responses consumed by a successful lookup.

        The list is ordered the way the tests expect the requests to be
        issued: substance search, dossier list, then the index.html page.

        Args:
            index_html: Text served as the dossier index page.
            rml_name: Value of ``rmlName`` in the single substance hit.
        """
        substance_search = self._json_response({
            "items": [{
                "substanceIndex": {
                    "rmlId": "100.000.001",
                    "rmlName": rml_name,
                    "rmlCas": "50-00-0",
                    "rmlEc": "200-001-8",
                }
            }]
        })
        dossier_list = self._json_response({
            "items": [{
                "assetExternalId": "abc123",
                "rootKey": "key123",
                "lastUpdatedDate": "2024-01-15T10:30:00Z",
            }]
        })
        index_page = Mock()
        index_page.text = index_html
        return [substance_search, dossier_list, index_page]

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_successful_cas_search(self, mock_get):
        """Test successful search by CAS number."""
        mock_get.side_effect = self._workflow_responses(self._FULL_INDEX_HTML)

        result = search_dossier("50-00-0", input_type="rmlCas")

        assert result is not False
        assert result["rmlCas"] == "50-00-0"
        assert result["rmlName"] == "Test Substance"
        assert result["rmlId"] == "100.000.001"
        assert result["rmlEc"] == "200-001-8"

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_successful_ec_search(self, mock_get):
        """Test successful search by EC number."""
        mock_get.side_effect = self._workflow_responses(self._SUMMARY_INDEX_HTML)

        result = search_dossier("200-001-8", input_type="rmlEc")

        assert result is not False
        assert result["rmlEc"] == "200-001-8"

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_successful_name_search(self, mock_get):
        """Test successful search by substance name."""
        mock_get.side_effect = self._workflow_responses(
            self._SUMMARY_INDEX_HTML, rml_name="formaldehyde"
        )

        result = search_dossier("formaldehyde", input_type="rmlName")

        assert result is not False
        assert result["rmlName"] == "formaldehyde"

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_substance_not_found(self, mock_get):
        """Test when substance is not found in ECHA."""
        mock_get.return_value = self._json_response({"items": []})

        result = search_dossier("999-99-9", input_type="rmlCas")

        assert result is False

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_empty_items_array(self, mock_get):
        """Test when API returns empty items array."""
        mock_get.return_value = self._json_response({"items": []})

        result = search_dossier("NONEXISTENT", input_type="rmlName")

        assert result is False

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_malformed_api_response(self, mock_get):
        """Test when API response is malformed."""
        # The payload deliberately lacks the 'items' key.
        mock_get.return_value = self._json_response({})

        result = search_dossier("50-00-0", input_type="rmlCas")

        assert result is False
class TestSearchDossierInputTypeValidation:
    """Test input_type parameter validation.

    ``requests.get`` is patched as looked up from
    ``pif_compiler.services.echa_find``; the decorator-provided mock is
    configured directly rather than being shadowed by a nested ``patch``.
    """

    @staticmethod
    def _json_response(payload):
        """Build a mock response whose ``json()`` returns ``payload``."""
        response = Mock()
        response.json.return_value = payload
        return response

    def _substance_response(self):
        """Build a substance-search response holding a single hit."""
        return self._json_response({
            "items": [{
                "substanceIndex": {
                    "rmlId": "100.000.001",
                    "rmlName": "Test Substance",
                    "rmlCas": "50-00-0",
                    "rmlEc": "200-001-8",
                }
            }]
        })

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_input_type_mismatch_cas(self, mock_get):
        """Test when input_type doesn't match actual search result (CAS)."""
        mock_get.return_value = self._substance_response()

        # Search with a CAS number but declare it as an EC number; the
        # hit's rmlEc ("200-001-8") differs from the searched value.
        result = search_dossier("50-00-0", input_type="rmlEc")

        assert isinstance(result, str)
        assert "search_error" in result
        assert "not equal" in result

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_input_type_correct_match(self, mock_get):
        """Test when input_type correctly matches search result."""
        dossier_list = self._json_response({
            "items": [{
                "assetExternalId": "abc123",
                "rootKey": "key123",
                "lastUpdatedDate": "2024-01-15T10:30:00Z",
            }]
        })
        index_page = Mock()
        index_page.text = "<html><div id='id_7_Toxicologicalinformation'><a href='tox_001'></a></div></html>"
        # Responses are queued in request order: substance search,
        # dossier list, index page.
        mock_get.side_effect = [
            self._substance_response(),
            dossier_list,
            index_page,
        ]

        result = search_dossier("50-00-0", input_type="rmlCas")

        assert result is not False
        assert isinstance(result, dict)
class TestSearchDossierDossierRetrieval:
    """Test dossier retrieval (Active/Inactive).

    Each test queues canned ``requests.get`` responses via ``side_effect``
    in the order the lookups are expected to happen.
    """

    _INDEX_HTML = "<html><div id='id_7_Toxicologicalinformation'><a href='tox_001'></a></div></html>"

    @staticmethod
    def _json_reply(body):
        """Wrap ``body`` in a mock response exposing it through ``json()``."""
        reply = Mock()
        reply.json.return_value = body
        return reply

    def _substance_reply(self):
        """Substance-search reply containing one matching substance."""
        return self._json_reply({
            "items": [{
                "substanceIndex": {
                    "rmlId": "100.000.001",
                    "rmlName": "Test Substance",
                    "rmlCas": "50-00-0",
                    "rmlEc": "200-001-8",
                }
            }]
        })

    def _dossier_reply(self, with_date=True):
        """Dossier-list reply with one entry, optionally dated."""
        entry = {
            "assetExternalId": "abc123",
            "rootKey": "key123",
        }
        if with_date:
            entry["lastUpdatedDate"] = "2024-01-15T10:30:00Z"
        return self._json_reply({"items": [entry]})

    def _index_reply(self):
        """Index-page reply whose ``text`` is the canned HTML."""
        page = Mock()
        page.text = self._INDEX_HTML
        return page

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_active_dossier_found(self, mock_get):
        """Test when active dossier is found."""
        mock_get.side_effect = [
            self._substance_reply(),
            self._dossier_reply(),
            self._index_reply(),
        ]

        outcome = search_dossier("50-00-0", input_type="rmlCas")

        assert outcome is not False
        assert outcome["dossierType"] == "Active"

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_inactive_dossier_fallback(self, mock_get):
        """Test when only inactive dossier exists (fallback)."""
        # The first dossier query comes back empty (nothing active); the
        # second one supplies the inactive entry.
        nothing_active = self._json_reply({"items": []})
        mock_get.side_effect = [
            self._substance_reply(),
            nothing_active,
            self._dossier_reply(),
            self._index_reply(),
        ]

        outcome = search_dossier("50-00-0", input_type="rmlCas")

        assert outcome is not False
        assert outcome["dossierType"] == "Inactive"

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_no_dossiers_found(self, mock_get):
        """Test when no dossiers (active or inactive) are found."""
        # One empty reply object serves both the active and the inactive
        # dossier query.
        empty = self._json_reply({"items": []})
        mock_get.side_effect = [self._substance_reply(), empty, empty]

        outcome = search_dossier("50-00-0", input_type="rmlCas")

        assert outcome is False

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_last_update_date_parsed(self, mock_get):
        """Test that lastUpdateDate is correctly parsed."""
        mock_get.side_effect = [
            self._substance_reply(),
            self._dossier_reply(),
            self._index_reply(),
        ]

        outcome = search_dossier("50-00-0", input_type="rmlCas")

        assert outcome is not False
        assert "lastUpdateDate" in outcome
        assert outcome["lastUpdateDate"] == "2024-01-15"

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_missing_last_update_date(self, mock_get):
        """Test when lastUpdateDate is missing from response."""
        mock_get.side_effect = [
            self._substance_reply(),
            self._dossier_reply(with_date=False),
            self._index_reply(),
        ]

        outcome = search_dossier("50-00-0", input_type="rmlCas")

        assert outcome is not False
        # The lookup still succeeds; the date key is simply absent.
        assert "lastUpdateDate" not in outcome
class TestSearchDossierHTMLParsing:
    """Test HTML parsing for toxicology sections.

    Index pages are assembled from per-section snippets so each test states
    only which sections it wants present.
    """

    _TOX_SUMMARY_DIV = (
        '<div id="id_7_Toxicologicalinformation">\n'
        '<a href="tox_summary_001"></a>\n'
        "</div>\n"
    )
    _ACUTE_TOX_DIV = (
        '<div id="id_72_AcuteToxicity">\n'
        '<a href="acute_tox_001"></a>\n'
        "</div>\n"
    )
    _REPEATED_DOSE_DIV = (
        '<div id="id_75_Repeateddosetoxicity">\n'
        '<a href="repeated_dose_001"></a>\n'
        "</div>\n"
    )

    @staticmethod
    def _sectioned_page(*sections):
        """Join section snippets into a multi-line ``<html>`` document."""
        return "\n<html>\n" + "".join(sections) + "</html>\n"

    @staticmethod
    def _queue_lookup(mock_get, index_html):
        """Queue substance, dossier and index replies on ``mock_get``."""
        substance = Mock()
        substance.json.return_value = {
            "items": [{
                "substanceIndex": {
                    "rmlId": "100.000.001",
                    "rmlName": "Test Substance",
                    "rmlCas": "50-00-0",
                    "rmlEc": "200-001-8",
                }
            }]
        }
        dossier = Mock()
        dossier.json.return_value = {
            "items": [{
                "assetExternalId": "abc123",
                "rootKey": "key123",
                "lastUpdatedDate": "2024-01-15T10:30:00Z",
            }]
        }
        index_page = Mock()
        index_page.text = index_html
        mock_get.side_effect = [substance, dossier, index_page]

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_all_tox_sections_found(self, mock_get):
        """Test when all toxicology sections are found."""
        self._queue_lookup(
            mock_get,
            self._sectioned_page(
                self._TOX_SUMMARY_DIV,
                self._ACUTE_TOX_DIV,
                self._REPEATED_DOSE_DIV,
            ),
        )

        found = search_dossier("50-00-0", input_type="rmlCas")

        assert found is not False
        for section in ("ToxSummary", "AcuteToxicity", "RepeatedDose"):
            assert section in found
        assert "tox_summary_001" in found["ToxSummary"]
        assert "acute_tox_001" in found["AcuteToxicity"]
        assert "repeated_dose_001" in found["RepeatedDose"]

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_only_tox_summary_found(self, mock_get):
        """Test when only ToxSummary section exists."""
        self._queue_lookup(
            mock_get, self._sectioned_page(self._TOX_SUMMARY_DIV)
        )

        found = search_dossier("50-00-0", input_type="rmlCas")

        assert found is not False
        assert "ToxSummary" in found
        assert "AcuteToxicity" not in found
        assert "RepeatedDose" not in found

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_no_tox_sections_found(self, mock_get):
        """Test when no toxicology sections are found."""
        self._queue_lookup(
            mock_get, "<html><body>No toxicology sections</body></html>"
        )

        found = search_dossier("50-00-0", input_type="rmlCas")

        assert found is not False
        for section in ("ToxSummary", "AcuteToxicity", "RepeatedDose"):
            assert section not in found
        # The substance and index entries are expected regardless.
        assert "rmlId" in found
        assert "index" in found

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_js_links_created(self, mock_get):
        """Test that both HTML and JS links are created."""
        self._queue_lookup(
            mock_get,
            self._sectioned_page(self._TOX_SUMMARY_DIV, self._ACUTE_TOX_DIV),
        )

        found = search_dossier("50-00-0", input_type="rmlCas")

        assert found is not False
        # Every link key is expected alongside its "_js" counterpart.
        for key in ("ToxSummary", "AcuteToxicity", "index"):
            assert key in found
            assert key + "_js" in found
class TestSearchDossierURLConstruction:
    """Test URL construction for various endpoints.

    Both tests inspect the ``search_response`` entry of the returned dict.
    """

    @staticmethod
    def _canned_replies(rml_name):
        """Return substance, dossier and empty-index replies, in that order."""
        substance = Mock()
        substance.json.return_value = {
            "items": [{
                "substanceIndex": {
                    "rmlId": "100.000.001",
                    "rmlName": rml_name,
                    "rmlCas": "50-00-0",
                    "rmlEc": "200-001-8",
                }
            }]
        }
        dossier = Mock()
        dossier.json.return_value = {
            "items": [{
                "assetExternalId": "abc123",
                "rootKey": "key123",
                "lastUpdatedDate": "2024-01-15T10:30:00Z",
            }]
        }
        index_page = Mock()
        index_page.text = "<html></html>"
        return [substance, dossier, index_page]

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_search_response_url(self, mock_get):
        """Test that search_response URL is correctly constructed."""
        mock_get.side_effect = self._canned_replies("Test Substance")

        info = search_dossier("50-00-0", input_type="rmlCas")

        assert info is not False
        assert "search_response" in info
        url = info["search_response"]
        assert "50-00-0" in url
        assert "https://chem.echa.europa.eu/api-substance/v1/substance" in url

    @patch('pif_compiler.services.echa_find.requests.get')
    def test_url_encoding(self, mock_get):
        """Test that special characters in substance names are URL-encoded."""
        substance_name = "test substance with spaces"
        mock_get.side_effect = self._canned_replies(substance_name)

        info = search_dossier(substance_name, input_type="rmlName")

        assert info is not False
        assert "search_response" in info
        url = info["search_response"]
        # Either encoding of a space ("%20" or "+") is accepted.
        assert any(token in url for token in ("%20", "+"))
class TestIntegration:
    """Integration tests with real API (marked as integration).

    These tests call ``search_dossier`` without mocking and therefore need
    internet access. When a lookup yields no dossier dict there is nothing
    to verify, so the test is reported as skipped instead of passing with
    zero assertions executed.
    """

    @staticmethod
    def _dossier_or_skip(cas):
        """Look up ``cas`` by CAS number and return the result dict.

        Skips the calling test when the result is falsy or not a dict,
        which is the case the assertions cannot say anything about.
        """
        result = search_dossier(cas, input_type="rmlCas")
        if not result or not isinstance(result, dict):
            pytest.skip(f"no dossier data returned for CAS {cas}")
        return result

    @pytest.mark.integration
    def test_real_formaldehyde_search(self):
        """Test real API call for formaldehyde (requires internet)."""
        result = self._dossier_or_skip("50-00-0")
        assert "rmlId" in result
        assert "rmlName" in result
        assert "rmlCas" in result
        assert result["rmlCas"] == "50-00-0"
        assert "index" in result
        assert "dossierType" in result

    @pytest.mark.integration
    def test_real_water_search(self):
        """Test real API call for water by CAS (requires internet)."""
        result = self._dossier_or_skip("7732-18-5")
        assert "rmlCas" in result
        assert result["rmlCas"] == "7732-18-5"

    @pytest.mark.integration
    def test_real_nonexistent_substance(self):
        """Test real API call for non-existent substance (requires internet)."""
        result = search_dossier("999-99-9", input_type="rmlCas")
        # Either False or an error string is accepted for an unknown CAS.
        assert result is False or isinstance(result, str)

    @pytest.mark.integration
    def test_real_glycerin_search(self):
        """Test real API call for glycerin (requires internet)."""
        result = self._dossier_or_skip("56-81-5")
        assert "rmlCas" in result
        assert result["rmlCas"] == "56-81-5"
        assert "rmlId" in result
        assert "dossierType" in result

    @pytest.mark.integration
    def test_real_niacinamide_search(self):
        """Test real API call for niacinamide (requires internet)."""
        result = self._dossier_or_skip("98-92-0")
        assert "rmlCas" in result
        assert result["rmlCas"] == "98-92-0"

    @pytest.mark.integration
    def test_real_retinol_search(self):
        """Test real API call for retinol (requires internet)."""
        result = self._dossier_or_skip("68-26-8")
        assert "rmlCas" in result
        assert result["rmlCas"] == "68-26-8"

    @pytest.mark.integration
    def test_real_caffeine_search(self):
        """Test real API call for caffeine (requires internet)."""
        result = self._dossier_or_skip("58-08-2")
        assert "rmlCas" in result
        assert result["rmlCas"] == "58-08-2"

    @pytest.mark.integration
    def test_real_salicylic_acid_search(self):
        """Test real API call for salicylic acid (requires internet)."""
        result = self._dossier_or_skip("69-72-7")
        assert "rmlCas" in result
        assert result["rmlCas"] == "69-72-7"

    @pytest.mark.integration
    def test_real_titanium_dioxide_search(self):
        """Test real API call for titanium dioxide (requires internet)."""
        result = self._dossier_or_skip("13463-67-7")
        assert "rmlCas" in result
        assert result["rmlCas"] == "13463-67-7"

    @pytest.mark.integration
    def test_real_zinc_oxide_search(self):
        """Test real API call for zinc oxide (requires internet)."""
        result = self._dossier_or_skip("1314-13-2")
        assert "rmlCas" in result
        assert result["rmlCas"] == "1314-13-2"

    @pytest.mark.integration
    def test_multiple_cosmetic_ingredients(self, sample_cas_numbers):
        """Test real API calls for multiple cosmetic ingredients (requires internet).

        ``sample_cas_numbers`` is requested as a fixture but not read here.
        NOTE(review): presumably defined in conftest.py - confirm whether
        the hard-coded subset below should come from it instead.
        """
        import time  # imported once, not on every loop iteration

        # Test a subset of common cosmetic ingredients
        test_ingredients = [
            ("water", "7732-18-5"),
            ("glycerin", "56-81-5"),
            ("propylene_glycol", "57-55-6"),
        ]
        verified = 0
        for name, cas in test_ingredients:
            result = search_dossier(cas, input_type="rmlCas")
            if result and isinstance(result, dict):
                # The ingredient name labels any assertion failure.
                assert result["rmlCas"] == cas, name
                assert "rmlId" in result, name
                verified += 1
            # Give the API some time between requests
            time.sleep(0.5)
        if not verified:
            pytest.skip("no dossier data returned for any ingredient")