Source code for chemicals.identifiers

"""Chemical Engineering Design Library (ChEDL). Utilities for process modeling.
Copyright (C) 2016, 2017, 2018, 2019, 2020 Caleb Bell <Caleb.Andrew.Bell@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

This module contains a database of metadata on ~70000 chemicals from the PubChem
database. It contains comprehensive features for searching the metadata.
It also includes a small database of common mixture compositions.

For reporting bugs, adding feature requests, or submitting pull requests,
please use the `GitHub issue tracker <https://github.com/CalebBell/chemicals/>`_.

.. contents:: :local:

Search Functions
----------------
.. autofunction:: chemicals.identifiers.CAS_from_any
.. autofunction:: chemicals.identifiers.MW
.. autofunction:: chemicals.identifiers.search_chemical
.. autofunction:: chemicals.identifiers.IDs_to_CASs

CAS Number Utilities
--------------------
.. autofunction:: chemicals.identifiers.check_CAS
.. autofunction:: chemicals.identifiers.CAS_to_int
.. autofunction:: chemicals.identifiers.int_to_CAS
.. autofunction:: chemicals.identifiers.sorted_CAS_key

Database Objects
----------------
There is an object used to represent a chemical's metadata, an object used to
represent a common mixture's composition, and an object used to hold the
mixture metadata.

.. autoclass:: chemicals.identifiers.ChemicalMetadata
.. autoclass:: chemicals.identifiers.CommonMixtureMetadata
.. autoclass:: chemicals.identifiers.ChemicalMetadataDB
.. autofunction:: chemicals.identifiers.get_pubchem_db

Chemical Groups
---------------
It is convenient to tag some chemicals with labels like "refrigerant", or to
record whether or not they appear in a certain database. The following
chemical groups are available.

.. autodata:: chemicals.identifiers.cryogenics
.. autodata:: chemicals.identifiers.inerts
.. autofunction:: chemicals.identifiers.dippr_compounds
"""


# Names exported by a star-import of this module.
__all__ = ['check_CAS', 'CAS_from_any', 'MW', 'search_chemical',
           'mixture_from_any', 'cryogenics', 'inerts', 'dippr_compounds', 'IDs_to_CASs',
           'get_pubchem_db', 'CAS_to_int', 'sorted_CAS_key', 'int_to_CAS']

import os

from chemicals.elements import charge_from_formula, homonuclear_elements_CASs_set, periodic_table, serialize_formula
from chemicals.utils import PY37, can_load_data, mark_numba_incompatible, os_path_join, source_path, to_num

# Directory containing the identifier data files (.tsv/.csv) opened by the
# loaders in this module.
folder = os_path_join(source_path, 'Identifiers')

@mark_numba_incompatible
def check_CAS(CASRN):
    """Checks if a CAS number is valid. Returns False if the parser cannot
    parse the given string.

    Parameters
    ----------
    CASRN : str
        A three-piece, dash-separated set of numbers

    Returns
    -------
    result : bool
        Boolean value if CASRN was valid. If parsing fails, return False also.

    Notes
    -----
    Check method is according to Chemical Abstract Society. However, no lookup
    to their service is performed; therefore, this function cannot detect
    false positives.

    Function also does not support additional separators, apart from '-'.

    CAS numbers up to the series 1 XXX XXX-XX-X are now being issued.

    A long can hold CAS numbers up to 2 147 483-64-7

    Examples
    --------
    >>> check_CAS('7732-18-5')
    True
    >>> check_CAS('77332-18-5')
    False
    """
    try:
        if CASRN.count('-') != 2:
            return False
        # The layout must be <digits>-<2 digits>-<check digit>.
        if CASRN[-2] != '-' or CASRN[-5] != '-':
            return False
        check = CASRN[-1]
        # Every digit except the check digit, most significant first.
        chars = CASRN.replace('-', '')[:-1]
        # The right-most digit has weight 1, the next weight 2, and so on.
        productsum = 0
        weight = len(chars)
        for num in chars:
            productsum += weight*int(num)
            weight -= 1
        return productsum % 10 == int(check)
    except Exception:
        # Any parsing failure (wrong type, too short, non-digit characters)
        # means "not a valid CAS number". `Exception` rather than a bare
        # except so KeyboardInterrupt/SystemExit are not swallowed.
        return False
@mark_numba_incompatible
def CAS_to_int(i):
    r'''Converts the CAS number of a compound from a string to an int. This
    is helpful when storing large amounts of CAS numbers, as their strings
    take up more memory than their numerical representation. All CAS numbers
    fit into 64 bit ints.

    Parameters
    ----------
    i : str
        CASRN as a dash-separated string, [-]

    Returns
    -------
    CASRN : int
        CASRN with the dashes removed, [-]

    Notes
    -----
    The conversion only strips the dashes and converts what remains to an
    int; no validation is performed, so an incorrect CAS number is converted
    without an exception.

    Examples
    --------
    >>> CAS_to_int('7704-34-9')
    7704349
    '''
    digits_only = i.replace('-', '')
    return int(digits_only)
@mark_numba_incompatible
def int_to_CAS(i):
    r'''Converts the CAS number of a compound from an int to a string. This
    is helpful when dealing with int CAS numbers.

    Parameters
    ----------
    i : int
        CASRN as an integer, [-]

    Returns
    -------
    CASRN : str
        CASRN as a dash-separated string, [-]

    Notes
    -----
    Handles CAS numbers with an unspecified number of digits. Does not work on
    floats.

    Examples
    --------
    >>> int_to_CAS(7704349)
    '7704-34-9'
    '''
    digits = str(i)
    # Last digit is the check digit, the two before it the middle group and
    # everything else the leading group.
    pieces = (digits[:-3], digits[-3:-1], digits[-1])
    return '-'.join(pieces)
@mark_numba_incompatible
def sorted_CAS_key(CASs):
    r'''Takes an iterable of CAS numbers as strings, and returns a tuple of
    the same CAS numbers, sorted from smallest to largest. This is very
    convenient for obtaining a unique hash of a set of compounds, so as to see
    if two groups of compounds are the same.

    Parameters
    ----------
    CASs : list[str]
        CAS numbers as strings; any iterable is accepted, including one-shot
        iterators such as generators, [-]

    Returns
    -------
    CASs_sorted : tuple[str]
        Sorted CAS numbers from lowest (first) to highest (last) [-]

    Notes
    -----
    Does not check CAS numbers for validity.

    Examples
    --------
    >>> sorted_CAS_key(['7732-18-5', '64-17-5', '108-88-3', '98-00-0'])
    ('64-17-5', '98-00-0', '108-88-3', '7732-18-5')
    '''
    # Build (int, str) pairs in a single pass so the input is consumed only
    # once; sorting the pairs orders by numeric value and breaks ties on the
    # string itself.
    pairs = sorted((CAS_to_int(CAS), CAS) for CAS in CASs)
    return tuple(CAS for _, CAS in pairs)
class ChemicalMetadata:
    """Container for the metadata of a single chemical.

    Attributes
    ----------
    pubchemid : int
        Identification number on pubchem database; access their information
        online at https://pubchem.ncbi.nlm.nih.gov/compound/<pubchemid> [-]
    formula : str
        Formula of the compound; in the same format as
        :obj:`chemicals.elements.serialize_formula` generates, [-]
    MW : float
        Molecular weight of the compound as calculated with the standard atomic
        abundances; consistent with the element weights in
        :obj:`chemicals.elements.periodic_table`, [g/mol]
    smiles : str
        SMILES identification string, [-]
    InChI : str
        InChI identification string as given in pubchem (there can be multiple
        valid InChI strings for a compound), [-]
    InChI_key : str
        InChI key identification string (meant to be unique to a compound), [-]
    iupac_name : str
        IUPAC name as given in pubchem, [-]
    common_name : str
        Common name as given in pubchem, [-]
    synonyms : list[str]
        List of synonyms of the compound, [-]
    CAS : int
        CAS number of the compound; stored as an int for memory efficiency, [-]
    """

    __slots__ = ('pubchemid', 'formula', 'MW', 'smiles', 'InChI', 'InChI_key',
                 'iupac_name', 'common_name', 'synonyms', 'CAS', '_charge')

    def __init__(self, pubchemid, CAS, formula, MW, smiles, InChI, InChI_key,
                 iupac_name, common_name, synonyms):
        # `_charge` is deliberately left unset; it is filled in lazily by the
        # `charge` property.
        self.pubchemid = pubchemid
        self.CAS = CAS
        self.formula = formula
        self.MW = MW
        self.smiles = smiles
        self.InChI = InChI
        self.InChI_key = InChI_key
        self.iupac_name = iupac_name
        self.common_name = common_name
        self.synonyms = synonyms

    def __repr__(self):
        return (f'<ChemicalMetadata, name={self.common_name}, formula={self.formula}, '
                f'smiles={self.smiles}, MW={self.MW:g}>')

    @property
    def CASs(self):
        """CAS number of the compound as a string."""
        return int_to_CAS(self.CAS)

    @property
    def charge(self):
        """Charge of the species as an integer.

        Computed on first access from the formula and then cached on the
        instance, as most species do not have a charge and so storing it
        for every chemical would be a waste of memory.
        """
        try:
            value = self._charge
        except AttributeError:
            # Slot not populated yet: compute once and remember the result.
            value = self._charge = charge_from_formula(self.formula)
        return value
class ChemicalMetadataDB:
    '''Object which holds the main database of chemical metadata.

    Chemicals are stored as :obj:`ChemicalMetadata` objects and are reachable
    through several dictionaries, keyed by PubChem ID, SMILES, InChI, InChI
    key, name/synonym, CAS number (as an int) and formula.

    .. warning::
        To allow the `chemicals` to grow and improve, the details of this
        class may change in the future without notice!
    '''

    # Class-level default; set on the instance once `main_db` has been read.
    loaded_main_db = False

    def __init__(self, elements=True,
                 main_db=os_path_join(folder, 'chemical identifiers pubchem large.tsv'),
                 user_dbs=[os_path_join(folder, 'chemical identifiers pubchem small.tsv'),
                           os_path_join(folder, 'chemical identifiers example user db.tsv'),
                           os_path_join(folder, 'Cation db.tsv'),
                           os_path_join(folder, 'Anion db.tsv'),
                           os_path_join(folder, 'Inorganic db.tsv')]):
        '''Construct the database from its parameters, loading all of the
        files in `user_dbs`, the periodic table, and deferring loading of
        `main_db` as it is very large until a search doesn't find a chemical
        in the smaller database.

        Parameters
        ----------
        elements : bool, optional
            Whether to add the elements of the periodic table to the indexes
        main_db : str or None, optional
            Path of the large database file which is only loaded on demand;
            None means there is no main database
        user_dbs : list[str], optional
            Paths of the database files loaded immediately
        '''
        self.pubchem_index = {}
        self.smiles_index = {}
        self.InChI_index = {}
        self.InChI_key_index = {}
        self.name_index = {}
        self.CAS_index = {}
        self.formula_index = {}

        self.main_db = main_db
        # Copy so that the shared default list (or the caller's list) cannot
        # be altered through this instance, and so the paths can be iterated
        # again when the main database is loaded.
        self.user_dbs = list(user_dbs)
        self.elements = elements
        for db in self.user_dbs:
            self.load(db)
        self.load_elements()

    def load_elements(self):
        '''Load elements into the indexes. Does nothing if the database was
        created with `elements` False.
        '''
        if not self.elements:
            return None
        InChI_key_index, CAS_index, pubchem_index = self.InChI_key_index, self.CAS_index, self.pubchem_index
        smiles_index, InChI_index, formula_index = self.smiles_index, self.InChI_index, self.formula_index
        name_index = self.name_index

        for ele in periodic_table:
            CAS = int(ele.CAS.replace('-', ''))  # Store as int for easier lookup
            ele_lower_name = ele.name.lower()
            obj = ChemicalMetadata(pubchemid=ele.PubChem, CAS=CAS,
                                   formula=ele.symbol, MW=ele.MW, smiles=ele.smiles,
                                   InChI=ele.InChI, InChI_key=ele.InChI_key,
                                   iupac_name=ele_lower_name,
                                   common_name=ele_lower_name,
                                   synonyms=[ele_lower_name])

            if obj.InChI_key in InChI_key_index:
                if ele.CAS not in homonuclear_elements_CASs_set:
                    # An entry from a data file is being replaced by the
                    # element: point all of the old entry's names at it.
                    obj_old = InChI_key_index[obj.InChI_key]
                    for name in obj_old.synonyms:
                        name_index[name] = obj

            InChI_key_index[obj.InChI_key] = obj
            CAS_index[obj.CAS] = obj
            pubchem_index[obj.pubchemid] = obj
            smiles_index[obj.smiles] = obj
            InChI_index[obj.InChI] = obj
            if ele.CAS in homonuclear_elements_CASs_set:
                # The plain name is not registered for these elements; only
                # the 'monatomic <name>' form is.
                for name in obj.synonyms:
                    name_index['monatomic ' + name] = obj
            else:
                for name in obj.synonyms:
                    name_index[name] = obj
            formula_index[obj.formula] = obj

    def load(self, file_name):
        '''Load a particular file into the indexes.

        Each line is tab-separated, in the order: pubchemid, CAS, formula, MW,
        smiles, InChI, InChI_key, iupac_name, common_name, followed by any
        number of further synonyms.
        '''
        # The context manager closes the file even if a line fails to parse.
        with open(file_name, encoding='utf-8') as f:
            for line in f:
                # This is effectively the documentation for the file format of the file
                values = line.rstrip('\n').split('\t')
                (pubchemid, CAS, formula, MW, smiles, InChI, InChI_key, iupac_name, common_name) = values[0:9]
                CAS = int(CAS.replace('-', ''))  # Store as int for easier lookup

                # The IUPAC and common names are searchable names as well, so
                # the synonyms start at the IUPAC name column.
                synonyms = values[7:]
                pubchemid = int(pubchemid)

                obj = ChemicalMetadata(pubchemid, CAS, formula, float(MW), smiles,
                                       InChI, InChI_key, iupac_name, common_name,
                                       synonyms)

                # Lookup indexes
                self.CAS_index[CAS] = obj
                self.pubchem_index[pubchemid] = obj
                self.smiles_index[smiles] = obj
                self.InChI_index[InChI] = obj
                self.InChI_key_index[InChI_key] = obj
                for name in synonyms:
                    self.name_index[name] = obj
                self.formula_index[obj.formula] = obj

    def __iter__(self):
        # Iterating means "every chemical", so the main database is needed.
        if not self.finished_loading:
            self.autoload_main_db()
        return iter(self.InChI_key_index.values())

    @property
    def finished_loading(self):
        '''Whether or not the database has loaded the main database. Also
        True when there is no main database to load (`main_db` is None).
        '''
        return self.loaded_main_db or self.main_db is None

    def finish_loading(self):
        '''Complete loading the main database, if it has not been fully
        loaded.
        '''
        if not self.finished_loading:
            self.autoload_main_db()

    def autoload_main_db(self):
        '''Load the main database when needed. The user databases and the
        elements are loaded again afterwards, so their entries overwrite
        those of the main database.
        '''
        self.load(self.main_db)
        for db in self.user_dbs:
            self.load(db)
        self.load_elements()
        self.loaded_main_db = True
        return True

    def _search_autoload(self, identifier, index, autoload=True):
        '''Look up `identifier` in `index`; on a miss, load the main database
        (if allowed and not yet loaded) and try once more. Returns False when
        nothing is found.
        '''
        # NOTE(review): an empty index returns False without ever trying to
        # load the main database - confirm that this is intended.
        if index:
            if identifier in index:
                return index[identifier]
            if autoload and not self.finished_loading:
                self.autoload_main_db()
                return self._search_autoload(identifier, index, autoload)
        return False

    def search_pubchem(self, pubchem, autoload=True):
        '''Search for a chemical by its pubchem number. Accepts strings or
        ints.
        '''
        return self._search_autoload(int(pubchem), self.pubchem_index, autoload=autoload)

    def search_CAS(self, CAS, autoload=True):
        '''Search for a chemical by its CAS number. Accepts strings or ints.
        '''
        if type(CAS) is not int:
            CAS = CAS_to_int(CAS)
        return self._search_autoload(CAS, self.CAS_index, autoload=autoload)

    def search_smiles(self, smiles, autoload=True):
        '''Search for a chemical by its smiles string.
        '''
        return self._search_autoload(smiles, self.smiles_index, autoload=autoload)

    def search_InChI(self, InChI, autoload=True):
        '''Search for a chemical by its InChI string.
        '''
        return self._search_autoload(InChI, self.InChI_index, autoload=autoload)

    def search_InChI_key(self, InChI_key, autoload=True):
        '''Search for a chemical by its InChI key.
        '''
        return self._search_autoload(InChI_key, self.InChI_key_index, autoload=autoload)

    def search_name(self, name, autoload=True):
        '''Search for a chemical by its name.
        '''
        return self._search_autoload(name, self.name_index, autoload=autoload)

    def search_formula(self, formula, autoload=True):
        '''Search for a chemical by its serialized formula.
        '''
        return self._search_autoload(formula, self.formula_index, autoload=autoload)
@mark_numba_incompatible
def CAS_from_any(ID, autoload=False, cache=True):
    """Wrapper around `search_chemical` which returns the CAS number of the
    found chemical directly.

    Parameters
    ----------
    ID : str
        One of the name formats described by `search_chemical`, [-]
    autoload : bool, optional
        Whether to load new chemical databanks during the search if a hit is
        not immediately found, [-]
    cache : bool, optional
        Whether or not to cache the search for faster lookup in subsequent
        queries, [-]

    Returns
    -------
    CASRN : str
        A three-piece, dash-separated set of numbers

    Notes
    -----
    An exception is raised if the name cannot be identified. The PubChem
    database includes a wide variety of other synonyms, but these may not be
    present for all chemicals. See `search_chemical` for more details.

    Examples
    --------
    >>> CAS_from_any('water')
    '7732-18-5'
    >>> CAS_from_any('InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3')
    '64-17-5'
    >>> CAS_from_any('CCCCCCCCCC')
    '124-18-5'
    >>> CAS_from_any('InChIKey=LFQSCWFLJHTTHZ-UHFFFAOYSA-N')
    '64-17-5'
    >>> CAS_from_any('pubchem=702')
    '64-17-5'
    >>> CAS_from_any('O') # only elements can be specified by symbol
    '17778-80-2'
    """
    metadata = search_chemical(ID, autoload=autoload, cache=cache)
    return metadata.CASs
@mark_numba_incompatible
def MW(ID, autoload=False, cache=True):
    """Wrapper around `search_chemical` which returns the molecular weight of
    the found chemical directly.

    Parameters
    ----------
    ID : str
        One of the name formats described by `search_chemical`
    autoload : bool, optional
        Passed through to `search_chemical`, [-]
    cache : bool, optional
        Passed through to `search_chemical`, [-]

    Returns
    -------
    MW : float
        Molecular weight of chemical, [g/mol]

    Notes
    -----
    An exception is raised if the name cannot be identified. The PubChem
    database includes a wide variety of other synonyms, but these may not be
    present for all chemicals. See `search_chemical` for more details.

    Examples
    --------
    >>> MW('water')
    18.01528
    >>> MW('InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3')
    46.06844
    >>> MW('CCCCCCCCCC')
    142.286
    >>> MW('InChIKey=LFQSCWFLJHTTHZ-UHFFFAOYSA-N')
    46.06844
    >>> MW('pubchem=702')
    46.06844
    >>> MW('O') # only elements can be specified by symbol
    15.9994
    """
    metadata = search_chemical(ID, autoload=autoload, cache=cache)
    return metadata.MW
# Size threshold consulted by `search_chemical` before it evicts the oldest
# cached entry.
chemical_search_cache_max_size = 200
# Maps the identifier string given to `search_chemical` to the result that
# was found for it.
chemical_search_cache = {}
@mark_numba_incompatible
def search_chemical(ID, autoload=False, cache=True):
    """Looks up metadata about a chemical by searching and testing for the
    input string being any of the following types of chemical identifiers:

    * Name, in IUPAC form or common form or a synonym registered in PubChem
    * InChI name, prefixed by 'InChI=1S/' or 'InChI=1/'
    * InChI key, prefixed by 'InChIKey='
    * PubChem CID, prefixed by 'PubChem='
    * SMILES (prefix with 'SMILES=' to ensure smiles parsing; ex.
      'C' will return Carbon as it is an element whereas the SMILES
      interpretation for 'C' is methane)
    * CAS number (obsolete numbers may point to the current number)

    If the input is an ID representing an element, the following additional
    inputs may be specified as

    * Atomic symbol (ex 'Na')
    * Atomic number (as a string)

    Parameters
    ----------
    ID : str
        One of the name formats described above
    autoload : bool, optional
        Whether to load new chemical databanks during the search if a hit is
        not immediately found, [-]
    cache : bool, optional
        Whether or not to cache the search for faster lookup in subsequent
        queries, [-]

    Returns
    -------
    chemical_metadata : ChemicalMetadata
        A class containing attributes which describe the chemical's metadata,
        [-]

    Notes
    -----
    An exception is raised if the name cannot be identified. The PubChem
    database includes a wide variety of other synonyms, but these may not be
    present for all chemicals.

    Examples
    --------
    >>> search_chemical('water')
    <ChemicalMetadata, name=water, formula=H2O, smiles=O, MW=18.0153>
    >>> search_chemical('InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3')
    <ChemicalMetadata, name=ethanol, formula=C2H6O, smiles=CCO, MW=46.0684>
    >>> search_chemical('CCCCCCCCCC')
    <ChemicalMetadata, name=DECANE, formula=C10H22, smiles=CCCCCCCCCC, MW=142.286>
    >>> search_chemical('InChIKey=LFQSCWFLJHTTHZ-UHFFFAOYSA-N')
    <ChemicalMetadata, name=ethanol, formula=C2H6O, smiles=CCO, MW=46.0684>
    >>> search_chemical('pubchem=702')
    <ChemicalMetadata, name=ethanol, formula=C2H6O, smiles=CCO, MW=46.0684>
    >>> search_chemical('O') # only elements can be specified by symbol
    <ChemicalMetadata, name=oxygen, formula=O, smiles=[O], MW=15.9994>
    """
    if cache and ID in chemical_search_cache:
        return chemical_search_cache[ID]
    if not _pubchem_db_loaded:
        get_pubchem_db() # pragma: no cover
    hit = _search_chemical(ID, autoload)
    if cache:
        # Evict before inserting so the cache never holds more than
        # `chemical_search_cache_max_size` entries (as long as that is at
        # least 1). The emptiness check keeps a size limit of zero from
        # popping from an empty dict.
        if chemical_search_cache and len(chemical_search_cache) >= chemical_search_cache_max_size:
            # invalidate cache by time - first entry is removed relying on
            # dict ordering new in Python 3.7
            chemical_search_cache.pop(next(iter(chemical_search_cache)))
        chemical_search_cache[ID] = hit
    return hit
def _search_chemical(ID, autoload):
    """Uncached implementation of `search_chemical`.

    The identifier is tried, in order, as: a periodic table entry, a CAS
    number, a prefixed InChI / InChI key / PubChem ID / SMILES, a bare SMILES
    string, a formula, a name (with several spelling variations), and finally
    the form 'name (other identifier)'.

    Parameters
    ----------
    ID : str
        Identifier to look for; surrounding whitespace is ignored
    autoload : bool
        Whether the main database may be loaded during the lookups. When
        False and nothing is found, the search is repeated with True before
        giving up.

    Returns
    -------
    chemical_metadata : ChemicalMetadata
        The metadata of the chemical which was found

    Raises
    ------
    ValueError
        If the identifier cannot be matched to a chemical
    """
    ID = ID.strip()
    ID_lower = ID.lower()
    if ID in periodic_table:
        # Special handling for homonuclear elements.
        # Search '1'> H, 'H'> H, monatomic CAS > H
        # but "Hydrogen"> H2.
        # pubchem_db does not contain atomic numbers, so searching in the
        # periodic table is necessary.
        if (ID in periodic_table._symbol_to_elements
                or ID in periodic_table._number_to_elements
                or ID in periodic_table._CAS_to_elements):
            obj = pubchem_db.search_CAS(periodic_table[ID].CAS)
        else:
            obj = pubchem_db.search_CAS(periodic_table[ID].CAS_standard)
        return obj
    if check_CAS(ID):
        CAS_lookup = pubchem_db.search_CAS(ID, autoload)
        if CAS_lookup:
            return CAS_lookup

        # handle the case of synonyms
        CAS_alternate_lookup = pubchem_db.search_name(ID, autoload)
        if CAS_alternate_lookup:
            return CAS_alternate_lookup
        if not autoload:
            return search_chemical(ID, autoload=True)
        raise ValueError('A valid CAS number (%s) was recognized, but is not in the database' %(ID))

    ID_len = len(ID)
    if ID_len > 9:
        inchi_search = False
        # normal upper case is 'InChI=1S/'
        if ID_lower[0:9] == 'inchi=1s/':
            inchi_search = ID[9:]
        elif ID_lower[0:8] == 'inchi=1/':
            inchi_search = ID[8:]
        if inchi_search:
            inchi_lookup = pubchem_db.search_InChI(inchi_search, autoload)
            if inchi_lookup:
                return inchi_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            raise ValueError('A valid InChI name (%s) was recognized, but it is not in the database' %(inchi_search))
        if ID_lower[0:9] == 'inchikey=':
            inchi_key = ID[9:]
            inchi_key_lookup = pubchem_db.search_InChI_key(inchi_key, autoload)
            if inchi_key_lookup:
                return inchi_key_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            # Report the key which was searched for, not the failed lookup
            # result.
            raise ValueError('A valid InChI Key (%s) was recognized, but it is not in the database' %(inchi_key))
    if ID_len > 8:
        if ID_lower[0:8] == 'pubchem=':
            pubchem_lookup = pubchem_db.search_pubchem(ID[8:], autoload)
            if pubchem_lookup:
                return pubchem_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            raise ValueError('A PubChem integer (%s) identifier was recognized, but it is not in the database.' %(ID[8:]))
    if ID_len > 7:
        if ID_lower[0:7] == 'smiles=':
            smiles_lookup = pubchem_db.search_smiles(ID[7:], autoload)
            if smiles_lookup:
                return smiles_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            raise ValueError('A SMILES identifier (%s) was recognized, but it is not in the database.' %(ID[7:]))

    # Try the smiles lookup anyway
    # Parsing SMILES is an option, but this is faster
    # Pybel API also prints messages to console on failure
    smiles_lookup = pubchem_db.search_smiles(ID, autoload)
    if smiles_lookup:
        return smiles_lookup

    try:
        formula_query = pubchem_db.search_formula(serialize_formula(ID), autoload)
        if formula_query and type(formula_query) == ChemicalMetadata:
            return formula_query
    except Exception:
        # Best effort only: most identifiers are not formulas at all.
        pass

    # Try a direct lookup with the name - the fastest
    name_lookup = pubchem_db.search_name(ID, autoload)
    if name_lookup:
        return name_lookup

    # Permutate through various name options
    ID_no_space = ID.replace(' ', '')
    ID_no_space_dash = ID_no_space.replace('-', '')

    for name in [ID, ID_no_space, ID_no_space_dash]:
        for name2 in [name, name.lower()]:
            name_lookup = pubchem_db.search_name(name2, autoload)
            if name_lookup:
                return name_lookup

    # `endswith` rather than indexing, so that an empty identifier reaches
    # the ValueError below instead of raising IndexError here.
    if ID.endswith(')') and '(' in ID:
        # Try to match in the form 'water (H2O)'; accepted only when both
        # parts resolve to the same chemical.
        first_identifier, second_identifier = ID[0:-1].split('(', 1)
        try:
            CAS1 = search_chemical(first_identifier, autoload)
            CAS2 = search_chemical(second_identifier, autoload)
            if CAS1 == CAS2:
                return CAS1
        except Exception:
            # Either part could not be identified; fall through.
            pass

    if not autoload:
        return _search_chemical(ID, autoload=True)

    raise ValueError('Chemical name (%s) not recognized' %(ID))


### DIPPR Database, chemical list only
# Obtained via the command:
# list(pd.read_excel('http://www.aiche.org/sites/default/files/docs/pages/sponsor_compound_list-2014.xlsx')['Unnamed: 2'])[2:]
# This is consistently faster than creating a list and then making the set.
@mark_numba_incompatible
def dippr_compounds():
    """Loads and returns a set of compounds known in the DIPPR database. This
    can be useful for knowing if a chemical is of industrial relevance.

    Returns
    -------
    dippr_compounds : set([str])
        A set of CAS numbers from the 2014 edition of the DIPPR database.
    """
    csv_path = os.path.join(folder, 'dippr_2014.csv')
    with open(csv_path) as f:
        # Each newline-separated piece of the file becomes one entry.
        entries = f.read().split('\n')
    return set(entries)
class CommonMixtureMetadata:
    """Container for the metadata of one predefined chemical mixture.

    Attributes
    ----------
    name : str
        Name of the mixture, [-]
    source : str
        Source of the mixture composition, [-]
    N : int
        Number of chemicals in the mixture, [-]
    CASs : list[str]
        CAS numbers of the mixture, [-]
    ws : list[float]
        Mass fractions of chemicals in the mixture, [-]
    zs : list[float]
        Mole fractions of chemicals in the mixture, [-]
    names : list[str]
        List of names of the chemicals in the mixture, [-]
    synonyms : list[str]
        List of synonyms of the mixture which can also be used to look it up,
        [-]
    """

    __slots__ = ['name', 'CASs', 'N', 'source', 'names', 'ws', 'zs',
                 'synonyms']

    def __init__(self, name, CASs, N, source, names, ws, zs, synonyms):
        # Identification of the mixture
        self.name = name
        self.source = source
        self.synonyms = synonyms
        # Composition of the mixture
        self.N = N
        self.CASs = CASs
        self.names = names
        self.ws = ws
        self.zs = zs

    def __repr__(self):
        return (f'<MixtureMetadata, name={self.name}, N={self.N}, '
                f'CASs={self.CASs}, ws={self.ws}, zs={self.zs}>')
@mark_numba_incompatible
def mixture_from_any(ID):
    """Search by string for a mixture in the included common mixture database.
    The database primarily contains refrigerant blends. The variable
    `common_mixtures` contains all loaded entries.

    Parameters
    ----------
    ID : list[str] or str
        A string or 1-element list containing the name which may represent a
        mixture.

    Returns
    -------
    mixture : CommonMixtureMetadata
        Object containing basic mixture information

    Raises
    ------
    ValueError
        If a list with other than one item is given, or if the name does not
        match any known mixture.

    Notes
    -----
    The search is case-insensitive and ignores surrounding whitespace. The
    name is tried as given, with spaces removed, with dashes removed, and
    with both spaces and dashes removed.

    Examples
    --------
    >>> mixture_from_any('R512A')
    <MixtureMetadata, name=R512A, N=2, CASs=['811-97-2', '75-37-6'], ws=[0.05, 0.95], zs=[0.032949, 0.96705]>
    >>> mixture_from_any(['air'])
    <MixtureMetadata, name=Air, N=3, CASs=['7727-37-9', '7440-37-1', '7782-44-7'], ws=[0.7557, 0.0127, 0.2316], zs=[0.7812, 0.0092, 0.2096]>
    """
    if not mixture_composition_loaded: # pragma: no cover
        load_mixture_composition()
    if type(ID) == list:
        if len(ID) == 1:
            ID = ID[0]
        else:
            raise ValueError('If the input is a list, the list must contain only one item.')
    ID = ID.lower().strip()
    no_space = ID.replace(' ', '')
    no_dash = ID.replace('-', '')
    # The last candidate strips both separators, matching the name
    # permutations tried when searching for a single chemical.
    no_space_dash = no_space.replace('-', '')
    for candidate in (ID, no_space, no_dash, no_space_dash):
        try:
            return common_mixtures_by_synonym[candidate]
        except KeyError:
            pass
    raise ValueError('Mixture name not recognized')
@mark_numba_incompatible
def IDs_to_CASs(IDs):
    """Find the CAS numbers for multiple chemicals names at once. Also
    supports having a string input which is a common mixture name in the
    database. An error will be raised if any of the chemicals cannot be found.

    Parameters
    ----------
    IDs : list[str] or str
        A list of chemical identifiers; or a string or 1-element list
        containing a name which may represent a mixture (a string which is
        not a mixture is treated as a single chemical).

    Returns
    -------
    CASs : list[str]
        CAS numbers of found chemicals, [-]

    Notes
    -----
    The mixture lookup is done with `mixture_from_any` and the chemical
    lookups with `CAS_from_any`.

    Examples
    --------
    >>> IDs_to_CASs('R512A')
    ['811-97-2', '75-37-6']
    >>> IDs_to_CASs(['norflurane', '1,1-difluoroethane'])
    ['811-97-2', '75-37-6']
    """
    is_single_string = hasattr(IDs, 'strip')
    if is_single_string or (isinstance(IDs, list) and len(IDs) == 1):
        try:
            # Assume the name was a pre-defined mixture
            return mixture_from_any(IDs).CASs
        except Exception:
            # Not a known mixture; fall back to chemical lookups. `Exception`
            # rather than a bare except so interrupts are not swallowed.
            pass
        if is_single_string:
            # Is it one chemical? Done outside of the handler so that a
            # failure here is not chained to the mixture lookup error.
            return [CAS_from_any(IDs)]
    return [CAS_from_any(ID) for ID in IDs]
# CAS number -> name of chemicals tagged as cryogenics.
cryogenics = {
    '132259-10-0': 'Air',
    '7440-37-1': 'Argon',
    '630-08-0': 'carbon monoxide',
    '7782-39-0': 'deuterium',
    '7782-41-4': 'fluorine',
    '7440-59-7': 'helium',
    '1333-74-0': 'hydrogen',
    '7439-90-9': 'krypton',
    '74-82-8': 'methane',
    '7440-01-9': 'neon',
    '7727-37-9': 'nitrogen',
    '7782-44-7': 'oxygen',
    '7440-63-3': 'xenon',
}

# CAS number -> name of chemicals tagged as inerts.
inerts = {
    '7440-37-1': 'Argon',
    '124-38-9': 'Carbon Dioxide',
    '7440-59-7': 'Helium',
    '7440-01-9': 'Neon',
    '7727-37-9': 'Nitrogen',
    '7440-63-3': 'Xenon',
    '10102-43-9': 'Nitric Oxide',
    '10102-44-0': 'Nitrogen Dioxide',
    '7782-44-7': 'Oxygen',
    '132259-10-0': 'Air',
    '7439-90-9': 'krypton',
    '10043-92-2': 'radon',
    '7732-18-5': 'water',
    '7782-50-5': 'chlorine',
    '7782-41-4': 'fluorine',
}

# Flipped to True by `get_pubchem_db` once the `pubchem_db` object exists.
_pubchem_db_loaded = False
@mark_numba_incompatible
def get_pubchem_db():
    """Helper function to delay the creation of the pubchem_db object.

    This avoids loading the database when it is not needed. The object is
    created on the first call and stored in the module-level `pubchem_db`;
    later calls return that same object.
    """
    global _pubchem_db_loaded, pubchem_db
    if _pubchem_db_loaded: # pragma: no cover
        return pubchem_db
    pubchem_db = ChemicalMetadataDB()
    _pubchem_db_loaded = True
    return pubchem_db
# Set to True once `load_mixture_composition` has populated the mixture
# dictionaries.
mixture_composition_loaded = False
# Has no effect at module level; the names are created by
# `load_mixture_composition`.
global common_mixtures_by_synonym, common_mixtures


@mark_numba_incompatible
def load_mixture_composition():
    """Read the mixture composition file and fill the module-level
    dictionaries `common_mixtures` (keyed by mixture name) and
    `common_mixtures_by_synonym` (keyed by lower-case synonym, including the
    lower-cased name), then mark the data as loaded.
    """
    global mixture_composition_loaded, common_mixtures_by_synonym, common_mixtures
    common_mixtures = {}
    common_mixtures_by_synonym = {}
    with open(os.path.join(folder, 'Mixtures Compositions.tsv')) as f:
        # Read in a dict of 90 or so mixtures, their components, and synonyms.
        # Small errors in mole fractions not adding to 1 are known.
        # Errors in adding mass fraction are less common, present at the
        # 5th decimal. Mass basis is assumed for all mixtures.
        next(f)  # the first line is not data
        for line in f:
            values = to_num(line.strip('\n').strip('\t').split('\t'))
            name, source, N = values[0:3]
            N = int(N)
            # Four consecutive groups of N columns follow the three header
            # columns: CAS numbers, names, mass fractions, mole fractions.
            CASs, names, ws, zs = (values[3 + k*N:3 + (k + 1)*N] for k in range(4))
            # Anything left over is a synonym; the name itself always is one.
            synonyms = [i.lower() for i in values[3 + 4*N:]]
            synonyms.append(name.lower())
            obj = CommonMixtureMetadata(name=name, CASs=CASs, N=N,
                                        source=source, names=names, ws=ws,
                                        zs=zs, synonyms=synonyms)
            common_mixtures[name] = obj
            for syn in synonyms:
                common_mixtures_by_synonym[syn] = obj
    mixture_composition_loaded = True


if PY37:
    def __getattr__(name):
        # Module-level attribute hook: create the lazily loaded objects the
        # first time they are requested from outside the module.
        if name == 'pubchem_db':
            return get_pubchem_db()
        if name in ('common_mixtures', 'common_mixtures_by_synonym'):
            load_mixture_composition()
            return globals()[name]
        raise AttributeError(f"module {__name__} has no attribute {name}")  # pragma: no cover
else: # pragma: no cover
    # Without the module attribute hook, load everything at import time.
    if can_load_data:
        get_pubchem_db()
        load_mixture_composition()