Add more structured syntax

This commit is contained in:
E Dunbar 2025-05-13 16:17:42 -05:00
parent d330f85c67
commit 66026fe74b
11 changed files with 18956 additions and 4528 deletions

View file

@ -0,0 +1,266 @@
#!/usr/bin/env hy
(import xml.etree.ElementTree :as ET)
(require hyrule [-> doto meth ncut])
(import catboost :as cb)
(import numpy :as np)
(import pandas :as pd)
(import rdkit [Chem RDLogger])
(import tqdm [tqdm])
(import maplight-gnn)
(defclass DrugBank []
(setv namespaces {"" "http://www.drugbank.ca"})
(defmacro ap-find [element name if-found]
`(do
(setv it (.find ~element ~name self.namespaces))
(if-let it ~if-found)))
(defmacro if-let [maybe execute]
`(when (is-not ~maybe None)
~execute))
(meth __init__ [@filename @ids @id-types names]
(setv @names (.str.lower names))
(setv @get-ids {"ChEBI" @chebi
"ChEMBL" @chembl
"drugbank-id" @drugbank
"InChIKey" @inchikey
"PubChem Compound" @pubchem-compound
"PubChem Substance" @pubchem-substance
"unii" @unii}))
(meth get-matches []
(for [#(_ element) (tqdm (ET.iterparse @filename ["end"]))]
;; don't care about non-drug entries
(when (!= (cut element.tag 24 None) "drug")
(continue))
(setv matches (@check-match element))
;; make sure there are matches before doing more work
(when (not (matches.any))
(continue))
(yield #(matches element))))
(meth check-match [element]
(setv matches (pd.Series False :index @ids.index))
(for [#(id-type id-func) (.items @get-ids)]
(setv id-val (id-func element))
(when (is id-val None) (continue))
(setv id-matches (& (= @id-types id-type) (= @ids id-val)))
(setv matches (| matches id-matches)))
;; names can't use the same logic as the other id types
(setv #(generic-names brand-names) (@all-names element))
(setv matches (| matches (@names.isin generic-names)))
(setv matches (| matches (@names.isin brand-names)))
(return matches))
(meth all-names [element]
(setv generic-names (set))
(setv brand-names (set))
(setv main-name (@name element))
(when (is-not main-name None) (generic-names.add (.lower main-name)))
(ap-find element "synonyms"
(for [synonym (.iter it)]
(when (and (is-not synonym None) (is-not synonym.text None))
(generic-names.add (.lower synonym.text)))))
(ap-find element "products"
(for [product (.iter it)]
(setv brand-name (product.find "name" @namespaces))
(if-let brand-name (brand-names.add (.lower brand-name.text)))))
(setv generic-names (tuple (filter (fn [s] (not-in "\n" s)) generic-names)))
(setv brand-names (tuple (filter (fn [s] (not-in "\n" s)) brand-names)))
(return #(generic-names brand-names)))
(meth cas-number [element]
(ap-find element "cas-number" it.text))
(meth chebi [element]
(@from-external-identifiers element "ChEBI"))
(meth chembl [element]
(@from-external-identifiers element "ChEMBL"))
(meth drugbank [element]
(ap-find element "drugbank-id" it.text))
(meth fda-approval [element]
(ap-find element "groups" (in "approved" (tuple (it.itertext)))))
(meth inchikey [element]
(@from-calculated-properties element "InChIKey"))
(meth indication [element]
(ap-find element "indication" it.text))
(meth mechanism [element]
(ap-find element "mechanism-of-action" it.text))
(meth name [element]
(ap-find element "name" it.text))
(meth prices [element]
(ap-find element "prices"
(do
(setv prices (list))
(for [price-element (it.iterfind "price" @namespaces)]
(setv price (price-element.find "cost" @namespaces))
(if-let price (.append prices (+ price.text (price.attrib.get "currency")))))
(return prices))))
(meth pubchem-compound [element]
(@from-external-identifiers element "PubChem Compound"))
(meth pubchem-substance [element]
(@from-external-identifiers element "PubChem Substance"))
(meth smiles [element]
(@from-calculated-properties element "SMILES"))
(meth unii [element]
(ap-find element "unii" it.text))
(meth from-external-identifiers [element resource-type]
(ap-find element "external-identifiers"
(for [external-identifier (it.iterfind "external-identifier" @namespaces)]
(when (= (external-identifier.findtext "resource" :namespaces @namespaces) resource-type)
(return (external-identifier.findtext "identifier" :namespaces @namespaces))))))
(meth from-calculated-properties [element kind-type]
(ap-find element "calculated-properties"
(for [property (it.iterfind "property" @namespaces)]
(when (= (property.findtext "kind" :namespaces @namespaces) kind-type)
(return (property.findtext "value" :namespaces @namespaces)))))))
(defclass DataAugmenter []
(defmacro create-var-column [var-name col-name col-initial-value]
`(do
(setv ~var-name ~col-name)
(setv (get self.drug-list ~var-name) ~col-initial-value)))
(meth __init__ [@filename]
(setv @drug-list None)
(setv @admet-models None))
(meth load-drug-queries []
(cond
(@filename.endswith ".csv")
(with [f (open @filename "r")]
(setv @drug-list (pd.read-csv f)))
(@filename.endswith ".json")
(with [f (open @filename "r")]
(setv @drug-list (pd.read-json f :orient "records")))
True
(raise (ValueError "Data file must be .csv or .json")))
(return self))
(meth load-admet-models [models]
(setv @admet-models (dict))
(for [#(name path) (models.items)]
(setv model (cb.CatBoostClassifier))
(model.load-model path)
(setv (get @admet-models name) model))
(return self))
(meth save-drug-info [filename]
(when (is @drug-list None)
(raise (ValueError "drug-list must be loaded first.")))
(with [f (open filename "w")]
(@drug-list.to-json f :orient "records")))
(meth match-drugbank [filename id-col-name id-type-col-name name-col-name]
(when (is @drug-list None)
(raise (ValueError "drug-list is not defined. Call load-drug-queries before match-drugbank.")))
;; make sure the cols are strings and not lists of strings
(setv unwrap-list (fn [x] (if (isinstance x list) (get x 0) x)))
(setv id-col (.apply (get @drug-list id-col-name) unwrap-list))
(setv id-type-col (.apply (get @drug-list id-type-col-name) unwrap-list))
(setv name-col (.apply (get @drug-list name-col-name) unwrap-list))
;; tedious column making for what we're about to store
;; variable name, column title, initial value
(create-var-column cas-column "CAS Registry Number" None)
(create-var-column fda-column "FDA Approved" None)
(create-var-column indication-column "Indication" None)
(create-var-column mechanism-column "Mechanism" None)
(create-var-column name-column "DrugBank Name" None)
(create-var-column price-column "Prices" (@drug-list.apply (fn [_] (list)) :axis 1))
(create-var-column smiles-column "SMILES" None)
(create-var-column unii-column "UNII" None)
(setv drugbank (DrugBank filename id-col id-type-col name-col))
(for [#(matches element) (drugbank.get-matches)]
(setv (ncut @drug-list.loc matches cas-column) (drugbank.cas-number element))
(setv (ncut @drug-list.loc matches fda-column) (drugbank.fda-approval element))
(setv (ncut @drug-list.loc matches indication-column) (drugbank.indication element))
(setv (ncut @drug-list.loc matches mechanism-column) (drugbank.mechanism element))
(setv (ncut @drug-list.loc matches name-column) (drugbank.name element))
(setv (ncut @drug-list.loc matches price-column)
(.apply (ncut @drug-list.loc matches price-column) (fn [_] (drugbank.prices element)))) ; prices is a list
(setv (ncut @drug-list.loc matches smiles-column) (drugbank.smiles element))
(setv (ncut @drug-list.loc matches unii-column) (drugbank.unii element))))
(meth deduplicate []
(when (is @drug-list None)
(raise (ValueError "drug-list is not defined. Call load-drug-queries before deduplicate.")))
(when (not-in "DrugBank Name" @drug-list.columns)
(raise (ValueError "ID data does not exist yet. Run match-drugbank to create it.")))
(setv @drug-list
(-> @drug-list
(.groupby "DrugBank Name")
(.agg
(fn [x]
(setv y [])
(for [item x]
(if (isinstance item list)
(y.extend item)
(y.append item)))
(setv z (set y))
(z.discard None)
(cond
(= (len z) 0) None
(= (len z) 1) (.pop z)
True z)))
(.reset-index))))
(meth predict-admet []
(when (is @drug-list None)
(raise (ValueError "drug-list is not defined. Call load-drug-queries before predict-admet.")))
(when (is @admet-models None)
(raise (ValueError "admet-models is not defined. Call load-admet-models before predict-admet.")))
(when (not-in "SMILES" @drug-list.columns)
(raise (ValueError "SMILES data does not exist yet. Run match-drugbank to create it.")))
(RDLogger.DisableLog "rdApp.*")
(setv smiles-mask (.notna (get @drug-list "SMILES")))
(setv smiles (ncut @drug-list.loc smiles-mask "SMILES"))
(setv molecules (smiles.apply Chem.MolFromSmiles))
(setv molecules-mask (.notna molecules))
(setv fingerprints (@get-fingerprints (get molecules molecules-mask)))
(setv combined-mask (pd.Series False :index @drug-list.index))
(setv (ncut combined-mask.loc (. (get smiles molecules-mask) index)) True)
(for [#(name model) (@admet-models.items)]
(setv predictions (model.predict-proba fingerprints))
(setv (ncut @drug-list.loc combined-mask name) (ncut predictions : 1))))
(meth get-fingerprints [molecules]
(setv fingerprints (list))
(fingerprints.append (maplight-gnn.get-morgan-fingerprints molecules))
(fingerprints.append (maplight-gnn.get-avalon-fingerprints molecules))
(fingerprints.append (maplight-gnn.get-erg-fingerprints molecules))
(fingerprints.append (maplight-gnn.get-rdkit-features molecules))
(fingerprints.append (maplight-gnn.get-gin-supervised-masking molecules))
(np.concatenate fingerprints :axis 1)))
(when (= __name__ "__main__")
(setv augmenter
(-> (DataAugmenter "data/translator_drugs.json")
(.load-drug-queries)
(.load-admet-models {"Blood Brain Barrier" "data/admet/bbb_martins-0.916-0.002.dump" "Bioavailability" "data/admet/bioavailability_ma-0.74-0.01.dump" "Human Intestinal Absorption" "data/admet/hia_hou-0.989-0.001.dump"})))
(doto augmenter
(.match-drugbank "data/src/drugbank.xml" "result_id" "id_type" "result_name")
(.deduplicate)
(.predict-admet)
(.save-drug-info "data/translator_drug_list.json")))