Add more structured syntax

This commit is contained in:
E Dunbar 2025-05-13 16:17:42 -05:00
parent d330f85c67
commit 66026fe74b
11 changed files with 18956 additions and 4528 deletions

View file

@ -9,9 +9,8 @@
const regexp = { const regexp = {
ascii_whitespace: /[\u0009\u000A\u000B\u000C\u000D\u0020]/, ascii_whitespace: /[\u0009\u000A\u000B\u000C\u000D\u0020]/,
symbol_seq: /[^()\[\]{};"'`~:.\d\u0009\u000A\u000B\u000C\u000D\u0020][^()\[\]{};"'`~.\u0009\u000A\u000B\u000C\u000D\u0020]*/
} }
const symbol_seq_immediate = token.immediate(regexp.symbol_seq) const plus_minus = choice('+', '-')
const digitpart = seq(/\d/, repeat(/_*\d+/)) const digitpart = seq(/\d/, repeat(/_*\d+/))
const pointfloat = choice( const pointfloat = choice(
seq(optional(digitpart), '.', digitpart), seq(optional(digitpart), '.', digitpart),
@ -19,7 +18,7 @@ const pointfloat = choice(
) )
const exponentfloat = seq( const exponentfloat = seq(
choice(digitpart, pointfloat), choice(digitpart, pointfloat),
seq(/[eE]/, optional(/[+-]/), digitpart), seq(choice('e', 'E'), optional(plus_minus), digitpart),
) )
module.exports = grammar({ module.exports = grammar({
@ -30,50 +29,65 @@ module.exports = grammar({
$.comment, $.comment,
], ],
// word: $ => $.symbol, word: $ => $.symbol,
rules: { rules: {
// SYNTACTIC ELEMENTS
source_file: $ => seq(optional($.shebang), repeat($._element)), source_file: $ => seq(optional($.shebang), repeat($._element)),
shebang: _ => token(seq('#!', /.*/)), shebang: _ => token(seq('#!', /.*/)),
_element: $ => choice($._form, $.discard, $.comment), _element: $ => choice(
$._form,
$.discard,
$.comment,
$.import,
$.require,
$.function,
$.lambda,
$.class,
$.macro,
$.reader,
),
_form: $ => seq(optional($._sugar), choice($._identifier, $._sequence, $._string)), _form: $ => seq(optional($.sugar), choice($._identifier, $._sequence, $._string)),
discard: $ => seq('#_', $._form), discard: $ => seq('#_', $._form),
comment: _ => token(seq(';', /.*/)), comment: _ => token(seq(';', /.*/)),
_sugar: _ => choice( sugar: _ => choice(
field('quote', '\''), '\'',
field('quasiquote', '`'), '`',
field('unqoute', '~'), '~',
field('unqoute_splice', '~@'), '~@',
field('unpack_iterable', '#*'), '#*',
field('unpack_mapping', '#**'), '#**',
), ),
_identifier: $ => choice( _identifier: $ => choice(
$._numeric_literal, $._numeric_literal,
$.keyword, $.keyword,
$.symbol, $._symbol_or_dots,
$.dotted_identifier, $.dotted_identifier,
), ),
_sequence: $ => choice($.expression, $.list, $.tuple, $.set, $.dictionary), _sequence: $ => choice($.expression, $.list, $.tuple, $.set, $.dictionary),
_string: $ => choice($.string, $.bracket_string), _string: $ => choice($.string, $.bracket_string),
_numeric_literal: $ => choice($.integer, $.float, $.complex), _numeric_literal: $ => choice($.integer, $.float, $.complex),
keyword: _ => token(seq(':', optional(regexp.symbol_seq))), keyword: $ => prec.right(seq(
dotted_identifier: _ => prec(1, choice( ':',
optional($.immediate_symbol),
)),
_symbol_or_dots: $ => choice(
$.symbol,
$.dots,
),
dotted_identifier: $ => choice(
seq( seq(
/[.]+/, /[.]+/,
symbol_seq_immediate, $.immediate_symbol,
repeat(seq(token.immediate('.'), symbol_seq_immediate)), repeat(seq(token.immediate('.'), $.immediate_symbol)),
), ),
seq( seq(
regexp.symbol_seq, field("sym", $.symbol),
repeat1((seq(token.immediate('.'), symbol_seq_immediate)))), repeat1(seq(token.immediate('.'), $.immediate_symbol))),
)),
symbol: _ => choice(
/[.]+/,
regexp.symbol_seq,
), ),
expression: $ => seq('(', repeat1($._element), ')'), expression: $ => seq('(', repeat1($._element), ')'),
@ -91,32 +105,219 @@ module.exports = grammar({
'}' '}'
), ),
string: _ => token(seq( string: _ => seq(
/[rbf]{0,3}/, /[rbf]*"/,
'"', field("content", /[^"]*/),
/[^"]*/,
'"' '"'
)), ),
bracket_string: _ => token(seq('#[[', /[^\]]*/, ']]')), bracket_string: _ => seq(
'#[[',
field("content", /[^\]]*/),
']]'
),
integer: $ => choice($._decinteger, $._bininteger, $._octinteger, $._hexinteger), integer: $ => choice($._decinteger, $._bininteger, $._octinteger, $._hexinteger),
float: _ => token(prec(1, seq( float: _ => token(prec(1, seq(
optional(/[+-]/), optional(plus_minus),
choice(pointfloat, exponentfloat, 'Inf', 'NaN'), choice(pointfloat, exponentfloat, 'Inf', 'NaN'),
))), ))),
complex: _ => token(prec(1, seq( complex: _ => token(prec(1, seq(
optional(/[+-]/), optional(plus_minus),
choice(pointfloat, exponentfloat, digitpart, 'Inf', 'NaN'), choice(pointfloat, exponentfloat, digitpart, 'Inf', 'NaN'),
/[+-]/, plus_minus,
seq( seq(
choice(pointfloat, exponentfloat, digitpart, 'NaN', 'Inf'), choice(pointfloat, exponentfloat, digitpart, 'NaN', 'Inf'),
/[jJ]/, choice('j', 'J'),
), ),
))), ))),
_decinteger: _ => token(prec(1, seq(optional(/[+-]/), /\d/, repeat(/[,_]*\d+/)))), symbol: _ => token(seq(
/[^()\[\]{};"'`~:.\d\u0009\u000A\u000B\u000C\u000D\u0020]/,
repeat(/[^()\[\]{};"'`~.\u0009\u000A\u000B\u000C\u000D\u0020]/),
)),
immediate_symbol: _ => token.immediate(seq(
/[^()\[\]{};"'`~:.\d\u0009\u000A\u000B\u000C\u000D\u0020]/,
repeat(/[^()\[\]{};"'`~.\u0009\u000A\u000B\u000C\u000D\u0020]/),
)),
dots: _ => /[.]+/,
_decinteger: _ => token(prec(1, seq(optional(plus_minus), /\d/, repeat(/[,_]*\d+/)))),
_bininteger: _ => token(prec(1, seq('0', /[bB]/, repeat(/[,_]*[01]+/)))), _bininteger: _ => token(prec(1, seq('0', /[bB]/, repeat(/[,_]*[01]+/)))),
_octinteger: _ => token(prec(1, seq('0', /[oO]/, repeat(/[,_]*[0-7]+/)))), _octinteger: _ => token(prec(1, seq('0', /[oO]/, repeat(/[,_]*[0-7]+/)))),
_hexinteger: _ => token(prec(1, seq('0', /[xX]/, repeat(/[,_]*[\da-fA-F]+/)))), _hexinteger: _ => token(prec(1, seq('0', /[xX]/, repeat(/[,_]*[\da-fA-F]+/)))),
// STRUCTURED SYNTAX
import: $ => seq(
'(',
'import',
repeat1(
choice(
$.module_import,
$.named_import,
),
),
')'
),
require: $ => seq(
'(',
'require',
repeat1(
choice(
$.module_import,
$.named_import,
$.namespace_require,
),
),
')'
),
function: $ => seq(
'(',
'defn',
optional(
seq(
':',
token.immediate('async'),
)
),
field('decorators', optional($.variable_list)),
optional($.type_parameters),
optional($.type_annotation),
field('name', $.symbol),
$.parameter_list,
repeat($._element),
')',
),
lambda: $ => seq(
'(',
'fn',
optional(
seq(
':',
token.immediate('async'),
)
),
$.parameter_list,
repeat($._element),
')',
),
class: $ => seq(
'(',
'defclass',
field('decorators', optional($.variable_list)),
optional($.type_parameters),
field('name', $.symbol),
field('superclasses', $.variable_list),
repeat($._element),
')',
),
macro: $ => seq(
'(',
'defmacro',
field('name', $.symbol),
$.parameter_list,
repeat($._element),
')',
),
reader: $ => seq(
'(',
'defreader',
field('name', $.symbol),
repeat($._element),
')',
),
module_import: $ => seq(
choice(
seq($._variable, optional('*')),
$.aliased_import,
),
),
named_import: $ => seq(
$._variable,
seq(
'[',
repeat1(
choice(
$.symbol,
$.aliased_import,
),
),
']',
),
),
namespace_require: $ => seq(
$._variable,
choice(
repeat1(
seq(
':',
choice(
token.immediate('macros'),
token.immediate('readers'),
),
'[',
repeat1(
choice(
$.symbol,
$.aliased_import,
),
),
']',
),
),
seq(
$.keyword,
'*',
),
),
),
variable_list: $ => seq(
'[',
repeat1($._variable),
']',
),
type_parameters: $ => seq(
':',
token.immediate('tp'),
'[',
repeat1($._variable),
']',
),
type_annotation: $ => seq(
'#^',
field('type', $._variable),
),
parameter_list: $ => seq(
'[',
repeat(
choice(
$.symbol,
seq(
'[',
$.symbol,
$._form,
']',
),
'/',
'*',
'#*',
'#**',
),
),
']',
),
_variable: $ => choice(
$.symbol,
$.dotted_identifier,
),
aliased_import: $ => seq(
$._variable,
':',
token.immediate('as'),
$.symbol,
),
} }
}); });

View file

@ -1,16 +1,28 @@
; Variables ; Variables
(symbol) @variable [
(symbol)
(immediate_symbol)
] @variable
(keyword) @property (keyword) @property
; Symbol naming conventions ; Symbol naming conventions
((symbol) @type ([
(symbol)
(immediate_symbol)
] @type
(#lua-match? @type "^[A-Z].*[a-z]")) (#lua-match? @type "^[A-Z].*[a-z]"))
((symbol) @constant ([
(symbol)
(immediate_symbol)
] @constant
(#lua-match? @constant "^[A-Z][A-Z0-9_-]*$")) (#lua-match? @constant "^[A-Z][A-Z0-9_-]*$"))
((symbol) @constant.builtin ([
(symbol)
(immediate_symbol)
] @constant.builtin
(#lua-match? @constant.builtin "^__[a-zA-Z0-9_-]*__$")) (#lua-match? @constant.builtin "^__[a-zA-Z0-9_-]*__$"))
((symbol) @constant.builtin ((symbol) @constant.builtin
@ -186,3 +198,6 @@
(dotted_identifier (dotted_identifier
"." @punctuation.delimiter) "." @punctuation.delimiter)
(keyword
":" @punctuation.delimiter)

1178
src/grammar.json generated

File diff suppressed because it is too large Load diff

2119
src/node-types.json generated

File diff suppressed because it is too large Load diff

18927
src/parser.c generated

File diff suppressed because it is too large Load diff

View file

@ -8,9 +8,9 @@ keyword
--- ---
(source_file (source_file
(keyword) (keyword (immediate_symbol))
(expression (expression
(symbol) (keyword) (integer))) (symbol) (keyword (immediate_symbol)) (integer)))
================== ==================
dotted identifiers dotted identifiers
@ -25,12 +25,12 @@ dotted identifiers
(source_file (source_file
(expression (expression
(dotted_identifier)) (dotted_identifier (symbol) (immediate_symbol) (immediate_symbol)))
(expression (expression
(dotted_identifier)) (dotted_identifier (immediate_symbol) (immediate_symbol)))
(expression (expression
(dotted_identifier)) (dotted_identifier (immediate_symbol) (immediate_symbol)))
(dotted_identifier)) (dotted_identifier (immediate_symbol) (immediate_symbol)))
====================== ======================
not dotted identifiers not dotted identifiers
@ -40,11 +40,14 @@ not dotted identifiers
... ...
........ ........
. . . .
(. foo bar)
--- ---
(source_file (source_file
(symbol) (dots)
(symbol) (dots)
(symbol) (dots)
(symbol) (symbol)) (dots) (dots)
(expression
(dots) (symbol) (symbol)))

View file

@ -7,8 +7,9 @@ expression
--- ---
(source_file (source_file
(expression (expression
(symbol) (symbol))) (symbol)
(symbol)))
==== ====
list list
@ -20,9 +21,10 @@ list
--- ---
(source_file (source_file
(list (list
(symbol) (symbol)) (symbol)
(list)) (symbol))
(list))
===== =====
tuple tuple
@ -34,9 +36,10 @@ tuple
--- ---
(source_file (source_file
(tuple (tuple
(symbol) (symbol)) (symbol)
(tuple)) (symbol))
(tuple))
=== ===
set set
@ -48,9 +51,10 @@ set
--- ---
(source_file (source_file
(set (set
(symbol) (symbol)) (symbol)
(set)) (symbol))
(set))
========== ==========
dictionary dictionary
@ -62,9 +66,10 @@ dictionary
--- ---
(source_file (source_file
(dictionary (dictionary
(symbol) (symbol)) (symbol)
(dictionary)) (symbol))
(dictionary))
=================== ===================
function definition function definition
@ -76,7 +81,9 @@ function definition
--- ---
(source_file (source_file
(function
(symbol)
(parameter_list)
(expression (expression
(symbol) (symbol) (list) (symbol)
(expression (string))))
(symbol) (string))))

287
test/corpus/structured.txt Normal file
View file

@ -0,0 +1,287 @@
======
import
======
(import sys os.path)
(import os.path [exists isdir :as is-dir isfile])
(import sys :as systest)
(import sys *)
(import tests.resources [kwtest function-with-a-dash]
os.path [exists
isdir :as is-dir
isfile :as is-file]
sys :as systest
math *)
---
(source_file
(import
(module_import
(symbol))
(module_import
(dotted_identifier
(symbol)
(immediate_symbol))))
(import
(named_import
(dotted_identifier
(symbol)
(immediate_symbol))
(symbol)
(aliased_import
(symbol)
(symbol))
(symbol)))
(import
(module_import
(aliased_import
(symbol)
(symbol))))
(import
(module_import
(symbol)))
(import
(named_import
(dotted_identifier
(symbol)
(immediate_symbol))
(symbol)
(symbol))
(named_import
(dotted_identifier
(symbol)
(immediate_symbol))
(symbol)
(aliased_import
(symbol)
(symbol))
(aliased_import
(symbol)
(symbol)))
(module_import
(aliased_import
(symbol)
(symbol)))
(module_import
(symbol))))
=======
require
=======
(require mymodule)
(require mymodule :as M)
(require mymodule [foo])
(require mymodule *)
(require mymodule [foo :as bar])
(require mymodule :macros [foo] :readers [spiff])
(require mymodule
mymodule :readers [spiff])
---
(source_file
(require
(module_import
(symbol)))
(require
(module_import
(aliased_import
(symbol)
(symbol))))
(require
(named_import
(symbol)
(symbol)))
(require
(module_import
(symbol)))
(require
(named_import
(symbol)
(aliased_import
(symbol)
(symbol))))
(require
(namespace_require
(symbol)
(symbol)
(symbol)))
(require
(module_import
(symbol))
(namespace_require
(symbol)
(symbol))))
========
function
========
(defn name [params] bodyform1 bodyform2)
(defn :async [decorator1 decorator2] :tp [T1 T2] #^ annotation name [params])
(defn f [a / b [c 3] * d e #** kwargs]
[a b c d e kwargs])
---
(source_file
(function
(symbol)
(parameter_list
(symbol))
(symbol)
(symbol))
(function
(variable_list
(symbol)
(symbol))
(type_parameters
(symbol)
(symbol))
(type_annotation
(symbol))
(symbol)
(parameter_list
(symbol)))
(function
(symbol)
(parameter_list
(symbol)
(symbol)
(symbol)
(integer)
(symbol)
(symbol)
(symbol))
(list
(symbol)
(symbol)
(symbol)
(symbol)
(symbol)
(symbol))))
======
lambda
======
(fn [x] (print x))
(fn :async [x])
---
(source_file
(lambda
(parameter_list
(symbol))
(expression
(symbol)
(symbol)))
(lambda
(parameter_list
(symbol))))
=====
class
=====
(defclass [decorator1 decorator2] :tp [T1 T2] MyClass [SuperClass1 SuperClass2]
"A class that does things at times."
(setv
attribute1 value1
attribute2 value2)
(defn method1 [self arg1 arg2])
(defn method2 [self arg1 arg2]))
---
(source_file
(class
(variable_list
(symbol)
(symbol))
(type_parameters
(symbol)
(symbol))
(symbol)
(variable_list
(symbol)
(symbol))
(string)
(expression
(symbol)
(symbol)
(symbol)
(symbol)
(symbol))
(function
(symbol)
(parameter_list
(symbol)
(symbol)
(symbol)))
(function
(symbol)
(parameter_list
(symbol)
(symbol)
(symbol)))))
=====
macro
=====
(defmacro hypotenuse [a b]
(import math)
`(math.sqrt (+ (** ~a 2) (** ~b 2))))
---
(source_file
(macro
(symbol)
(parameter_list
(symbol)
(symbol))
(import
(module_import
(symbol)))
(sugar)
(expression
(dotted_identifier
(symbol)
(immediate_symbol))
(expression
(symbol)
(expression
(symbol)
(sugar)
(symbol)
(integer))
(expression
(symbol)
(sugar)
(symbol)
(integer))))))
======
reader
======
(defreader hi
'(print "Hello."))
---
(source_file
(reader
(symbol)
(sugar)
(expression
(symbol)
(string))))

View file

@ -0,0 +1,266 @@
#!/usr/bin/env hy
(import xml.etree.ElementTree :as ET)
(require hyrule [-> doto meth ncut])
(import catboost :as cb)
(import numpy :as np)
(import pandas :as pd)
(import rdkit [Chem RDLogger])
(import tqdm [tqdm])
(import maplight-gnn)
;; DrugBank: scans a DrugBank XML export and reads individual fields out of
;; its <drug> elements, matching them against a set of query ids/names.
(defclass DrugBank []
;; Default-namespace mapping passed to every find/iterfind/findtext call.
(setv namespaces {"" "http://www.drugbank.ca"})
;; Anaphoric find: bind `it` to the child `name` of `element` and evaluate
;; `if-found` only when that child exists (otherwise the form yields None).
(defmacro ap-find [element name if-found]
`(do
(setv it (.find ~element ~name self.namespaces))
(if-let it ~if-found)))
;; Evaluate `execute` only when `maybe` is not None.
(defmacro if-let [maybe execute]
`(when (is-not ~maybe None)
~execute))
;; Store the query columns; `names` is lower-cased once here so that the
;; comparisons in check-match can use lower-cased DrugBank names.
(meth __init__ [@filename @ids @id-types names]
(setv @names (.str.lower names))
;; id-type label -> method that extracts that identifier from a drug element
(setv @get-ids {"ChEBI" @chebi
"ChEMBL" @chembl
"drugbank-id" @drugbank
"InChIKey" @inchikey
"PubChem Compound" @pubchem-compound
"PubChem Substance" @pubchem-substance
"unii" @unii}))
;; Generator: stream the XML file and yield #(matches element) for every
;; drug element that matches at least one query row.
(meth get-matches []
(for [#(_ element) (tqdm (ET.iterparse @filename ["end"]))]
;; don't care about non-drug entries
;; (the first 24 characters of the tag are cut off before comparing --
;; presumably the "{http://www.drugbank.ca}" namespace prefix)
(when (!= (cut element.tag 24 None) "drug")
(continue))
(setv matches (@check-match element))
;; make sure there are matches before doing more work
(when (not (matches.any))
(continue))
(yield #(matches element))))
;; Return a boolean Series (indexed like @ids) that is True for each query
;; row matching `element` by identifier or by generic/brand name.
(meth check-match [element]
(setv matches (pd.Series False :index @ids.index))
(for [#(id-type id-func) (.items @get-ids)]
(setv id-val (id-func element))
(when (is id-val None) (continue))
(setv id-matches (& (= @id-types id-type) (= @ids id-val)))
(setv matches (| matches id-matches)))
;; names can't use the same logic as the other id types
(setv #(generic-names brand-names) (@all-names element))
(setv matches (| matches (@names.isin generic-names)))
(setv matches (| matches (@names.isin brand-names)))
(return matches))
;; Collect lower-cased names for `element`: the main name plus synonyms
;; (generic) and product names (brand). Returns #(generic-names brand-names)
;; as tuples, with any name containing a newline dropped.
(meth all-names [element]
(setv generic-names (set))
(setv brand-names (set))
(setv main-name (@name element))
(when (is-not main-name None) (generic-names.add (.lower main-name)))
(ap-find element "synonyms"
(for [synonym (.iter it)]
(when (and (is-not synonym None) (is-not synonym.text None))
(generic-names.add (.lower synonym.text)))))
(ap-find element "products"
(for [product (.iter it)]
(setv brand-name (product.find "name" @namespaces))
(if-let brand-name (brand-names.add (.lower brand-name.text)))))
(setv generic-names (tuple (filter (fn [s] (not-in "\n" s)) generic-names)))
(setv brand-names (tuple (filter (fn [s] (not-in "\n" s)) brand-names)))
(return #(generic-names brand-names)))
;; --- Single-field accessors: each returns None when the field is absent ---
(meth cas-number [element]
(ap-find element "cas-number" it.text))
(meth chebi [element]
(@from-external-identifiers element "ChEBI"))
(meth chembl [element]
(@from-external-identifiers element "ChEMBL"))
(meth drugbank [element]
(ap-find element "drugbank-id" it.text))
;; True/False depending on whether the text "approved" appears under <groups>.
(meth fda-approval [element]
(ap-find element "groups" (in "approved" (tuple (it.itertext)))))
(meth inchikey [element]
(@from-calculated-properties element "InChIKey"))
(meth indication [element]
(ap-find element "indication" it.text))
(meth mechanism [element]
(ap-find element "mechanism-of-action" it.text))
(meth name [element]
(ap-find element "name" it.text))
;; List of "<cost text><currency attribute>" strings, one per <price> that
;; has a <cost> child; None when there is no <prices> element.
(meth prices [element]
(ap-find element "prices"
(do
(setv prices (list))
(for [price-element (it.iterfind "price" @namespaces)]
(setv price (price-element.find "cost" @namespaces))
(if-let price (.append prices (+ price.text (price.attrib.get "currency")))))
(return prices))))
(meth pubchem-compound [element]
(@from-external-identifiers element "PubChem Compound"))
(meth pubchem-substance [element]
(@from-external-identifiers element "PubChem Substance"))
(meth smiles [element]
(@from-calculated-properties element "SMILES"))
(meth unii [element]
(ap-find element "unii" it.text))
;; Return the <identifier> text of the first <external-identifier> whose
;; <resource> equals `resource-type`.
(meth from-external-identifiers [element resource-type]
(ap-find element "external-identifiers"
(for [external-identifier (it.iterfind "external-identifier" @namespaces)]
(when (= (external-identifier.findtext "resource" :namespaces @namespaces) resource-type)
(return (external-identifier.findtext "identifier" :namespaces @namespaces))))))
;; Return the <value> text of the first <property> whose <kind> equals
;; `kind-type`.
(meth from-calculated-properties [element kind-type]
(ap-find element "calculated-properties"
(for [property (it.iterfind "property" @namespaces)]
(when (= (property.findtext "kind" :namespaces @namespaces) kind-type)
(return (property.findtext "value" :namespaces @namespaces)))))))
;; DataAugmenter: loads a table of drug queries, annotates it with DrugBank
;; fields, merges duplicate rows and adds ADMET model predictions.
(defclass DataAugmenter []
;; Bind `var-name` to the column title `col-name` and create that column on
;; self.drug-list with `col-initial-value`.
(defmacro create-var-column [var-name col-name col-initial-value]
`(do
(setv ~var-name ~col-name)
(setv (get self.drug-list ~var-name) ~col-initial-value)))
;; Nothing is loaded at construction time; both tables start as None.
(meth __init__ [@filename]
(setv @drug-list None)
(setv @admet-models None))
;; Read @filename into @drug-list (.csv or .json only); returns self so
;; calls can be chained.
(meth load-drug-queries []
(cond
(@filename.endswith ".csv")
(with [f (open @filename "r")]
(setv @drug-list (pd.read-csv f)))
(@filename.endswith ".json")
(with [f (open @filename "r")]
(setv @drug-list (pd.read-json f :orient "records")))
True
(raise (ValueError "Data file must be .csv or .json")))
(return self))
;; `models` maps a display name to a model file path; each is loaded into a
;; CatBoostClassifier and stored under its name. Returns self.
(meth load-admet-models [models]
(setv @admet-models (dict))
(for [#(name path) (models.items)]
(setv model (cb.CatBoostClassifier))
(model.load-model path)
(setv (get @admet-models name) model))
(return self))
;; Write @drug-list to `filename` as record-oriented JSON.
(meth save-drug-info [filename]
(when (is @drug-list None)
(raise (ValueError "drug-list must be loaded first.")))
(with [f (open filename "w")]
(@drug-list.to-json f :orient "records")))
;; Add DrugBank-derived columns to @drug-list and fill them for every row
;; that matches a drug in the XML file `filename`. The three *-col-name
;; arguments name the @drug-list columns holding the query id, id type and
;; name.
(meth match-drugbank [filename id-col-name id-type-col-name name-col-name]
(when (is @drug-list None)
(raise (ValueError "drug-list is not defined. Call load-drug-queries before match-drugbank.")))
;; make sure the cols are strings and not lists of strings
(setv unwrap-list (fn [x] (if (isinstance x list) (get x 0) x)))
(setv id-col (.apply (get @drug-list id-col-name) unwrap-list))
(setv id-type-col (.apply (get @drug-list id-type-col-name) unwrap-list))
(setv name-col (.apply (get @drug-list name-col-name) unwrap-list))
;; tedious column making for what we're about to store
;; variable name, column title, initial value
(create-var-column cas-column "CAS Registry Number" None)
(create-var-column fda-column "FDA Approved" None)
(create-var-column indication-column "Indication" None)
(create-var-column mechanism-column "Mechanism" None)
(create-var-column name-column "DrugBank Name" None)
(create-var-column price-column "Prices" (@drug-list.apply (fn [_] (list)) :axis 1))
(create-var-column smiles-column "SMILES" None)
(create-var-column unii-column "UNII" None)
(setv drugbank (DrugBank filename id-col id-type-col name-col))
(for [#(matches element) (drugbank.get-matches)]
(setv (ncut @drug-list.loc matches cas-column) (drugbank.cas-number element))
(setv (ncut @drug-list.loc matches fda-column) (drugbank.fda-approval element))
(setv (ncut @drug-list.loc matches indication-column) (drugbank.indication element))
(setv (ncut @drug-list.loc matches mechanism-column) (drugbank.mechanism element))
(setv (ncut @drug-list.loc matches name-column) (drugbank.name element))
(setv (ncut @drug-list.loc matches price-column)
(.apply (ncut @drug-list.loc matches price-column) (fn [_] (drugbank.prices element)))) ; prices is a list
(setv (ncut @drug-list.loc matches smiles-column) (drugbank.smiles element))
(setv (ncut @drug-list.loc matches unii-column) (drugbank.unii element))))
;; Collapse rows that share a "DrugBank Name" into one row per name.
(meth deduplicate []
(when (is @drug-list None)
(raise (ValueError "drug-list is not defined. Call load-drug-queries before deduplicate.")))
(when (not-in "DrugBank Name" @drug-list.columns)
(raise (ValueError "ID data does not exist yet. Run match-drugbank to create it.")))
(setv @drug-list
(-> @drug-list
(.groupby "DrugBank Name")
(.agg
;; Per-column aggregator: flatten list cells, drop None, then return
;; None (nothing left), the single value, or the set of distinct values.
(fn [x]
(setv y [])
(for [item x]
(if (isinstance item list)
(y.extend item)
(y.append item)))
(setv z (set y))
(z.discard None)
(cond
(= (len z) 0) None
(= (len z) 1) (.pop z)
True z)))
(.reset-index))))
;; Run every loaded ADMET model on the rows whose SMILES parses to a
;; molecule and store the predictions in a column named after the model.
(meth predict-admet []
(when (is @drug-list None)
(raise (ValueError "drug-list is not defined. Call load-drug-queries before predict-admet.")))
(when (is @admet-models None)
(raise (ValueError "admet-models is not defined. Call load-admet-models before predict-admet.")))
(when (not-in "SMILES" @drug-list.columns)
(raise (ValueError "SMILES data does not exist yet. Run match-drugbank to create it.")))
(RDLogger.DisableLog "rdApp.*")
(setv smiles-mask (.notna (get @drug-list "SMILES")))
(setv smiles (ncut @drug-list.loc smiles-mask "SMILES"))
(setv molecules (smiles.apply Chem.MolFromSmiles))
(setv molecules-mask (.notna molecules))
(setv fingerprints (@get-fingerprints (get molecules molecules-mask)))
;; combined-mask is True only for rows that had a SMILES string and
;; produced a molecule, i.e. exactly the rows fingerprints were built for.
(setv combined-mask (pd.Series False :index @drug-list.index))
(setv (ncut combined-mask.loc (. (get smiles molecules-mask) index)) True)
(for [#(name model) (@admet-models.items)]
(setv predictions (model.predict-proba fingerprints))
;; column 1 of predict-proba -- presumably the positive-class probability
(setv (ncut @drug-list.loc combined-mask name) (ncut predictions : 1))))
;; Concatenate the five maplight-gnn feature blocks along axis 1.
(meth get-fingerprints [molecules]
(setv fingerprints (list))
(fingerprints.append (maplight-gnn.get-morgan-fingerprints molecules))
(fingerprints.append (maplight-gnn.get-avalon-fingerprints molecules))
(fingerprints.append (maplight-gnn.get-erg-fingerprints molecules))
(fingerprints.append (maplight-gnn.get-rdkit-features molecules))
(fingerprints.append (maplight-gnn.get-gin-supervised-masking molecules))
(np.concatenate fingerprints :axis 1)))
;; Script entry point: load the query list and the three ADMET models, then
;; annotate from DrugBank, deduplicate, predict and save -- in that order,
;; since deduplicate and predict-admet both require match-drugbank's columns.
(when (= __name__ "__main__")
(setv augmenter
(-> (DataAugmenter "data/translator_drugs.json")
(.load-drug-queries)
(.load-admet-models {"Blood Brain Barrier" "data/admet/bbb_martins-0.916-0.002.dump" "Bioavailability" "data/admet/bioavailability_ma-0.74-0.01.dump" "Human Intestinal Absorption" "data/admet/hia_hou-0.989-0.001.dump"})))
;; doto threads `augmenter` through each call as the receiver
(doto augmenter
(.match-drugbank "data/src/drugbank.xml" "result_id" "id_type" "result_name")
(.deduplicate)
(.predict-admet)
(.save-drug-info "data/translator_drug_list.json")))

View file

@ -0,0 +1,347 @@
import hy
import xml.etree.ElementTree as ET
hy.macros.require('hyrule', None, target_module_name='data_augmenter', assignments=[['->', '->'], ['doto', 'doto'], ['meth', 'meth'], ['ncut', 'ncut']], prefix='')
import catboost as cb
import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger
from tqdm import tqdm
import maplight_gnn
class DrugBank:
    """Match query drugs against a DrugBank XML export and read drug fields.

    Attributes set by ``__init__``:
        filename: path handed to ``ET.iterparse``.
        ids, id_types, names: the query columns. They are used through the
            pandas Series API (``.index``, ``.str``, ``.isin``, ``==``) and
            are assumed to share one index -- TODO confirm with the caller.
            ``names`` is stored lower-cased.
        get_ids: maps an id-type label to the bound method that extracts
            that identifier from a drug element.
    """

    # Default-namespace mapping passed to every find/iterfind/findtext call.
    namespaces = {'': 'http://www.drugbank.ca'}
    # Compiled bodies of the Hy macros `ap-find` and `if-let`, kept verbatim
    # from the translation; they only build Hy model objects when called.
    _hy_local_macro__ap_find = lambda element, name, if_found: hy.models.Expression([hy.models.Symbol('do', from_parser=True), hy.models.Expression([hy.models.Symbol('setv', from_parser=True), hy.models.Symbol('it', from_parser=True), hy.models.Expression([hy.models.Expression([hy.models.Symbol('.', from_parser=True), hy.models.Symbol('None', from_parser=True), hy.models.Symbol('find', from_parser=True)]), element, name, hy.models.Expression([hy.models.Symbol('.', from_parser=True), hy.models.Symbol('self', from_parser=True), hy.models.Symbol('namespaces', from_parser=True)])])]), hy.models.Expression([hy.models.Symbol('if-let', from_parser=True), hy.models.Symbol('it', from_parser=True), if_found])])
    _hy_local_macro__if_let = lambda maybe, execute: hy.models.Expression([hy.models.Symbol('when', from_parser=True), hy.models.Expression([hy.models.Symbol('is-not', from_parser=True), maybe, hy.models.Symbol('None', from_parser=True)]), execute])

    def __init__(self, filename, ids, id_types, names):
        """Store the query columns and build the id-type dispatch table."""
        self.filename = filename
        self.ids = ids
        self.id_types = id_types
        # Lower-cased once so name matching in check_match is case-insensitive
        # against the lower-cased names produced by all_names.
        self.names = names.str.lower()
        self.get_ids = {
            'ChEBI': self.chebi,
            'ChEMBL': self.chembl,
            'drugbank-id': self.drugbank,
            'InChIKey': self.inchikey,
            'PubChem Compound': self.pubchem_compound,
            'PubChem Substance': self.pubchem_substance,
            'unii': self.unii,
        }

    def _child_text(self, element, tag):
        """Return the text of child ``tag`` of ``element``, or None if absent."""
        it = element.find(tag, self.namespaces)
        return it.text if it is not None else None

    def get_matches(self):
        """Yield ``(matches, element)`` for each drug element with any match.

        ``matches`` is the boolean Series returned by ``check_match``.
        """
        for (_, element) in tqdm(ET.iterparse(self.filename, ['end'])):
            # Drop the first 24 characters of the tag before comparing; that
            # is the length of the '{http://www.drugbank.ca}' prefix.
            if element.tag[24:] != 'drug':
                continue
            matches = self.check_match(element)
            # Skip elements that match no query row.
            if not matches.any():
                continue
            yield (matches, element)

    def check_match(self, element):
        """Return a boolean Series marking the query rows that match ``element``.

        A row matches when its (id type, id) pair equals one of the element's
        identifiers, or when its name is one of the element's generic or
        brand names.
        """
        matches = pd.Series(False, index=self.ids.index)
        for (id_type, id_func) in self.get_ids.items():
            id_val = id_func(element)
            if id_val is None:
                continue
            id_matches = (self.id_types == id_type) & (self.ids == id_val)
            matches = matches | id_matches
        # Names are matched by membership rather than by (type, value) pair.
        (generic_names, brand_names) = self.all_names(element)
        matches = matches | self.names.isin(generic_names)
        matches = matches | self.names.isin(brand_names)
        return matches

    def all_names(self, element):
        """Return ``(generic_names, brand_names)`` as tuples of lower-cased names.

        Generic names are the main name plus synonyms; brand names come from
        the ``name`` child of elements under ``products``. Names containing a
        newline are dropped.
        """
        generic_names = set()
        brand_names = set()
        main_name = self.name(element)
        if main_name is not None:
            generic_names.add(main_name.lower())
        it = element.find('synonyms', self.namespaces)
        if it is not None:
            # iter() yields the container itself as well as its descendants.
            for synonym in it.iter():
                if synonym is not None and synonym.text is not None:
                    generic_names.add(synonym.text.lower())
        it = element.find('products', self.namespaces)
        if it is not None:
            for product in it.iter():
                brand_name = product.find('name', self.namespaces)
                if brand_name is not None:
                    brand_names.add(brand_name.text.lower())
        generic_names = tuple(filter(lambda s: '\n' not in s, generic_names))
        brand_names = tuple(filter(lambda s: '\n' not in s, brand_names))
        return (generic_names, brand_names)

    def cas_number(self, element):
        """Return the ``cas-number`` text, or None."""
        return self._child_text(element, 'cas-number')

    def chebi(self, element):
        """Return the ChEBI external identifier, or None."""
        return self.from_external_identifiers(element, 'ChEBI')

    def chembl(self, element):
        """Return the ChEMBL external identifier, or None."""
        return self.from_external_identifiers(element, 'ChEMBL')

    def drugbank(self, element):
        """Return the text of the first ``drugbank-id`` child, or None."""
        return self._child_text(element, 'drugbank-id')

    def fda_approval(self, element):
        """Return whether 'approved' is among the ``groups`` texts (None if no groups)."""
        it = element.find('groups', self.namespaces)
        return 'approved' in tuple(it.itertext()) if it is not None else None

    def inchikey(self, element):
        """Return the InChIKey calculated property, or None."""
        return self.from_calculated_properties(element, 'InChIKey')

    def indication(self, element):
        """Return the ``indication`` text, or None."""
        return self._child_text(element, 'indication')

    def mechanism(self, element):
        """Return the ``mechanism-of-action`` text, or None."""
        return self._child_text(element, 'mechanism-of-action')

    def name(self, element):
        """Return the ``name`` text, or None."""
        return self._child_text(element, 'name')

    def prices(self, element):
        """Return a list of cost-text + currency strings, or None if no ``prices``."""
        it = element.find('prices', self.namespaces)
        if it is None:
            return None
        prices = list()
        for price_element in it.iterfind('price', self.namespaces):
            price = price_element.find('cost', self.namespaces)
            if price is not None:
                prices.append(price.text + price.attrib.get('currency'))
        return prices

    def pubchem_compound(self, element):
        """Return the PubChem Compound external identifier, or None."""
        return self.from_external_identifiers(element, 'PubChem Compound')

    def pubchem_substance(self, element):
        """Return the PubChem Substance external identifier, or None."""
        return self.from_external_identifiers(element, 'PubChem Substance')

    def smiles(self, element):
        """Return the SMILES calculated property, or None."""
        return self.from_calculated_properties(element, 'SMILES')

    def unii(self, element):
        """Return the ``unii`` text, or None."""
        return self._child_text(element, 'unii')

    def from_external_identifiers(self, element, resource_type):
        """Return the identifier of the first external-identifier for ``resource_type``.

        Returns None when there is no ``external-identifiers`` child or no
        entry whose ``resource`` text equals ``resource_type``.
        """
        it = element.find('external-identifiers', self.namespaces)
        if it is not None:
            for external_identifier in it.iterfind('external-identifier', self.namespaces):
                if external_identifier.findtext('resource', namespaces=self.namespaces) == resource_type:
                    return external_identifier.findtext('identifier', namespaces=self.namespaces)
        return None

    def from_calculated_properties(self, element, kind_type):
        """Return the value of the first calculated property of kind ``kind_type``.

        Returns None when there is no ``calculated-properties`` child or no
        property whose ``kind`` text equals ``kind_type``.
        """
        it = element.find('calculated-properties', self.namespaces)
        if it is not None:
            for prop in it.iterfind('property', self.namespaces):
                if prop.findtext('kind', namespaces=self.namespaces) == kind_type:
                    return prop.findtext('value', namespaces=self.namespaces)
        return None
class DataAugmenter:
_hy_local_macro__create_var_column = lambda var_name, col_name, col_initial_value: hy.models.Expression([hy.models.Symbol('do', from_parser=True), hy.models.Expression([hy.models.Symbol('setv', from_parser=True), var_name, col_name]), hy.models.Expression([hy.models.Symbol('setv', from_parser=True), hy.models.Expression([hy.models.Symbol('get', from_parser=True), hy.models.Expression([hy.models.Symbol('.', from_parser=True), hy.models.Symbol('self', from_parser=True), hy.models.Symbol('drug-list', from_parser=True)]), var_name]), col_initial_value])])
def __init__(self, filename):
None
self.filename = filename
self.drug_list = None
self.admet_models = None
def load_drug_queries(self):
    """Load ``self.filename`` into ``self.drug_list`` and return ``self``.

    The loader is chosen from the file extension: ``.csv`` uses
    ``pd.read_csv`` and ``.json`` uses ``pd.read_json`` with
    ``orient='records'``.

    Raises:
        ValueError: if the file name ends with neither ``.csv`` nor ``.json``.
    """
    if self.filename.endswith('.csv'):
        with open(self.filename, 'r') as f:
            self.drug_list = pd.read_csv(f)
    elif self.filename.endswith('.json'):
        with open(self.filename, 'r') as f:
            self.drug_list = pd.read_json(f, orient='records')
    else:
        raise ValueError('Data file must be .csv or .json')
    # Returning self lets callers chain the load_* methods.
    return self
def load_admet_models(self, models):
    """Load one CatBoost classifier per entry of ``models`` and return ``self``.

    Args:
        models: mapping of model name to model file path. Each path is
            loaded into a fresh ``cb.CatBoostClassifier`` and stored in
            ``self.admet_models`` under its name.
    """
    self.admet_models = dict()
    for (name, path) in models.items():
        model = cb.CatBoostClassifier()
        model.load_model(path)
        self.admet_models[name] = model
    # Returning self lets callers chain the load_* methods.
    return self
def save_drug_info(self, filename):
    """Write ``self.drug_list`` to ``filename`` as record-oriented JSON.

    Returns whatever ``drug_list.to_json`` returns.

    Raises:
        ValueError: if ``self.drug_list`` has not been loaded.
    """
    if self.drug_list is None:
        raise ValueError('drug-list must be loaded first.')
    with open(filename, 'w') as f:
        return self.drug_list.to_json(f, orient='records')
def match_drugbank(self, filename, id_col_name, id_type_col_name, name_col_name):
None
if self.drug_list is None:
raise ValueError('drug-list is not defined. Call load-drug-queries before match-drugbank.')
_hy_anon_var_18 = None
else:
_hy_anon_var_18 = None
unwrap_list = lambda x: x[0] if isinstance(x, list) else x
id_col = self.drug_list[id_col_name].apply(unwrap_list)
id_type_col = self.drug_list[id_type_col_name].apply(unwrap_list)
name_col = self.drug_list[name_col_name].apply(unwrap_list)
cas_column = 'CAS Registry Number'
self.drug_list[cas_column] = None
fda_column = 'FDA Approved'
self.drug_list[fda_column] = None
indication_column = 'Indication'
self.drug_list[indication_column] = None
mechanism_column = 'Mechanism'
self.drug_list[mechanism_column] = None
name_column = 'DrugBank Name'
self.drug_list[name_column] = None
price_column = 'Prices'
self.drug_list[price_column] = self.drug_list.apply(lambda _: list(), axis=1)
smiles_column = 'SMILES'
self.drug_list[smiles_column] = None
unii_column = 'UNII'
self.drug_list[unii_column] = None
drugbank = DrugBank(filename, id_col, id_type_col, name_col)
for (matches, element) in drugbank.get_matches():
self.drug_list.loc[matches, cas_column] = drugbank.cas_number(element)
self.drug_list.loc[matches, fda_column] = drugbank.fda_approval(element)
self.drug_list.loc[matches, indication_column] = drugbank.indication(element)
self.drug_list.loc[matches, mechanism_column] = drugbank.mechanism(element)
self.drug_list.loc[matches, name_column] = drugbank.name(element)
self.drug_list.loc[matches, price_column] = self.drug_list.loc[matches, price_column].apply(lambda _: drugbank.prices(element))
self.drug_list.loc[matches, smiles_column] = drugbank.smiles(element)
self.drug_list.loc[matches, unii_column] = drugbank.unii(element)
def deduplicate(self):
None
if self.drug_list is None:
raise ValueError('drug-list is not defined. Call load-drug-queries before deduplicate.')
_hy_anon_var_19 = None
else:
_hy_anon_var_19 = None
if 'DrugBank Name' not in self.drug_list.columns:
raise ValueError('ID data does not exist yet. Run match-drugbank to create it.')
_hy_anon_var_20 = None
else:
_hy_anon_var_20 = None
def _hy_anon_var_21(x):
y = []
for item in x:
y.extend(item) if isinstance(item, list) else y.append(item)
z = set(y)
z.discard(None)
return None if len(z) == 0 else z.pop() if len(z) == 1 else z if True else None
self.drug_list = self.drug_list.groupby('DrugBank Name').agg(_hy_anon_var_21).reset_index()
def predict_admet(self):
None
if self.drug_list is None:
raise ValueError('drug-list is not defined. Call load-drug-queries before predict-admet.')
_hy_anon_var_22 = None
else:
_hy_anon_var_22 = None
if self.admet_models is None:
raise ValueError('admet-models is not defined. Call load-admet-models before predict-admet.')
_hy_anon_var_23 = None
else:
_hy_anon_var_23 = None
if 'SMILES' not in self.drug_list.columns:
raise ValueError('SMILES data does not exist yet. Run match-drugbank to create it.')
_hy_anon_var_24 = None
else:
_hy_anon_var_24 = None
RDLogger.DisableLog('rdApp.*')
smiles_mask = self.drug_list['SMILES'].notna()
smiles = self.drug_list.loc[smiles_mask, 'SMILES']
molecules = smiles.apply(Chem.MolFromSmiles)
molecules_mask = molecules.notna()
fingerprints = self.get_fingerprints(molecules[molecules_mask])
combined_mask = pd.Series(False, index=self.drug_list.index)
combined_mask.loc[smiles[molecules_mask].index] = True
for (name, model) in self.admet_models.items():
predictions = model.predict_proba(fingerprints)
self.drug_list.loc[combined_mask, name] = predictions[slice(None, None), 1]
def get_fingerprints(self, molecules):
None
fingerprints = list()
fingerprints.append(maplight_gnn.get_morgan_fingerprints(molecules))
fingerprints.append(maplight_gnn.get_avalon_fingerprints(molecules))
fingerprints.append(maplight_gnn.get_erg_fingerprints(molecules))
fingerprints.append(maplight_gnn.get_rdkit_features(molecules))
fingerprints.append(maplight_gnn.get_gin_supervised_masking(molecules))
return np.concatenate(fingerprints, axis=1)
# Script entry point: load the query table and the three prediction models,
# enrich the table from DrugBank, merge duplicate drugs, score them, and
# write the result out as JSON.
if __name__ == '__main__':
    augmenter = (
        DataAugmenter('data/translator_drugs.json')
        .load_drug_queries()
        .load_admet_models({
            'Blood Brain Barrier': 'data/admet/bbb_martins-0.916-0.002.dump',
            'Bioavailability': 'data/admet/bioavailability_ma-0.74-0.01.dump',
            'Human Intestinal Absorption': 'data/admet/hia_hou-0.989-0.001.dump',
        })
    )
    # The remaining steps update the augmenter in place and do not return
    # it, so they are issued as separate statements rather than chained.
    augmenter.match_drugbank('data/src/drugbank.xml', 'result_id', 'id_type', 'result_name')
    augmenter.deduplicate()
    augmenter.predict_admet()
    augmenter.save_drug_info('data/translator_drug_list.json')