mirror of
https://github.com/elladunbar/tree-sitter-hy.git
synced 2025-11-05 14:59:55 -06:00
Add more structured syntax
This commit is contained in:
parent
d330f85c67
commit
66026fe74b
11 changed files with 18956 additions and 4528 deletions
271
grammar.js
271
grammar.js
|
|
@ -9,9 +9,8 @@
|
||||||
|
|
||||||
const regexp = {
|
const regexp = {
|
||||||
ascii_whitespace: /[\u0009\u000A\u000B\u000C\u000D\u0020]/,
|
ascii_whitespace: /[\u0009\u000A\u000B\u000C\u000D\u0020]/,
|
||||||
symbol_seq: /[^()\[\]{};"'`~:.\d\u0009\u000A\u000B\u000C\u000D\u0020][^()\[\]{};"'`~.\u0009\u000A\u000B\u000C\u000D\u0020]*/
|
|
||||||
}
|
}
|
||||||
const symbol_seq_immediate = token.immediate(regexp.symbol_seq)
|
const plus_minus = choice('+', '-')
|
||||||
const digitpart = seq(/\d/, repeat(/_*\d+/))
|
const digitpart = seq(/\d/, repeat(/_*\d+/))
|
||||||
const pointfloat = choice(
|
const pointfloat = choice(
|
||||||
seq(optional(digitpart), '.', digitpart),
|
seq(optional(digitpart), '.', digitpart),
|
||||||
|
|
@ -19,7 +18,7 @@ const pointfloat = choice(
|
||||||
)
|
)
|
||||||
const exponentfloat = seq(
|
const exponentfloat = seq(
|
||||||
choice(digitpart, pointfloat),
|
choice(digitpart, pointfloat),
|
||||||
seq(/[eE]/, optional(/[+-]/), digitpart),
|
seq(choice('e', 'E'), optional(plus_minus), digitpart),
|
||||||
)
|
)
|
||||||
|
|
||||||
module.exports = grammar({
|
module.exports = grammar({
|
||||||
|
|
@ -30,50 +29,65 @@ module.exports = grammar({
|
||||||
$.comment,
|
$.comment,
|
||||||
],
|
],
|
||||||
|
|
||||||
// word: $ => $.symbol,
|
word: $ => $.symbol,
|
||||||
|
|
||||||
rules: {
|
rules: {
|
||||||
|
// SYNTACTIC ELEMENTS
|
||||||
source_file: $ => seq(optional($.shebang), repeat($._element)),
|
source_file: $ => seq(optional($.shebang), repeat($._element)),
|
||||||
|
|
||||||
shebang: _ => token(seq('#!', /.*/)),
|
shebang: _ => token(seq('#!', /.*/)),
|
||||||
_element: $ => choice($._form, $.discard, $.comment),
|
_element: $ => choice(
|
||||||
|
$._form,
|
||||||
|
$.discard,
|
||||||
|
$.comment,
|
||||||
|
$.import,
|
||||||
|
$.require,
|
||||||
|
$.function,
|
||||||
|
$.lambda,
|
||||||
|
$.class,
|
||||||
|
$.macro,
|
||||||
|
$.reader,
|
||||||
|
),
|
||||||
|
|
||||||
_form: $ => seq(optional($._sugar), choice($._identifier, $._sequence, $._string)),
|
_form: $ => seq(optional($.sugar), choice($._identifier, $._sequence, $._string)),
|
||||||
discard: $ => seq('#_', $._form),
|
discard: $ => seq('#_', $._form),
|
||||||
comment: _ => token(seq(';', /.*/)),
|
comment: _ => token(seq(';', /.*/)),
|
||||||
|
|
||||||
_sugar: _ => choice(
|
sugar: _ => choice(
|
||||||
field('quote', '\''),
|
'\'',
|
||||||
field('quasiquote', '`'),
|
'`',
|
||||||
field('unqoute', '~'),
|
'~',
|
||||||
field('unqoute_splice', '~@'),
|
'~@',
|
||||||
field('unpack_iterable', '#*'),
|
'#*',
|
||||||
field('unpack_mapping', '#**'),
|
'#**',
|
||||||
),
|
),
|
||||||
_identifier: $ => choice(
|
_identifier: $ => choice(
|
||||||
$._numeric_literal,
|
$._numeric_literal,
|
||||||
$.keyword,
|
$.keyword,
|
||||||
$.symbol,
|
$._symbol_or_dots,
|
||||||
$.dotted_identifier,
|
$.dotted_identifier,
|
||||||
),
|
),
|
||||||
_sequence: $ => choice($.expression, $.list, $.tuple, $.set, $.dictionary),
|
_sequence: $ => choice($.expression, $.list, $.tuple, $.set, $.dictionary),
|
||||||
_string: $ => choice($.string, $.bracket_string),
|
_string: $ => choice($.string, $.bracket_string),
|
||||||
|
|
||||||
_numeric_literal: $ => choice($.integer, $.float, $.complex),
|
_numeric_literal: $ => choice($.integer, $.float, $.complex),
|
||||||
keyword: _ => token(seq(':', optional(regexp.symbol_seq))),
|
keyword: $ => prec.right(seq(
|
||||||
dotted_identifier: _ => prec(1, choice(
|
':',
|
||||||
|
optional($.immediate_symbol),
|
||||||
|
)),
|
||||||
|
_symbol_or_dots: $ => choice(
|
||||||
|
$.symbol,
|
||||||
|
$.dots,
|
||||||
|
),
|
||||||
|
dotted_identifier: $ => choice(
|
||||||
seq(
|
seq(
|
||||||
/[.]+/,
|
/[.]+/,
|
||||||
symbol_seq_immediate,
|
$.immediate_symbol,
|
||||||
repeat(seq(token.immediate('.'), symbol_seq_immediate)),
|
repeat(seq(token.immediate('.'), $.immediate_symbol)),
|
||||||
),
|
),
|
||||||
seq(
|
seq(
|
||||||
regexp.symbol_seq,
|
field("sym", $.symbol),
|
||||||
repeat1((seq(token.immediate('.'), symbol_seq_immediate)))),
|
repeat1(seq(token.immediate('.'), $.immediate_symbol))),
|
||||||
)),
|
|
||||||
symbol: _ => choice(
|
|
||||||
/[.]+/,
|
|
||||||
regexp.symbol_seq,
|
|
||||||
),
|
),
|
||||||
|
|
||||||
expression: $ => seq('(', repeat1($._element), ')'),
|
expression: $ => seq('(', repeat1($._element), ')'),
|
||||||
|
|
@ -91,32 +105,219 @@ module.exports = grammar({
|
||||||
'}'
|
'}'
|
||||||
),
|
),
|
||||||
|
|
||||||
string: _ => token(seq(
|
string: _ => seq(
|
||||||
/[rbf]{0,3}/,
|
/[rbf]*"/,
|
||||||
'"',
|
field("content", /[^"]*/),
|
||||||
/[^"]*/,
|
|
||||||
'"'
|
'"'
|
||||||
)),
|
),
|
||||||
bracket_string: _ => token(seq('#[[', /[^\]]*/, ']]')),
|
bracket_string: _ => seq(
|
||||||
|
'#[[',
|
||||||
|
field("content", /[^\]]*/),
|
||||||
|
']]'
|
||||||
|
),
|
||||||
|
|
||||||
integer: $ => choice($._decinteger, $._bininteger, $._octinteger, $._hexinteger),
|
integer: $ => choice($._decinteger, $._bininteger, $._octinteger, $._hexinteger),
|
||||||
float: _ => token(prec(1, seq(
|
float: _ => token(prec(1, seq(
|
||||||
optional(/[+-]/),
|
optional(plus_minus),
|
||||||
choice(pointfloat, exponentfloat, 'Inf', 'NaN'),
|
choice(pointfloat, exponentfloat, 'Inf', 'NaN'),
|
||||||
))),
|
))),
|
||||||
complex: _ => token(prec(1, seq(
|
complex: _ => token(prec(1, seq(
|
||||||
optional(/[+-]/),
|
optional(plus_minus),
|
||||||
choice(pointfloat, exponentfloat, digitpart, 'Inf', 'NaN'),
|
choice(pointfloat, exponentfloat, digitpart, 'Inf', 'NaN'),
|
||||||
/[+-]/,
|
plus_minus,
|
||||||
seq(
|
seq(
|
||||||
choice(pointfloat, exponentfloat, digitpart, 'NaN', 'Inf'),
|
choice(pointfloat, exponentfloat, digitpart, 'NaN', 'Inf'),
|
||||||
/[jJ]/,
|
choice('j', 'J'),
|
||||||
),
|
),
|
||||||
))),
|
))),
|
||||||
|
|
||||||
_decinteger: _ => token(prec(1, seq(optional(/[+-]/), /\d/, repeat(/[,_]*\d+/)))),
|
symbol: _ => token(seq(
|
||||||
|
/[^()\[\]{};"'`~:.\d\u0009\u000A\u000B\u000C\u000D\u0020]/,
|
||||||
|
repeat(/[^()\[\]{};"'`~.\u0009\u000A\u000B\u000C\u000D\u0020]/),
|
||||||
|
)),
|
||||||
|
immediate_symbol: _ => token.immediate(seq(
|
||||||
|
/[^()\[\]{};"'`~:.\d\u0009\u000A\u000B\u000C\u000D\u0020]/,
|
||||||
|
repeat(/[^()\[\]{};"'`~.\u0009\u000A\u000B\u000C\u000D\u0020]/),
|
||||||
|
)),
|
||||||
|
dots: _ => /[.]+/,
|
||||||
|
|
||||||
|
_decinteger: _ => token(prec(1, seq(optional(plus_minus), /\d/, repeat(/[,_]*\d+/)))),
|
||||||
_bininteger: _ => token(prec(1, seq('0', /[bB]/, repeat(/[,_]*[01]+/)))),
|
_bininteger: _ => token(prec(1, seq('0', /[bB]/, repeat(/[,_]*[01]+/)))),
|
||||||
_octinteger: _ => token(prec(1, seq('0', /[oO]/, repeat(/[,_]*[0-7]+/)))),
|
_octinteger: _ => token(prec(1, seq('0', /[oO]/, repeat(/[,_]*[0-7]+/)))),
|
||||||
_hexinteger: _ => token(prec(1, seq('0', /[xX]/, repeat(/[,_]*[\da-fA-F]+/)))),
|
_hexinteger: _ => token(prec(1, seq('0', /[xX]/, repeat(/[,_]*[\da-fA-F]+/)))),
|
||||||
|
|
||||||
|
// STRUCTURED SYNTAX
|
||||||
|
import: $ => seq(
|
||||||
|
'(',
|
||||||
|
'import',
|
||||||
|
repeat1(
|
||||||
|
choice(
|
||||||
|
$.module_import,
|
||||||
|
$.named_import,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
')'
|
||||||
|
),
|
||||||
|
require: $ => seq(
|
||||||
|
'(',
|
||||||
|
'require',
|
||||||
|
repeat1(
|
||||||
|
choice(
|
||||||
|
$.module_import,
|
||||||
|
$.named_import,
|
||||||
|
$.namespace_require,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
')'
|
||||||
|
),
|
||||||
|
function: $ => seq(
|
||||||
|
'(',
|
||||||
|
'defn',
|
||||||
|
optional(
|
||||||
|
seq(
|
||||||
|
':',
|
||||||
|
token.immediate('async'),
|
||||||
|
)
|
||||||
|
),
|
||||||
|
field('decorators', optional($.variable_list)),
|
||||||
|
optional($.type_parameters),
|
||||||
|
optional($.type_annotation),
|
||||||
|
field('name', $.symbol),
|
||||||
|
$.parameter_list,
|
||||||
|
repeat($._element),
|
||||||
|
')',
|
||||||
|
),
|
||||||
|
lambda: $ => seq(
|
||||||
|
'(',
|
||||||
|
'fn',
|
||||||
|
optional(
|
||||||
|
seq(
|
||||||
|
':',
|
||||||
|
token.immediate('async'),
|
||||||
|
)
|
||||||
|
),
|
||||||
|
$.parameter_list,
|
||||||
|
repeat($._element),
|
||||||
|
')',
|
||||||
|
),
|
||||||
|
class: $ => seq(
|
||||||
|
'(',
|
||||||
|
'defclass',
|
||||||
|
field('decorators', optional($.variable_list)),
|
||||||
|
optional($.type_parameters),
|
||||||
|
field('name', $.symbol),
|
||||||
|
field('superclasses', $.variable_list),
|
||||||
|
repeat($._element),
|
||||||
|
')',
|
||||||
|
),
|
||||||
|
macro: $ => seq(
|
||||||
|
'(',
|
||||||
|
'defmacro',
|
||||||
|
field('name', $.symbol),
|
||||||
|
$.parameter_list,
|
||||||
|
repeat($._element),
|
||||||
|
')',
|
||||||
|
),
|
||||||
|
reader: $ => seq(
|
||||||
|
'(',
|
||||||
|
'defreader',
|
||||||
|
field('name', $.symbol),
|
||||||
|
repeat($._element),
|
||||||
|
')',
|
||||||
|
),
|
||||||
|
|
||||||
|
module_import: $ => seq(
|
||||||
|
choice(
|
||||||
|
seq($._variable, optional('*')),
|
||||||
|
$.aliased_import,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
named_import: $ => seq(
|
||||||
|
$._variable,
|
||||||
|
seq(
|
||||||
|
'[',
|
||||||
|
repeat1(
|
||||||
|
choice(
|
||||||
|
$.symbol,
|
||||||
|
$.aliased_import,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
']',
|
||||||
|
),
|
||||||
|
),
|
||||||
|
namespace_require: $ => seq(
|
||||||
|
$._variable,
|
||||||
|
choice(
|
||||||
|
repeat1(
|
||||||
|
seq(
|
||||||
|
':',
|
||||||
|
choice(
|
||||||
|
token.immediate('macros'),
|
||||||
|
token.immediate('readers'),
|
||||||
|
),
|
||||||
|
'[',
|
||||||
|
repeat1(
|
||||||
|
choice(
|
||||||
|
$.symbol,
|
||||||
|
$.aliased_import,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
']',
|
||||||
|
),
|
||||||
|
),
|
||||||
|
seq(
|
||||||
|
$.keyword,
|
||||||
|
'*',
|
||||||
|
),
|
||||||
|
),
|
||||||
|
),
|
||||||
|
|
||||||
|
variable_list: $ => seq(
|
||||||
|
'[',
|
||||||
|
repeat1($._variable),
|
||||||
|
']',
|
||||||
|
),
|
||||||
|
type_parameters: $ => seq(
|
||||||
|
':',
|
||||||
|
token.immediate('tp'),
|
||||||
|
'[',
|
||||||
|
repeat1($._variable),
|
||||||
|
']',
|
||||||
|
),
|
||||||
|
type_annotation: $ => seq(
|
||||||
|
'#^',
|
||||||
|
field('type', $._variable),
|
||||||
|
),
|
||||||
|
parameter_list: $ => seq(
|
||||||
|
'[',
|
||||||
|
repeat(
|
||||||
|
choice(
|
||||||
|
$.symbol,
|
||||||
|
seq(
|
||||||
|
'[',
|
||||||
|
$.symbol,
|
||||||
|
$._form,
|
||||||
|
']',
|
||||||
|
),
|
||||||
|
'/',
|
||||||
|
'*',
|
||||||
|
'#*',
|
||||||
|
'#**',
|
||||||
|
),
|
||||||
|
),
|
||||||
|
']',
|
||||||
|
),
|
||||||
|
|
||||||
|
_variable: $ => choice(
|
||||||
|
$.symbol,
|
||||||
|
$.dotted_identifier,
|
||||||
|
),
|
||||||
|
|
||||||
|
aliased_import: $ => seq(
|
||||||
|
$._variable,
|
||||||
|
':',
|
||||||
|
token.immediate('as'),
|
||||||
|
$.symbol,
|
||||||
|
),
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,28 @@
|
||||||
; Variables
|
; Variables
|
||||||
(symbol) @variable
|
[
|
||||||
|
(symbol)
|
||||||
|
(immediate_symbol)
|
||||||
|
] @variable
|
||||||
|
|
||||||
(keyword) @property
|
(keyword) @property
|
||||||
|
|
||||||
; Symbol naming conventions
|
; Symbol naming conventions
|
||||||
((symbol) @type
|
([
|
||||||
|
(symbol)
|
||||||
|
(immediate_symbol)
|
||||||
|
] @type
|
||||||
(#lua-match? @type "^[A-Z].*[a-z]"))
|
(#lua-match? @type "^[A-Z].*[a-z]"))
|
||||||
|
|
||||||
((symbol) @constant
|
([
|
||||||
|
(symbol)
|
||||||
|
(immediate_symbol)
|
||||||
|
] @constant
|
||||||
(#lua-match? @constant "^[A-Z][A-Z0-9_-]*$"))
|
(#lua-match? @constant "^[A-Z][A-Z0-9_-]*$"))
|
||||||
|
|
||||||
((symbol) @constant.builtin
|
([
|
||||||
|
(symbol)
|
||||||
|
(immediate_symbol)
|
||||||
|
] @constant.builtin
|
||||||
(#lua-match? @constant.builtin "^__[a-zA-Z0-9_-]*__$"))
|
(#lua-match? @constant.builtin "^__[a-zA-Z0-9_-]*__$"))
|
||||||
|
|
||||||
((symbol) @constant.builtin
|
((symbol) @constant.builtin
|
||||||
|
|
@ -186,3 +198,6 @@
|
||||||
|
|
||||||
(dotted_identifier
|
(dotted_identifier
|
||||||
"." @punctuation.delimiter)
|
"." @punctuation.delimiter)
|
||||||
|
|
||||||
|
(keyword
|
||||||
|
":" @punctuation.delimiter)
|
||||||
|
|
|
||||||
1178
src/grammar.json
generated
1178
src/grammar.json
generated
File diff suppressed because it is too large
Load diff
2119
src/node-types.json
generated
2119
src/node-types.json
generated
File diff suppressed because it is too large
Load diff
18927
src/parser.c
generated
18927
src/parser.c
generated
File diff suppressed because it is too large
Load diff
|
|
@ -8,9 +8,9 @@ keyword
|
||||||
---
|
---
|
||||||
|
|
||||||
(source_file
|
(source_file
|
||||||
(keyword)
|
(keyword (immediate_symbol))
|
||||||
(expression
|
(expression
|
||||||
(symbol) (keyword) (integer)))
|
(symbol) (keyword (immediate_symbol)) (integer)))
|
||||||
|
|
||||||
==================
|
==================
|
||||||
dotted identifiers
|
dotted identifiers
|
||||||
|
|
@ -25,12 +25,12 @@ dotted identifiers
|
||||||
|
|
||||||
(source_file
|
(source_file
|
||||||
(expression
|
(expression
|
||||||
(dotted_identifier))
|
(dotted_identifier (symbol) (immediate_symbol) (immediate_symbol)))
|
||||||
(expression
|
(expression
|
||||||
(dotted_identifier))
|
(dotted_identifier (immediate_symbol) (immediate_symbol)))
|
||||||
(expression
|
(expression
|
||||||
(dotted_identifier))
|
(dotted_identifier (immediate_symbol) (immediate_symbol)))
|
||||||
(dotted_identifier))
|
(dotted_identifier (immediate_symbol) (immediate_symbol)))
|
||||||
|
|
||||||
======================
|
======================
|
||||||
not dotted identifiers
|
not dotted identifiers
|
||||||
|
|
@ -40,11 +40,14 @@ not dotted identifiers
|
||||||
...
|
...
|
||||||
........
|
........
|
||||||
. .
|
. .
|
||||||
|
(. foo bar)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
(source_file
|
(source_file
|
||||||
(symbol)
|
(dots)
|
||||||
(symbol)
|
(dots)
|
||||||
(symbol)
|
(dots)
|
||||||
(symbol) (symbol))
|
(dots) (dots)
|
||||||
|
(expression
|
||||||
|
(dots) (symbol) (symbol)))
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,9 @@ expression
|
||||||
---
|
---
|
||||||
|
|
||||||
(source_file
|
(source_file
|
||||||
(expression
|
(expression
|
||||||
(symbol) (symbol)))
|
(symbol)
|
||||||
|
(symbol)))
|
||||||
|
|
||||||
====
|
====
|
||||||
list
|
list
|
||||||
|
|
@ -20,9 +21,10 @@ list
|
||||||
---
|
---
|
||||||
|
|
||||||
(source_file
|
(source_file
|
||||||
(list
|
(list
|
||||||
(symbol) (symbol))
|
(symbol)
|
||||||
(list))
|
(symbol))
|
||||||
|
(list))
|
||||||
|
|
||||||
=====
|
=====
|
||||||
tuple
|
tuple
|
||||||
|
|
@ -34,9 +36,10 @@ tuple
|
||||||
---
|
---
|
||||||
|
|
||||||
(source_file
|
(source_file
|
||||||
(tuple
|
(tuple
|
||||||
(symbol) (symbol))
|
(symbol)
|
||||||
(tuple))
|
(symbol))
|
||||||
|
(tuple))
|
||||||
|
|
||||||
===
|
===
|
||||||
set
|
set
|
||||||
|
|
@ -48,9 +51,10 @@ set
|
||||||
---
|
---
|
||||||
|
|
||||||
(source_file
|
(source_file
|
||||||
(set
|
(set
|
||||||
(symbol) (symbol))
|
(symbol)
|
||||||
(set))
|
(symbol))
|
||||||
|
(set))
|
||||||
|
|
||||||
==========
|
==========
|
||||||
dictionary
|
dictionary
|
||||||
|
|
@ -62,9 +66,10 @@ dictionary
|
||||||
---
|
---
|
||||||
|
|
||||||
(source_file
|
(source_file
|
||||||
(dictionary
|
(dictionary
|
||||||
(symbol) (symbol))
|
(symbol)
|
||||||
(dictionary))
|
(symbol))
|
||||||
|
(dictionary))
|
||||||
|
|
||||||
===================
|
===================
|
||||||
function definition
|
function definition
|
||||||
|
|
@ -76,7 +81,9 @@ function definition
|
||||||
---
|
---
|
||||||
|
|
||||||
(source_file
|
(source_file
|
||||||
|
(function
|
||||||
|
(symbol)
|
||||||
|
(parameter_list)
|
||||||
(expression
|
(expression
|
||||||
(symbol) (symbol) (list)
|
(symbol)
|
||||||
(expression
|
(string))))
|
||||||
(symbol) (string))))
|
|
||||||
|
|
|
||||||
287
test/corpus/structured.txt
Normal file
287
test/corpus/structured.txt
Normal file
|
|
@ -0,0 +1,287 @@
|
||||||
|
======
|
||||||
|
import
|
||||||
|
======
|
||||||
|
|
||||||
|
(import sys os.path)
|
||||||
|
(import os.path [exists isdir :as is-dir isfile])
|
||||||
|
(import sys :as systest)
|
||||||
|
(import sys *)
|
||||||
|
(import tests.resources [kwtest function-with-a-dash]
|
||||||
|
os.path [exists
|
||||||
|
isdir :as is-dir
|
||||||
|
isfile :as is-file]
|
||||||
|
sys :as systest
|
||||||
|
math *)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
(source_file
|
||||||
|
(import
|
||||||
|
(module_import
|
||||||
|
(symbol))
|
||||||
|
(module_import
|
||||||
|
(dotted_identifier
|
||||||
|
(symbol)
|
||||||
|
(immediate_symbol))))
|
||||||
|
(import
|
||||||
|
(named_import
|
||||||
|
(dotted_identifier
|
||||||
|
(symbol)
|
||||||
|
(immediate_symbol))
|
||||||
|
(symbol)
|
||||||
|
(aliased_import
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(symbol)))
|
||||||
|
(import
|
||||||
|
(module_import
|
||||||
|
(aliased_import
|
||||||
|
(symbol)
|
||||||
|
(symbol))))
|
||||||
|
(import
|
||||||
|
(module_import
|
||||||
|
(symbol)))
|
||||||
|
(import
|
||||||
|
(named_import
|
||||||
|
(dotted_identifier
|
||||||
|
(symbol)
|
||||||
|
(immediate_symbol))
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(named_import
|
||||||
|
(dotted_identifier
|
||||||
|
(symbol)
|
||||||
|
(immediate_symbol))
|
||||||
|
(symbol)
|
||||||
|
(aliased_import
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(aliased_import
|
||||||
|
(symbol)
|
||||||
|
(symbol)))
|
||||||
|
(module_import
|
||||||
|
(aliased_import
|
||||||
|
(symbol)
|
||||||
|
(symbol)))
|
||||||
|
(module_import
|
||||||
|
(symbol))))
|
||||||
|
|
||||||
|
=======
|
||||||
|
require
|
||||||
|
=======
|
||||||
|
|
||||||
|
(require mymodule)
|
||||||
|
(require mymodule :as M)
|
||||||
|
(require mymodule [foo])
|
||||||
|
(require mymodule *)
|
||||||
|
(require mymodule [foo :as bar])
|
||||||
|
(require mymodule :macros [foo] :readers [spiff])
|
||||||
|
(require mymodule
|
||||||
|
mymodule :readers [spiff])
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
(source_file
|
||||||
|
(require
|
||||||
|
(module_import
|
||||||
|
(symbol)))
|
||||||
|
(require
|
||||||
|
(module_import
|
||||||
|
(aliased_import
|
||||||
|
(symbol)
|
||||||
|
(symbol))))
|
||||||
|
(require
|
||||||
|
(named_import
|
||||||
|
(symbol)
|
||||||
|
(symbol)))
|
||||||
|
(require
|
||||||
|
(module_import
|
||||||
|
(symbol)))
|
||||||
|
(require
|
||||||
|
(named_import
|
||||||
|
(symbol)
|
||||||
|
(aliased_import
|
||||||
|
(symbol)
|
||||||
|
(symbol))))
|
||||||
|
(require
|
||||||
|
(namespace_require
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol)))
|
||||||
|
(require
|
||||||
|
(module_import
|
||||||
|
(symbol))
|
||||||
|
(namespace_require
|
||||||
|
(symbol)
|
||||||
|
(symbol))))
|
||||||
|
|
||||||
|
========
|
||||||
|
function
|
||||||
|
========
|
||||||
|
|
||||||
|
(defn name [params] bodyform1 bodyform2)
|
||||||
|
(defn :async [decorator1 decorator2] :tp [T1 T2] #^ annotation name [params])
|
||||||
|
(defn f [a / b [c 3] * d e #** kwargs]
|
||||||
|
[a b c d e kwargs])
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
(source_file
|
||||||
|
(function
|
||||||
|
(symbol)
|
||||||
|
(parameter_list
|
||||||
|
(symbol))
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(function
|
||||||
|
(variable_list
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(type_parameters
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(type_annotation
|
||||||
|
(symbol))
|
||||||
|
(symbol)
|
||||||
|
(parameter_list
|
||||||
|
(symbol)))
|
||||||
|
(function
|
||||||
|
(symbol)
|
||||||
|
(parameter_list
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(integer)
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(list
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol))))
|
||||||
|
|
||||||
|
======
|
||||||
|
lambda
|
||||||
|
======
|
||||||
|
|
||||||
|
(fn [x] (print x))
|
||||||
|
(fn :async [x])
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
(source_file
|
||||||
|
(lambda
|
||||||
|
(parameter_list
|
||||||
|
(symbol))
|
||||||
|
(expression
|
||||||
|
(symbol)
|
||||||
|
(symbol)))
|
||||||
|
(lambda
|
||||||
|
(parameter_list
|
||||||
|
(symbol))))
|
||||||
|
|
||||||
|
=====
|
||||||
|
class
|
||||||
|
=====
|
||||||
|
|
||||||
|
(defclass [decorator1 decorator2] :tp [T1 T2] MyClass [SuperClass1 SuperClass2]
|
||||||
|
"A class that does things at times."
|
||||||
|
|
||||||
|
(setv
|
||||||
|
attribute1 value1
|
||||||
|
attribute2 value2)
|
||||||
|
|
||||||
|
(defn method1 [self arg1 arg2])
|
||||||
|
|
||||||
|
(defn method2 [self arg1 arg2]))
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
(source_file
|
||||||
|
(class
|
||||||
|
(variable_list
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(type_parameters
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(symbol)
|
||||||
|
(variable_list
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(string)
|
||||||
|
(expression
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(function
|
||||||
|
(symbol)
|
||||||
|
(parameter_list
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol)))
|
||||||
|
(function
|
||||||
|
(symbol)
|
||||||
|
(parameter_list
|
||||||
|
(symbol)
|
||||||
|
(symbol)
|
||||||
|
(symbol)))))
|
||||||
|
|
||||||
|
=====
|
||||||
|
macro
|
||||||
|
=====
|
||||||
|
|
||||||
|
(defmacro hypotenuse [a b]
|
||||||
|
(import math)
|
||||||
|
`(math.sqrt (+ (** ~a 2) (** ~b 2))))
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
(source_file
|
||||||
|
(macro
|
||||||
|
(symbol)
|
||||||
|
(parameter_list
|
||||||
|
(symbol)
|
||||||
|
(symbol))
|
||||||
|
(import
|
||||||
|
(module_import
|
||||||
|
(symbol)))
|
||||||
|
(sugar)
|
||||||
|
(expression
|
||||||
|
(dotted_identifier
|
||||||
|
(symbol)
|
||||||
|
(immediate_symbol))
|
||||||
|
(expression
|
||||||
|
(symbol)
|
||||||
|
(expression
|
||||||
|
(symbol)
|
||||||
|
(sugar)
|
||||||
|
(symbol)
|
||||||
|
(integer))
|
||||||
|
(expression
|
||||||
|
(symbol)
|
||||||
|
(sugar)
|
||||||
|
(symbol)
|
||||||
|
(integer))))))
|
||||||
|
|
||||||
|
======
|
||||||
|
reader
|
||||||
|
======
|
||||||
|
|
||||||
|
(defreader hi
|
||||||
|
'(print "Hello."))
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
(source_file
|
||||||
|
(reader
|
||||||
|
(symbol)
|
||||||
|
(sugar)
|
||||||
|
(expression
|
||||||
|
(symbol)
|
||||||
|
(string))))
|
||||||
266
test/highlight/data_augmenter.hy
Normal file
266
test/highlight/data_augmenter.hy
Normal file
|
|
@ -0,0 +1,266 @@
|
||||||
|
#!/usr/bin/env hy
|
||||||
|
|
||||||
|
(import xml.etree.ElementTree :as ET)
|
||||||
|
|
||||||
|
(require hyrule [-> doto meth ncut])
|
||||||
|
(import catboost :as cb)
|
||||||
|
(import numpy :as np)
|
||||||
|
(import pandas :as pd)
|
||||||
|
(import rdkit [Chem RDLogger])
|
||||||
|
(import tqdm [tqdm])
|
||||||
|
|
||||||
|
(import maplight-gnn)
|
||||||
|
|
||||||
|
|
||||||
|
(defclass DrugBank []
|
||||||
|
(setv namespaces {"" "http://www.drugbank.ca"})
|
||||||
|
|
||||||
|
(defmacro ap-find [element name if-found]
|
||||||
|
`(do
|
||||||
|
(setv it (.find ~element ~name self.namespaces))
|
||||||
|
(if-let it ~if-found)))
|
||||||
|
|
||||||
|
(defmacro if-let [maybe execute]
|
||||||
|
`(when (is-not ~maybe None)
|
||||||
|
~execute))
|
||||||
|
|
||||||
|
(meth __init__ [@filename @ids @id-types names]
|
||||||
|
(setv @names (.str.lower names))
|
||||||
|
(setv @get-ids {"ChEBI" @chebi
|
||||||
|
"ChEMBL" @chembl
|
||||||
|
"drugbank-id" @drugbank
|
||||||
|
"InChIKey" @inchikey
|
||||||
|
"PubChem Compound" @pubchem-compound
|
||||||
|
"PubChem Substance" @pubchem-substance
|
||||||
|
"unii" @unii}))
|
||||||
|
|
||||||
|
(meth get-matches []
|
||||||
|
(for [#(_ element) (tqdm (ET.iterparse @filename ["end"]))]
|
||||||
|
;; don't care about non-drug entries
|
||||||
|
(when (!= (cut element.tag 24 None) "drug")
|
||||||
|
(continue))
|
||||||
|
(setv matches (@check-match element))
|
||||||
|
;; make sure there are matches before doing more work
|
||||||
|
(when (not (matches.any))
|
||||||
|
(continue))
|
||||||
|
(yield #(matches element))))
|
||||||
|
|
||||||
|
(meth check-match [element]
|
||||||
|
(setv matches (pd.Series False :index @ids.index))
|
||||||
|
(for [#(id-type id-func) (.items @get-ids)]
|
||||||
|
(setv id-val (id-func element))
|
||||||
|
(when (is id-val None) (continue))
|
||||||
|
(setv id-matches (& (= @id-types id-type) (= @ids id-val)))
|
||||||
|
(setv matches (| matches id-matches)))
|
||||||
|
;; names can't use the same logic as the other id types
|
||||||
|
(setv #(generic-names brand-names) (@all-names element))
|
||||||
|
(setv matches (| matches (@names.isin generic-names)))
|
||||||
|
(setv matches (| matches (@names.isin brand-names)))
|
||||||
|
(return matches))
|
||||||
|
|
||||||
|
(meth all-names [element]
|
||||||
|
(setv generic-names (set))
|
||||||
|
(setv brand-names (set))
|
||||||
|
(setv main-name (@name element))
|
||||||
|
(when (is-not main-name None) (generic-names.add (.lower main-name)))
|
||||||
|
(ap-find element "synonyms"
|
||||||
|
(for [synonym (.iter it)]
|
||||||
|
(when (and (is-not synonym None) (is-not synonym.text None))
|
||||||
|
(generic-names.add (.lower synonym.text)))))
|
||||||
|
(ap-find element "products"
|
||||||
|
(for [product (.iter it)]
|
||||||
|
(setv brand-name (product.find "name" @namespaces))
|
||||||
|
(if-let brand-name (brand-names.add (.lower brand-name.text)))))
|
||||||
|
(setv generic-names (tuple (filter (fn [s] (not-in "\n" s)) generic-names)))
|
||||||
|
(setv brand-names (tuple (filter (fn [s] (not-in "\n" s)) brand-names)))
|
||||||
|
(return #(generic-names brand-names)))
|
||||||
|
|
||||||
|
(meth cas-number [element]
|
||||||
|
(ap-find element "cas-number" it.text))
|
||||||
|
|
||||||
|
(meth chebi [element]
|
||||||
|
(@from-external-identifiers element "ChEBI"))
|
||||||
|
|
||||||
|
(meth chembl [element]
|
||||||
|
(@from-external-identifiers element "ChEMBL"))
|
||||||
|
|
||||||
|
(meth drugbank [element]
|
||||||
|
(ap-find element "drugbank-id" it.text))
|
||||||
|
|
||||||
|
(meth fda-approval [element]
|
||||||
|
(ap-find element "groups" (in "approved" (tuple (it.itertext)))))
|
||||||
|
|
||||||
|
(meth inchikey [element]
|
||||||
|
(@from-calculated-properties element "InChIKey"))
|
||||||
|
|
||||||
|
(meth indication [element]
|
||||||
|
(ap-find element "indication" it.text))
|
||||||
|
|
||||||
|
(meth mechanism [element]
|
||||||
|
(ap-find element "mechanism-of-action" it.text))
|
||||||
|
|
||||||
|
(meth name [element]
|
||||||
|
(ap-find element "name" it.text))
|
||||||
|
|
||||||
|
(meth prices [element]
|
||||||
|
(ap-find element "prices"
|
||||||
|
(do
|
||||||
|
(setv prices (list))
|
||||||
|
(for [price-element (it.iterfind "price" @namespaces)]
|
||||||
|
(setv price (price-element.find "cost" @namespaces))
|
||||||
|
(if-let price (.append prices (+ price.text (price.attrib.get "currency")))))
|
||||||
|
(return prices))))
|
||||||
|
|
||||||
|
(meth pubchem-compound [element]
|
||||||
|
(@from-external-identifiers element "PubChem Compound"))
|
||||||
|
|
||||||
|
(meth pubchem-substance [element]
|
||||||
|
(@from-external-identifiers element "PubChem Substance"))
|
||||||
|
|
||||||
|
(meth smiles [element]
|
||||||
|
(@from-calculated-properties element "SMILES"))
|
||||||
|
|
||||||
|
(meth unii [element]
|
||||||
|
(ap-find element "unii" it.text))
|
||||||
|
|
||||||
|
(meth from-external-identifiers [element resource-type]
|
||||||
|
(ap-find element "external-identifiers"
|
||||||
|
(for [external-identifier (it.iterfind "external-identifier" @namespaces)]
|
||||||
|
(when (= (external-identifier.findtext "resource" :namespaces @namespaces) resource-type)
|
||||||
|
(return (external-identifier.findtext "identifier" :namespaces @namespaces))))))
|
||||||
|
|
||||||
|
(meth from-calculated-properties [element kind-type]
|
||||||
|
(ap-find element "calculated-properties"
|
||||||
|
(for [property (it.iterfind "property" @namespaces)]
|
||||||
|
(when (= (property.findtext "kind" :namespaces @namespaces) kind-type)
|
||||||
|
(return (property.findtext "value" :namespaces @namespaces)))))))
|
||||||
|
|
||||||
|
|
||||||
|
(defclass DataAugmenter []
|
||||||
|
(defmacro create-var-column [var-name col-name col-initial-value]
|
||||||
|
`(do
|
||||||
|
(setv ~var-name ~col-name)
|
||||||
|
(setv (get self.drug-list ~var-name) ~col-initial-value)))
|
||||||
|
|
||||||
|
(meth __init__ [@filename]
|
||||||
|
(setv @drug-list None)
|
||||||
|
(setv @admet-models None))
|
||||||
|
|
||||||
|
(meth load-drug-queries []
|
||||||
|
(cond
|
||||||
|
(@filename.endswith ".csv")
|
||||||
|
(with [f (open @filename "r")]
|
||||||
|
(setv @drug-list (pd.read-csv f)))
|
||||||
|
(@filename.endswith ".json")
|
||||||
|
(with [f (open @filename "r")]
|
||||||
|
(setv @drug-list (pd.read-json f :orient "records")))
|
||||||
|
True
|
||||||
|
(raise (ValueError "Data file must be .csv or .json")))
|
||||||
|
(return self))
|
||||||
|
|
||||||
|
(meth load-admet-models [models]
|
||||||
|
(setv @admet-models (dict))
|
||||||
|
(for [#(name path) (models.items)]
|
||||||
|
(setv model (cb.CatBoostClassifier))
|
||||||
|
(model.load-model path)
|
||||||
|
(setv (get @admet-models name) model))
|
||||||
|
(return self))
|
||||||
|
|
||||||
|
(meth save-drug-info [filename]
|
||||||
|
(when (is @drug-list None)
|
||||||
|
(raise (ValueError "drug-list must be loaded first.")))
|
||||||
|
(with [f (open filename "w")]
|
||||||
|
(@drug-list.to-json f :orient "records")))
|
||||||
|
|
||||||
|
(meth match-drugbank [filename id-col-name id-type-col-name name-col-name]
|
||||||
|
(when (is @drug-list None)
|
||||||
|
(raise (ValueError "drug-list is not defined. Call load-drug-queries before match-drugbank.")))
|
||||||
|
;; make sure the cols are strings and not lists of strings
|
||||||
|
(setv unwrap-list (fn [x] (if (isinstance x list) (get x 0) x)))
|
||||||
|
(setv id-col (.apply (get @drug-list id-col-name) unwrap-list))
|
||||||
|
(setv id-type-col (.apply (get @drug-list id-type-col-name) unwrap-list))
|
||||||
|
(setv name-col (.apply (get @drug-list name-col-name) unwrap-list))
|
||||||
|
;; tedious column making for what we're about to store
|
||||||
|
;; variable name, column title, initial value
|
||||||
|
(create-var-column cas-column "CAS Registry Number" None)
|
||||||
|
(create-var-column fda-column "FDA Approved" None)
|
||||||
|
(create-var-column indication-column "Indication" None)
|
||||||
|
(create-var-column mechanism-column "Mechanism" None)
|
||||||
|
(create-var-column name-column "DrugBank Name" None)
|
||||||
|
(create-var-column price-column "Prices" (@drug-list.apply (fn [_] (list)) :axis 1))
|
||||||
|
(create-var-column smiles-column "SMILES" None)
|
||||||
|
(create-var-column unii-column "UNII" None)
|
||||||
|
(setv drugbank (DrugBank filename id-col id-type-col name-col))
|
||||||
|
(for [#(matches element) (drugbank.get-matches)]
|
||||||
|
(setv (ncut @drug-list.loc matches cas-column) (drugbank.cas-number element))
|
||||||
|
(setv (ncut @drug-list.loc matches fda-column) (drugbank.fda-approval element))
|
||||||
|
(setv (ncut @drug-list.loc matches indication-column) (drugbank.indication element))
|
||||||
|
(setv (ncut @drug-list.loc matches mechanism-column) (drugbank.mechanism element))
|
||||||
|
(setv (ncut @drug-list.loc matches name-column) (drugbank.name element))
|
||||||
|
(setv (ncut @drug-list.loc matches price-column)
|
||||||
|
(.apply (ncut @drug-list.loc matches price-column) (fn [_] (drugbank.prices element)))) ; prices is a list
|
||||||
|
(setv (ncut @drug-list.loc matches smiles-column) (drugbank.smiles element))
|
||||||
|
(setv (ncut @drug-list.loc matches unii-column) (drugbank.unii element))))
|
||||||
|
|
||||||
|
(meth deduplicate []
|
||||||
|
(when (is @drug-list None)
|
||||||
|
(raise (ValueError "drug-list is not defined. Call load-drug-queries before deduplicate.")))
|
||||||
|
(when (not-in "DrugBank Name" @drug-list.columns)
|
||||||
|
(raise (ValueError "ID data does not exist yet. Run match-drugbank to create it.")))
|
||||||
|
(setv @drug-list
|
||||||
|
(-> @drug-list
|
||||||
|
(.groupby "DrugBank Name")
|
||||||
|
(.agg
|
||||||
|
(fn [x]
|
||||||
|
(setv y [])
|
||||||
|
(for [item x]
|
||||||
|
(if (isinstance item list)
|
||||||
|
(y.extend item)
|
||||||
|
(y.append item)))
|
||||||
|
(setv z (set y))
|
||||||
|
(z.discard None)
|
||||||
|
(cond
|
||||||
|
(= (len z) 0) None
|
||||||
|
(= (len z) 1) (.pop z)
|
||||||
|
True z)))
|
||||||
|
(.reset-index))))
|
||||||
|
|
||||||
|
(meth predict-admet []
|
||||||
|
(when (is @drug-list None)
|
||||||
|
(raise (ValueError "drug-list is not defined. Call load-drug-queries before predict-admet.")))
|
||||||
|
(when (is @admet-models None)
|
||||||
|
(raise (ValueError "admet-models is not defined. Call load-admet-models before predict-admet.")))
|
||||||
|
(when (not-in "SMILES" @drug-list.columns)
|
||||||
|
(raise (ValueError "SMILES data does not exist yet. Run match-drugbank to create it.")))
|
||||||
|
(RDLogger.DisableLog "rdApp.*")
|
||||||
|
(setv smiles-mask (.notna (get @drug-list "SMILES")))
|
||||||
|
(setv smiles (ncut @drug-list.loc smiles-mask "SMILES"))
|
||||||
|
(setv molecules (smiles.apply Chem.MolFromSmiles))
|
||||||
|
(setv molecules-mask (.notna molecules))
|
||||||
|
(setv fingerprints (@get-fingerprints (get molecules molecules-mask)))
|
||||||
|
(setv combined-mask (pd.Series False :index @drug-list.index))
|
||||||
|
(setv (ncut combined-mask.loc (. (get smiles molecules-mask) index)) True)
|
||||||
|
(for [#(name model) (@admet-models.items)]
|
||||||
|
(setv predictions (model.predict-proba fingerprints))
|
||||||
|
(setv (ncut @drug-list.loc combined-mask name) (ncut predictions : 1))))
|
||||||
|
|
||||||
|
(meth get-fingerprints [molecules]
|
||||||
|
(setv fingerprints (list))
|
||||||
|
(fingerprints.append (maplight-gnn.get-morgan-fingerprints molecules))
|
||||||
|
(fingerprints.append (maplight-gnn.get-avalon-fingerprints molecules))
|
||||||
|
(fingerprints.append (maplight-gnn.get-erg-fingerprints molecules))
|
||||||
|
(fingerprints.append (maplight-gnn.get-rdkit-features molecules))
|
||||||
|
(fingerprints.append (maplight-gnn.get-gin-supervised-masking molecules))
|
||||||
|
(np.concatenate fingerprints :axis 1)))
|
||||||
|
|
||||||
|
|
||||||
|
(when (= __name__ "__main__")
|
||||||
|
(setv augmenter
|
||||||
|
(-> (DataAugmenter "data/translator_drugs.json")
|
||||||
|
(.load-drug-queries)
|
||||||
|
(.load-admet-models {"Blood Brain Barrier" "data/admet/bbb_martins-0.916-0.002.dump" "Bioavailability" "data/admet/bioavailability_ma-0.74-0.01.dump" "Human Intestinal Absorption" "data/admet/hia_hou-0.989-0.001.dump"})))
|
||||||
|
(doto augmenter
|
||||||
|
(.match-drugbank "data/src/drugbank.xml" "result_id" "id_type" "result_name")
|
||||||
|
(.deduplicate)
|
||||||
|
(.predict-admet)
|
||||||
|
(.save-drug-info "data/translator_drug_list.json")))
|
||||||
347
test/highlight/data_augmenter.py
Normal file
347
test/highlight/data_augmenter.py
Normal file
|
|
@ -0,0 +1,347 @@
|
||||||
|
import hy
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
hy.macros.require('hyrule', None, target_module_name='data_augmenter', assignments=[['->', '->'], ['doto', 'doto'], ['meth', 'meth'], ['ncut', 'ncut']], prefix='')
|
||||||
|
import catboost as cb
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from rdkit import Chem, RDLogger
|
||||||
|
from tqdm import tqdm
|
||||||
|
import maplight_gnn
|
||||||
|
|
||||||
|
class DrugBank:
    """Match query drugs against the records of a DrugBank XML file.

    The instance holds three aligned columns describing the query drugs
    (identifier, identifier type, name) and streams the XML file, yielding
    for each matching ``drug`` element a boolean mask over the query rows.
    The remaining methods extract individual fields from a ``drug`` element.
    """

    # Default-namespace mapping passed to every ElementTree lookup.
    namespaces = {'': 'http://www.drugbank.ca'}

    # Compiled forms of the Hy local macros defined in the original class
    # body; kept verbatim so the class attributes stay unchanged.
    _hy_local_macro__ap_find = lambda element, name, if_found: hy.models.Expression([hy.models.Symbol('do', from_parser=True), hy.models.Expression([hy.models.Symbol('setv', from_parser=True), hy.models.Symbol('it', from_parser=True), hy.models.Expression([hy.models.Expression([hy.models.Symbol('.', from_parser=True), hy.models.Symbol('None', from_parser=True), hy.models.Symbol('find', from_parser=True)]), element, name, hy.models.Expression([hy.models.Symbol('.', from_parser=True), hy.models.Symbol('self', from_parser=True), hy.models.Symbol('namespaces', from_parser=True)])])]), hy.models.Expression([hy.models.Symbol('if-let', from_parser=True), hy.models.Symbol('it', from_parser=True), if_found])])
    _hy_local_macro__if_let = lambda maybe, execute: hy.models.Expression([hy.models.Symbol('when', from_parser=True), hy.models.Expression([hy.models.Symbol('is-not', from_parser=True), maybe, hy.models.Symbol('None', from_parser=True)]), execute])

    def __init__(self, filename, ids, id_types, names):
        """Store the query columns and the identifier extractors.

        Args:
            filename: Path of the DrugBank XML file to stream.
            ids: Series of query identifiers; its index defines the masks.
            id_types: Series of identifier type labels, compared element-wise
                with ``ids``.
            names: Series of query names; stored lower-cased.
        """
        self.filename = filename
        self.ids = ids
        self.id_types = id_types
        self.names = names.str.lower()
        # Identifier type label -> method extracting that identifier from a
        # drug element.
        self.get_ids = {
            'ChEBI': self.chebi,
            'ChEMBL': self.chembl,
            'drugbank-id': self.drugbank,
            'InChIKey': self.inchikey,
            'PubChem Compound': self.pubchem_compound,
            'PubChem Substance': self.pubchem_substance,
            'unii': self.unii,
        }

    def _child_text(self, element, tag):
        """Return the text of the first ``tag`` child, or None if absent."""
        child = element.find(tag, self.namespaces)
        return child.text if child is not None else None

    def get_matches(self):
        """Yield ``(matches, element)`` for every matching ``drug`` element.

        ``matches`` is the boolean Series produced by ``check_match``;
        elements for which no query row matches are skipped.
        """
        for _, element in tqdm(ET.iterparse(self.filename, ['end'])):
            # Drop the first 24 characters of the tag, which is the length
            # of the '{http://www.drugbank.ca}' namespace prefix.
            if element.tag[24:] != 'drug':
                continue
            matches = self.check_match(element)
            if not matches.any():
                continue
            yield (matches, element)

    def check_match(self, element):
        """Return a boolean Series marking query rows that match ``element``.

        A row matches when its (type, id) pair equals one of the element's
        identifiers, or when its lower-cased name is among the element's
        generic or brand names.
        """
        matches = pd.Series(False, index=self.ids.index)
        for id_type, id_func in self.get_ids.items():
            id_val = id_func(element)
            if id_val is None:
                continue
            matches = matches | ((self.id_types == id_type) & (self.ids == id_val))
        generic_names, brand_names = self.all_names(element)
        matches = matches | self.names.isin(generic_names)
        matches = matches | self.names.isin(brand_names)
        return matches

    def all_names(self, element):
        """Collect lower-cased generic and brand names of a drug element.

        Returns:
            ``(generic_names, brand_names)`` as tuples. Entries containing a
            newline are dropped from both.
        """
        generic_names = set()
        brand_names = set()

        main_name = self.name(element)
        if main_name is not None:
            generic_names.add(main_name.lower())

        synonyms = element.find('synonyms', self.namespaces)
        if synonyms is not None:
            # iter() also visits the container itself.
            for synonym in synonyms.iter():
                if synonym is not None and synonym.text is not None:
                    generic_names.add(synonym.text.lower())

        products = element.find('products', self.namespaces)
        if products is not None:
            for product in products.iter():
                brand_name = product.find('name', self.namespaces)
                # An empty <name/> has text None; skip it instead of
                # raising AttributeError on .lower().
                if brand_name is not None and brand_name.text is not None:
                    brand_names.add(brand_name.text.lower())

        generic_names = tuple(s for s in generic_names if '\n' not in s)
        brand_names = tuple(s for s in brand_names if '\n' not in s)
        return (generic_names, brand_names)

    def cas_number(self, element):
        """Return the text of the ``cas-number`` child, or None."""
        return self._child_text(element, 'cas-number')

    def chebi(self, element):
        """Return the external identifier whose resource is 'ChEBI'."""
        return self.from_external_identifiers(element, 'ChEBI')

    def chembl(self, element):
        """Return the external identifier whose resource is 'ChEMBL'."""
        return self.from_external_identifiers(element, 'ChEMBL')

    def drugbank(self, element):
        """Return the text of the first ``drugbank-id`` child, or None."""
        return self._child_text(element, 'drugbank-id')

    def fda_approval(self, element):
        """Return whether 'approved' is among the ``groups`` texts.

        Returns None when the element has no ``groups`` child.
        """
        groups = element.find('groups', self.namespaces)
        if groups is None:
            return None
        return 'approved' in tuple(groups.itertext())

    def inchikey(self, element):
        """Return the calculated property whose kind is 'InChIKey'."""
        return self.from_calculated_properties(element, 'InChIKey')

    def indication(self, element):
        """Return the text of the ``indication`` child, or None."""
        return self._child_text(element, 'indication')

    def mechanism(self, element):
        """Return the text of the ``mechanism-of-action`` child, or None."""
        return self._child_text(element, 'mechanism-of-action')

    def name(self, element):
        """Return the text of the ``name`` child, or None."""
        return self._child_text(element, 'name')

    def prices(self, element):
        """Return the price strings listed under ``prices``.

        Each entry is the ``cost`` text followed by its ``currency``
        attribute. Returns None when the element has no ``prices`` child.
        """
        container = element.find('prices', self.namespaces)
        if container is None:
            return None
        collected = []
        for price_element in container.iterfind('price', self.namespaces):
            cost = price_element.find('cost', self.namespaces)
            # A cost without text carries no amount; skip it rather than
            # failing on None + str.
            if cost is None or cost.text is None:
                continue
            # A missing currency attribute yields the bare amount instead of
            # a TypeError from str + None.
            collected.append(cost.text + cost.attrib.get('currency', ''))
        return collected

    def pubchem_compound(self, element):
        """Return the external identifier for 'PubChem Compound'."""
        return self.from_external_identifiers(element, 'PubChem Compound')

    def pubchem_substance(self, element):
        """Return the external identifier for 'PubChem Substance'."""
        return self.from_external_identifiers(element, 'PubChem Substance')

    def smiles(self, element):
        """Return the calculated property whose kind is 'SMILES'."""
        return self.from_calculated_properties(element, 'SMILES')

    def unii(self, element):
        """Return the text of the ``unii`` child, or None."""
        return self._child_text(element, 'unii')

    def from_external_identifiers(self, element, resource_type):
        """Return the identifier of the first matching external identifier.

        Looks under ``external-identifiers`` for an ``external-identifier``
        whose ``resource`` text equals ``resource_type``. Returns None when
        the container or a matching entry is absent.
        """
        container = element.find('external-identifiers', self.namespaces)
        if container is None:
            return None
        for external_identifier in container.iterfind('external-identifier', self.namespaces):
            if external_identifier.findtext('resource', namespaces=self.namespaces) == resource_type:
                return external_identifier.findtext('identifier', namespaces=self.namespaces)
        return None

    def from_calculated_properties(self, element, kind_type):
        """Return the value of the first matching calculated property.

        Looks under ``calculated-properties`` for a ``property`` whose
        ``kind`` text equals ``kind_type``. Returns None when the container
        or a matching entry is absent.
        """
        container = element.find('calculated-properties', self.namespaces)
        if container is None:
            return None
        for prop in container.iterfind('property', self.namespaces):
            if prop.findtext('kind', namespaces=self.namespaces) == kind_type:
                return prop.findtext('value', namespaces=self.namespaces)
        return None
|
||||||
|
|
||||||
|
class DataAugmenter:
    """Pipeline that enriches a table of query drugs.

    The table is loaded from a .csv or .json file, annotated with fields
    taken from a DrugBank XML file, collapsed per DrugBank name, extended
    with one prediction column per loaded ADMET model, and written to JSON.
    """

    # Compiled form of the Hy local macro defined in the original class
    # body; kept verbatim so the class attribute stays unchanged.
    _hy_local_macro__create_var_column = lambda var_name, col_name, col_initial_value: hy.models.Expression([hy.models.Symbol('do', from_parser=True), hy.models.Expression([hy.models.Symbol('setv', from_parser=True), var_name, col_name]), hy.models.Expression([hy.models.Symbol('setv', from_parser=True), hy.models.Expression([hy.models.Symbol('get', from_parser=True), hy.models.Expression([hy.models.Symbol('.', from_parser=True), hy.models.Symbol('self', from_parser=True), hy.models.Symbol('drug-list', from_parser=True)]), var_name]), col_initial_value])])

    def __init__(self, filename):
        """Remember the query file path; nothing is loaded yet."""
        self.filename = filename
        self.drug_list = None
        self.admet_models = None

    def load_drug_queries(self):
        """Read the query table from ``self.filename`` and return ``self``.

        Raises:
            ValueError: if the file name ends in neither .csv nor .json.
        """
        source = self.filename
        if source.endswith('.csv'):
            reader, options = pd.read_csv, {}
        elif source.endswith('.json'):
            reader, options = pd.read_json, {'orient': 'records'}
        else:
            raise ValueError('Data file must be .csv or .json')
        with open(source, 'r') as handle:
            self.drug_list = reader(handle, **options)
        return self

    def load_admet_models(self, models):
        """Load one CatBoost classifier per ``name -> path`` entry.

        Returns ``self`` so calls can be chained.
        """
        self.admet_models = {}
        for label, model_path in models.items():
            classifier = cb.CatBoostClassifier()
            classifier.load_model(model_path)
            self.admet_models[label] = classifier
        return self

    def save_drug_info(self, filename):
        """Write the drug table to ``filename`` as JSON records.

        Returns whatever ``DataFrame.to_json`` returns for the open file.

        Raises:
            ValueError: if no drug table has been loaded.
        """
        if self.drug_list is None:
            raise ValueError('drug-list must be loaded first.')
        with open(filename, 'w') as sink:
            return self.drug_list.to_json(sink, orient='records')

    def match_drugbank(self, filename, id_col_name, id_type_col_name, name_col_name):
        """Annotate the drug table with fields from a DrugBank XML file.

        Args:
            filename: Path of the DrugBank XML file.
            id_col_name: Column holding the query identifiers.
            id_type_col_name: Column holding the identifier type labels.
            name_col_name: Column holding the query names.

        Raises:
            ValueError: if no drug table has been loaded.
        """
        if self.drug_list is None:
            raise ValueError('drug-list is not defined. Call load-drug-queries before match-drugbank.')

        def first_if_list(value):
            # Cells that hold a list are represented by their first item.
            if isinstance(value, list):
                return value[0]
            return value

        table = self.drug_list
        query_ids = table[id_col_name].apply(first_if_list)
        query_id_types = table[id_type_col_name].apply(first_if_list)
        query_names = table[name_col_name].apply(first_if_list)

        prices_label = 'Prices'
        # Output columns in creation order, each paired with the name of
        # the DrugBank method that supplies its value.
        fields = (
            ('CAS Registry Number', 'cas_number'),
            ('FDA Approved', 'fda_approval'),
            ('Indication', 'indication'),
            ('Mechanism', 'mechanism'),
            ('DrugBank Name', 'name'),
            (prices_label, 'prices'),
            ('SMILES', 'smiles'),
            ('UNII', 'unii'),
        )
        for label, _ in fields:
            if label == prices_label:
                # Every row starts with its own empty list.
                table[label] = table.apply(lambda _row: list(), axis=1)
            else:
                table[label] = None

        drugbank = DrugBank(filename, query_ids, query_id_types, query_names)
        for matches, element in drugbank.get_matches():
            for label, extractor_name in fields:
                extract = getattr(drugbank, extractor_name)
                if label == prices_label:
                    # Assign the list cell by cell through apply.
                    table.loc[matches, label] = table.loc[matches, label].apply(
                        lambda _cell: extract(element)
                    )
                else:
                    table.loc[matches, label] = extract(element)

    def deduplicate(self):
        """Collapse the drug table to one row per 'DrugBank Name'.

        Within each group a column becomes None when it has no non-None
        value, the single value when there is exactly one distinct value,
        and a set of the distinct values otherwise.

        Raises:
            ValueError: if no drug table is loaded, or the 'DrugBank Name'
                column does not exist.
        """
        if self.drug_list is None:
            raise ValueError('drug-list is not defined. Call load-drug-queries before deduplicate.')
        if 'DrugBank Name' not in self.drug_list.columns:
            raise ValueError('ID data does not exist yet. Run match-drugbank to create it.')

        def collapse(values):
            flattened = []
            for value in values:
                if isinstance(value, list):
                    flattened.extend(value)
                else:
                    flattened.append(value)
            distinct = set(flattened)
            distinct.discard(None)
            if len(distinct) == 0:
                return None
            if len(distinct) == 1:
                return distinct.pop()
            return distinct

        grouped = self.drug_list.groupby('DrugBank Name')
        self.drug_list = grouped.agg(collapse).reset_index()

    def predict_admet(self):
        """Add one prediction column per ADMET model to the drug table.

        Only rows whose SMILES value is present and is turned into a
        non-null molecule by ``Chem.MolFromSmiles`` receive predictions.

        Raises:
            ValueError: if the drug table or the models are not loaded, or
                the 'SMILES' column does not exist.
        """
        if self.drug_list is None:
            raise ValueError('drug-list is not defined. Call load-drug-queries before predict-admet.')
        if self.admet_models is None:
            raise ValueError('admet-models is not defined. Call load-admet-models before predict-admet.')
        if 'SMILES' not in self.drug_list.columns:
            raise ValueError('SMILES data does not exist yet. Run match-drugbank to create it.')

        RDLogger.DisableLog('rdApp.*')

        has_smiles = self.drug_list['SMILES'].notna()
        smiles = self.drug_list.loc[has_smiles, 'SMILES']
        molecules = smiles.apply(Chem.MolFromSmiles)
        parsed = molecules.notna()
        fingerprints = self.get_fingerprints(molecules[parsed])

        # Mask over the whole table selecting the rows that were parsed.
        usable_rows = pd.Series(False, index=self.drug_list.index)
        usable_rows.loc[smiles[parsed].index] = True

        for label, classifier in self.admet_models.items():
            probabilities = classifier.predict_proba(fingerprints)
            # Column 1 of the probability output is stored.
            self.drug_list.loc[usable_rows, label] = probabilities[:, 1]

    def get_fingerprints(self, molecules):
        """Return all maplight_gnn feature blocks concatenated on axis 1."""
        blocks = [
            maplight_gnn.get_morgan_fingerprints(molecules),
            maplight_gnn.get_avalon_fingerprints(molecules),
            maplight_gnn.get_erg_fingerprints(molecules),
            maplight_gnn.get_rdkit_features(molecules),
            maplight_gnn.get_gin_supervised_masking(molecules),
        ]
        return np.concatenate(blocks, axis=1)
|
||||||
|
# Script entry point: runs the whole augmentation pipeline when executed
# directly. The underscore-prefixed names are temporaries emitted by the Hy
# compiler; they are kept so the module's top-level bindings stay the same.
if __name__ == '__main__':
    # load_drug_queries and load_admet_models both return the augmenter,
    # which is what allows this chained construction.
    augmenter = DataAugmenter('data/translator_drugs.json').load_drug_queries().load_admet_models({'Blood Brain Barrier': 'data/admet/bbb_martins-0.916-0.002.dump', 'Bioavailability': 'data/admet/bioavailability_ma-0.74-0.01.dump', 'Human Intestinal Absorption': 'data/admet/hia_hou-0.989-0.001.dump'})
    # Pipeline steps, applied in order to the same instance.
    _hy_gensym_f_1 = augmenter
    _hy_gensym_f_1.match_drugbank('data/src/drugbank.xml', 'result_id', 'id_type', 'result_name')
    _hy_gensym_f_1.deduplicate()
    _hy_gensym_f_1.predict_admet()
    _hy_gensym_f_1.save_drug_info('data/translator_drug_list.json')
    _hy_anon_var_25 = _hy_gensym_f_1
else:
    # On import nothing runs; only the placeholder name is bound.
    _hy_anon_var_25 = None
|
||||||
Loading…
Add table
Add a link
Reference in a new issue