2019
Althubaiti, Sara; Karwath, Andreas; Dallol, Ashraf; Noor, Adeeb; Alkhayyat, Shadi Salem; Alwassia, Rolina; Mineta, Katsuhiko; Gojobori, Takashi; Beggs, Andrew D; Schofield, Paul N; Gkoutos, Georgios V; Hoehndorf, Robert
Ontology-based prediction of cancer driver genes Journal Article
In: Scientific Reports, vol. 9, no. 1, pp. 17405, 2019, ISSN: 2045-2322.
Links | BibTeX | Tags: bioinformatics, cancer, health data science
@article{RN16,
  author    = {Sara Althubaiti and Andreas Karwath and Ashraf Dallol and Adeeb Noor and Shadi Salem Alkhayyat and Rolina Alwassia and Katsuhiko Mineta and Takashi Gojobori and Andrew D Beggs and Paul N Schofield and Georgios V Gkoutos and Robert Hoehndorf},
  title     = {Ontology-based prediction of cancer driver genes},
  journal   = {Scientific Reports},
  volume    = {9},
  number    = {1},
  pages     = {17405},
  year      = {2019},
  date      = {2019-01-01},
  issn      = {2045-2322},
  doi       = {10.1038/s41598-019-53454-1},
  url       = {https://doi.org/10.1038/s41598-019-53454-1},
  urldate   = {2019-01-01},
  keywords  = {bioinformatics, cancer, health data science},
  pubstate  = {published},
  tppubtype = {article}
}
2007
Karwath, Andreas; Kersting, Kristian
Relational Sequence Alignments and Logos Conference
Inductive Logic Programming, 16th International Conference, ILP 2006, vol. 4455, Lecture Notes in Computer Science, Springer Verlag, Berlin Heidelberg, Germany, 2007, ISBN: 978-3-540-73846-6.
Abstract | Links | BibTeX | Tags: bioinformatics, inductive logic programming, relational learning, scientific knowledge
@inproceedings{karwath2007,
  author    = {Andreas Karwath and Kristian Kersting},
  title     = {Relational Sequence Alignments and Logos},
  booktitle = {Inductive Logic Programming, 16th International Conference, ILP 2006},
  series    = {Lecture Notes in Computer Science},
  volume    = {4455},
  pages     = {290--304},
  publisher = {Springer Verlag},
  address   = {Berlin Heidelberg, Germany},
  year      = {2007},
  date      = {2007-01-01},
  isbn      = {978-3-540-73846-6},
  doi       = {10.1007/978-3-540-73847-3_29},
  url       = {http://dx.doi.org/10.1007/978-3-540-73847-3_29},
  crossref  = {DBLP:conf/ilp/2006},
  abstract  = {The need to measure sequence similarity arises in many application domains and often coincides with sequence alignment: the more similar two sequences are, the better they can be aligned. Aligning sequences not only shows how similar sequences are, it also shows where there are differences and correspondences between the sequences.
Traditionally, the alignment has been considered for sequences of flat symbols only. Many real world sequences such as natural language sentences and protein secondary structures, however, exhibit rich internal structures. This is akin to the problem of dealing with structured examples studied in the field of inductive logic programming (ILP). In this paper, we introduce Real, which is a powerful, yet simple approach to align sequence of structured symbols using well-established ILP distance measures within traditional alignment methods. Although straight-forward, experiments on protein data and Medline abstracts show that this approach works well in practice, that the resulting alignments can indeed provide more information than flat ones, and that they are meaningful to experts when represented graphically.},
  keywords  = {bioinformatics, inductive logic programming, relational learning, scientific knowledge},
  pubstate  = {published},
  tppubtype = {conference}
}
Traditionally, the alignment has been considered for sequences of flat symbols only. Many real world sequences such as natural language sentences and protein secondary structures, however, exhibit rich internal structures. This is akin to the problem of dealing with structured examples studied in the field of inductive logic programming (ILP). In this paper, we introduce Real, which is a powerful, yet simple approach to align sequence of structured symbols using well-established ILP distance measures within traditional alignment methods. Although straight-forward, experiments on protein data and Medline abstracts show that this approach works well in practice, that the resulting alignments can indeed provide more information than flat ones, and that they are meaningful to experts when represented graphically.
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Logic and the Automatic Acquisition of Scientific Knowledge: An Application to Functional Genomics Conference
Computational Discovery of Scientific Knowledge, Introduction, Techniques, and Applications in Environmental and Life Sciences, vol. 4660, Lecture Notes in Computer Science, Springer Verlag, Berlin Heidelberg, Germany, 2007, ISBN: 978-3-540-73919-7.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge
@inproceedings{king2007,
  author    = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  title     = {Logic and the Automatic Acquisition of Scientific Knowledge: An Application to Functional Genomics},
  booktitle = {Computational Discovery of Scientific Knowledge, Introduction, Techniques, and Applications in Environmental and Life Sciences},
  series    = {Lecture Notes in Computer Science},
  volume    = {4660},
  pages     = {273--289},
  publisher = {Springer Verlag},
  address   = {Berlin Heidelberg, Germany},
  year      = {2007},
  date      = {2007-01-01},
  isbn      = {978-3-540-73919-7},
  doi       = {10.1007/978-3-540-73920-3_13},
  url       = {http://dx.doi.org/10.1007/978-3-540-73920-3_13},
  crossref  = {DBLP:conf/dis/2007book},
  abstract  = {This paper is a manifesto aimed at computer scientists interested in developing and applying scientific discovery methods. It argues that: science is experiencing an unprecedented ``explosion'' in the amount of available data; traditional data analysis methods cannot deal with this increased quantity of data; there is an urgent need to automate the process of refining scientific data into scientific knowledge; inductive logic programming (ILP) is a data analysis framework well suited for this task; and exciting new scientific discoveries can be achieved using ILP scientific discovery methods. We describe an example of using ILP to analyse a large and complex bioinformatic database that has produced unexpected and interesting scientific results in functional genomics. We then point a possible way forward to integrating machine learning with scientific databases to form intelligent databases.},
  keywords  = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate  = {published},
  tppubtype = {conference}
}
2006
Karwath, Andreas; Kersting, Kristian
Relational Sequence Alignments Conference
Proc. The 4th International Workshop on Mining and Learning with Graphs, MLG 2006 (editors: Thomas Gärtner, Gemma C. Garriga and Thorsten Meinl), September 2006, (workshop).
BibTeX | Tags: bioinformatics, cheminformatics, relational learning, scientific knowledge
@inproceedings{karwath06b,
  author    = {Andreas Karwath and Kristian Kersting},
  title     = {Relational Sequence Alignments},
  booktitle = {Proc. The 4th International Workshop on Mining and Learning with Graphs, MLG 2006},
  editor    = {Thomas G{\"a}rtner and Gemma C. Garriga and Thorsten Meinl},
  pages     = {149--156},
  year      = {2006},
  month     = sep,
  date      = {2006-01-01},
  note      = {workshop},
  keywords  = {bioinformatics, cheminformatics, relational learning, scientific knowledge},
  pubstate  = {published},
  tppubtype = {conference}
}
Clare, Amanda; Karwath, Andreas; Ougham, Helen; King, Ross D.
Functional bioinformatics for Arabidopsis thaliana Journal Article
In: Bioinformatics, vol. 22, no. 9, pp. 1130-1136, 2006.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge
@article{karwath06a,
  author    = {Amanda Clare and Andreas Karwath and Helen Ougham and Ross D. King},
  title     = {Functional bioinformatics for {Arabidopsis} thaliana},
  journal   = {Bioinformatics},
  volume    = {22},
  number    = {9},
  pages     = {1130--1136},
  year      = {2006},
  date      = {2006-01-01},
  doi       = {10.1093/bioinformatics/btl051},
  url       = {https://bioinformatics.oxfordjournals.org/content/22/9/1130.full.pdf+html},
  abstract  = {Motivation: The genome of Arabidopsis thaliana, which has the best understood plant genome, still has approximately one-third of its genes with no functional annotation at all from either MIPS or TAIR. We have applied our Data Mining Prediction (DMP) method to the problem of predicting the functional classes of these protein sequences. This method is based on using a hybrid machine-learning/data-mining method to identify patterns in the bioinformatic data about sequences that are predictive of function. We use data about sequence, predicted secondary structure, predicted structural domain, InterPro patterns, sequence similarity profile and expressions data.
Results: We predicted the functional class of a high percentage of the Arabidopsis genes with currently unknown function. These predictions are interpretable and have good test accuracies. We describe in detail seven of the rules produced.
Availability: Rulesets are available at http://www.aber.ac.uk/compsci/Research/bio/dss/arabpreds/ and predictions are available at http://www.genepredictions.org
Contact: afc@aber.ac.uk},
  keywords  = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate  = {published},
  tppubtype = {article}
}
Results: We predicted the functional class of a high percentage of the Arabidopsis genes with currently unknown function. These predictions are interpretable and have good test accuracies. We describe in detail seven of the rules produced.
Availability: Rulesets are available at http://www.aber.ac.uk/compsci/Research/bio/dss/arabpreds/ and predictions are available at http://www.genepredictions.org
Contact: afc@aber.ac.uk
2002
Karwath, Andreas; King, Ross D.
Homology Induction: the use of machine learning to improve sequence similarity searches Journal Article
In: BMC Bioinformatics, vol. 3, no. 1, 2002.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning
@article{karwath02a,
  author    = {Andreas Karwath and Ross D. King},
  title     = {Homology Induction: the use of machine learning to improve sequence similarity searches},
  journal   = {BMC Bioinformatics},
  volume    = {3},
  number    = {1},
  year      = {2002},
  date      = {2002-04-23},
  doi       = {10.1186/1471-2105-3-11},
  url       = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-3-11},
  abstract  = {Background
The inference of homology between proteins is a key problem in molecular biology. The current best approaches only identify {\textasciitilde}50\% of homologies (with a false positive rate set at 1/1000).
Results
We present Homology Induction (HI), a new approach to inferring homology. HI uses machine learning to bootstrap from standard sequence similarity search methods. First a standard method is run, then HI learns rules which are true for sequences of high similarity to the target (assumed homologues) and not true for general sequences, these rules are then used to discriminate sequences in the twilight zone. To learn the rules HI describes the sequences in a novel way based on a bioinformatic knowledge base, and the machine learning method of inductive logic programming. To evaluate HI we used the PDB40D benchmark which lists sequences of known homology but low sequence similarity. We compared the HI methodology with PSI-BLAST alone and found HI performed significantly better. In addition, Receiver Operating Characteristic (ROC) curve analysis showed that these improvements were robust for all reasonable error costs. The predictive homology rules learnt by HI can be interpreted biologically to provide insight into conserved features of homologous protein families.
Conclusions
HI is a new technique for the detection of remote protein homology -- a central bioinformatic problem. HI with PSI-BLAST is shown to outperform PSI-BLAST for all error costs. It is expected that similar improvements would be obtained using HI with any sequence similarity method.},
  keywords  = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning},
  pubstate  = {published},
  tppubtype = {article}
}
The inference of homology between proteins is a key problem in molecular biology. The current best approaches only identify ~50% of homologies (with a false positive rate set at 1/1000).
Results
We present Homology Induction (HI), a new approach to inferring homology. HI uses machine learning to bootstrap from standard sequence similarity search methods. First a standard method is run, then HI learns rules which are true for sequences of high similarity to the target (assumed homologues) and not true for general sequences, these rules are then used to discriminate sequences in the twilight zone. To learn the rules HI describes the sequences in a novel way based on a bioinformatic knowledge base, and the machine learning method of inductive logic programming. To evaluate HI we used the PDB40D benchmark which lists sequences of known homology but low sequence similarity. We compared the HI methodology with PSI-BLAST alone and found HI performed significantly better. In addition, Receiver Operating Characteristic (ROC) curve analysis showed that these improvements were robust for all reasonable error costs. The predictive homology rules learnt by HI can be interpreted biologically to provide insight into conserved features of homologous protein families.
Conclusions
HI is a new technique for the detection of remote protein homology – a central bioinformatic problem. HI with PSI-BLAST is shown to outperform PSI-BLAST for all error costs. It is expected that similar improvements would be obtained using HI with any sequence similarity method.
Karwath, Andreas
Large Logical Databases and their Applications to Molecular Biology PhD Thesis
University of Wales, Aberystwyth, 2002.
BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge
@phdthesis{karwath02b,
  author    = {Andreas Karwath},
  title     = {Large Logical Databases and their Applications to Molecular Biology},
  school    = {University of Wales, Aberystwyth},
  year      = {2002},
  date      = {2002-01-01},
  keywords  = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate  = {published},
  tppubtype = {phdthesis}
}
2001
Karwath, Andreas; King, Ross D.
An automated ILP server in the field of bioinformatics Conference
The Eleventh International Conference on Inductive Logic Programming, ILP 2001, vol. 2157, Lecture Notes in Computer Science, Springer Verlag, Berlin Heidelberg, Germany, 2001, ISBN: 978-3-540-42538-0.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning
@inproceedings{Karwath2001,
  author    = {Andreas Karwath and Ross D. King},
  title     = {An automated {ILP} server in the field of bioinformatics},
  editor    = {C{\'e}line Rouveirol and Mich{\`e}le Sebag},
  booktitle = {The Eleventh International Conference on Inductive Logic Programming, ILP 2001},
  series    = {Lecture Notes in Computer Science},
  volume    = {2157},
  pages     = {91--103},
  publisher = {Springer Verlag},
  address   = {Berlin Heidelberg, Germany},
  year      = {2001},
  date      = {2001-09-09},
  isbn      = {978-3-540-42538-0},
  doi       = {10.1007/3-540-44797-0_8},
  url       = {http://link.springer.com/chapter/10.1007%2F3-540-44797-0_8},
  abstract  = {The identification of evolutionary related (homologous) proteins is a key problem in molecular biology. Here we present an inductive logic programming based method, Homology Induction (HI), which acts as a filter for existing sequence similarity searches to improve their performance in the detection of remote protein homologies. HI performs a PSI-BLAST search to generate positive, negative, and uncertain examples, and collects descriptions of these examples. It then learns rules to discriminate the positive and negative examples. The rules are used to filter the uncertain examples in the ``twilight zone''. HI uses a multitable database of 51,430,710 pre-fabricated facts from a variety of biological sources, and the inductive logic programming system Aleph to induce rules. HI was tested on an independent set of protein sequences with equal or less than 40 per cent sequence similarity (PDB40D). ROC analysis is performed showing that HI can significantly improve existing similarity searches. The method is automated and can be used via a web/mail interface.},
  keywords  = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning},
  pubstate  = {published},
  tppubtype = {conference}
}
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
The utility of different representations of protein sequence for predicting functional class Journal Article
In: Bioinformatics, vol. 17, no. 5, pp. 445-454, 2001.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge
@article{King2001a,
  author    = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  title     = {The utility of different representations of protein sequence for predicting functional class},
  journal   = {Bioinformatics},
  volume    = {17},
  number    = {5},
  pages     = {445--454},
  year      = {2001},
  date      = {2001-01-19},
  doi       = {10.1093/bioinformatics/17.5.445},
  url       = {https://bioinformatics.oxfordjournals.org/content/17/5/445},
  abstract  = {Motivation: Data Mining Prediction (DMP) is a novel approach to predicting protein functional class from sequence. DMP works even in the absence of a homologous protein of known function. We investigate the utility of different ways of representing protein sequence in DMP (residue frequencies, phylogeny, predicted structure) using the Escherichia coli genome as a model.
Results: Using the different representations DMP learnt prediction rules that were more accurate than default at every level of function using every type of representation. The most effective way to represent sequence was using phylogeny (75\% accuracy and 13\% coverage of unassigned ORFs at the most general level of function: 69\% accuracy and 7\% coverage at the most detailed). We tested different methods for combining predictions from the different types of representation. These improved both the accuracy and coverage of predictions, e.g. 40\% of all unassigned ORFs could be predicted at an estimated accuracy of 60\% and 5\% of unassigned ORFs could be predicted at an estimated accuracy of 86\%.
Availability: The rules and data are freely available. Warmr is free to academics.
Contact: rdk@aber.ac.uk},
  keywords  = {bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge},
  pubstate  = {published},
  tppubtype = {article}
}
Results: Using the different representations DMP learnt prediction rules that were more accurate than default at every level of function using every type of representation. The most effective way to represent sequence was using phylogeny (75% accuracy and 13% coverage of unassigned ORFs at the most general level of function: 69% accuracy and 7% coverage at the most detailed). We tested different methods for combining predictions from the different types of representation. These improved both the accuracy and coverage of predictions, e.g. 40% of all unassigned ORFs could be predicted at an estimated accuracy of 60% and 5% of unassigned ORFs could be predicted at an estimated accuracy of 86%.
Availability: The rules and data are freely available. Warmr is free to academics.
Contact: rdk@aber.ac.uk
2000
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Accurate prediction of protein functional class from sequence in the Mycobacterium tuberculosis and Escherichia coli genomes using data mining Journal Article
In: Yeast (Comparative and Functional Genomics), vol. 17, pp. 283-293, 2000.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge
@article{king00a,
  author    = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  title     = {Accurate prediction of protein functional class from sequence in the {Mycobacterium} tuberculosis and {Escherichia} coli genomes using data mining},
  journal   = {Yeast (Comparative and Functional Genomics)},
  volume    = {17},
  number    = {4},
  pages     = {283--293},
  year      = {2000},
  date      = {2000-12-08},
  doi       = {10.1002/1097-0061(200012)17:4<283::AID-YEA52>3.0.CO;2-F},
  url       = {http://onlinelibrary.wiley.com/doi/10.1002/1097-0061(200012)17:4%3C283::AID-YEA52%3E3.0.CO;2-F/abstract},
  abstract  = {The analysis of genomics data needs to become as automated as its generation. Here we present a novel data-mining approach to predicting protein functional class from sequence. This method is based on a combination of inductive logic programming clustering and rule learning. We demonstrate the effectiveness of this approach on the M. tuberculosis and E. coli genomes, and identify biologically interpretable rules which predict protein functional class from information only available from the sequence. These rules predict 65\% of the ORFs with no assigned function in M. tuberculosis and 24\% of those in E. coli, with an estimated accuracy of 60--80\% (depending on the level of functional assignment). The rules are founded on a combination of detection of remote homology, convergent evolution and horizontal gene transfer. We identify rules that predict protein functional class even in the absence of detectable sequence or structural homology. These rules give insight into the evolutionary history of M. tuberculosis and E. coli.},
  keywords  = {bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge},
  pubstate  = {published},
  tppubtype = {article}
}
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Logic and the Automatic Acquisition of Scientific Knowledge Journal Article
In: EACIS (Electronic Articles in Computer and Information Science), vol. 5, no. 031, 2000.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, scientific knowledge
@article{King2000c,
  author    = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  title     = {Logic and the Automatic Acquisition of Scientific Knowledge},
  journal   = {EACIS (Electronic Articles in Computer and Information Science)},
  volume    = {5},
  number    = {031},
  year      = {2000},
  date      = {2000-12-01},
  url       = {http://www.ida.liu.se/ext/epa/cis/mi-17/02/orig.html},
  abstract  = {This paper is a manifesto. It argues that:
Science is experiencing an unprecedented "explosion" in the amount of available data.
Traditional data analysis methods cannot deal with this increased quantity of data.
There is therefore an urgent need to automate the process of refining scientific data into scientific knowledge.
Inductive logic programming (ILP) is the data analysis framework best suited for this task.
We describe an example of using ILP to analyse a large and complex bioinformatic database which produced unexpected and interesting scientific results. We then point a possible way forward to integrating machine learning with scientific databases to form intelligent inductive databases.},
  keywords  = {bioinformatics, data mining, scientific knowledge},
  pubstate  = {published},
  tppubtype = {article}
}
Science is experiencing an unprecedented "explosion" in the amount of available data.
Traditional data analysis methods cannot deal with this increased quantity of data.
There is therefore an urgent need to automate the process of refining scientific data into scientific knowledge.
Inductive logic programming (ILP) is the data analysis framework best suited for this task.
We describe an example of using ILP to analyse a large and complex bioinformatic database which produced unexpected and interesting scientific results. We then point a possible way forward to integrating machine learning with scientific databases to form intelligent inductive databases.
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Genome scale prediction of protein functional class from sequence using data mining Conference
The Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD 2000, The Association for Computing Machinery, New York, USA, 2000.
Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, relational learning
@inproceedings{King2000,
  author    = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  title     = {Genome scale prediction of protein functional class from sequence using data mining},
  editor    = {Raghu Ramakrishnan and S. Stolfo and R. Bayardo and I. Parsa},
  booktitle = {The Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD 2000},
  pages     = {384--389},
  publisher = {The Association for Computing Machinery},
  address   = {New York, USA},
  year      = {2000},
  date      = {2000-01-01},
  doi       = {10.1145/347090.347172},
  url       = {http://doi.acm.org/10.1145/347090.347172},
  keywords  = {bioinformatics, data mining, inductive logic programming, relational learning},
  pubstate  = {published},
  tppubtype = {conference}
}