2018
Geilke, Michael; Karwath, Andreas; Frank, Eibe; Kramer, Stefan
Online estimation of discrete, continuous, and conditional joint densities using classifier chains Journal Article
In: Data Mining and Knowledge Discovery, vol. 32, no. 3, pp. 561-603, 2018, ISSN: 1384-5810.
Abstract | Links | BibTeX | Tags: artificial intelligence, data mining, density estimation, machine learning, stream mining
% geilke2018a: journal article in Data Mining and Knowledge Discovery 32(3).
% The page range uses a double hyphen so styles typeset it as an en-dash.
@article{geilke2018a,
  title = {Online estimation of discrete, continuous, and conditional joint densities using classifier chains},
  author = {Michael Geilke and Andreas Karwath and Eibe Frank and Stefan Kramer},
  url = {https://doi.org/10.1007/s10618-017-0546-6},
  doi = {10.1007/s10618-017-0546-6},
  issn = {1384-5810},
  year = {2018},
  date = {2018-05-01},
  urldate = {2018-05-01},
  journal = {Data Mining and Knowledge Discovery},
  volume = {32},
  number = {3},
  pages = {561--603},
  publisher = {Springer US},
  abstract = {We address the problem of estimating discrete, continuous, and conditional joint densities online, i.e., the algorithm is only provided the current example and its current estimate for its update. The family of proposed online density estimators, estimation of densities online (EDO), uses classifier chains to model dependencies among features, where each classifier in the chain estimates the probability of one particular feature. Because a single chain may not provide a reliable estimate, we also consider ensembles of classifier chains and ensembles of weighted classifier chains. For all density estimators, we provide consistency proofs and propose algorithms to perform certain inference tasks. The empirical evaluation of the estimators is conducted in several experiments and on datasets of up to several millions of instances. In the discrete case, we compare our estimators to density estimates computed by Bayesian structure learners. In the continuous case, we compare them to a state-of-the-art online density estimator. Our experiments demonstrate that, even though designed to work online, EDO delivers estimators of competitive accuracy compared to other density estimators (batch Bayesian structure learners on discrete datasets and the state-of-the-art online density estimator on continuous datasets). Besides achieving similar performance in these cases, EDO is also able to estimate densities with mixed types of variables, i.e., discrete and continuous random variables.},
  keywords = {artificial intelligence, data mining, density estimation, machine learning, stream mining},
  pubstate = {published},
  tppubtype = {article}
}
2016
Geilke, Michael; Karwath, Andreas; Kramer, Stefan
Online density estimation of heterogeneous data streams in higher dimensions Conference
Machine learning and knowledge discovery in databases : European Conference, ECML PKDD 2016, Riva del Garda, Italy, September 19-23, 2016 : Proceedings Part 1, 2016.
Abstract | Links | BibTeX | Tags: data mining, density estimation, stream mining
% geilke2016: paper in the ECML PKDD 2016 proceedings (part 1).
% The doi field holds the bare DOI (no "doi:" prefix); howpublished is a
% repository link, presumably an institutional copy -- TODO confirm.
@inproceedings{geilke2016,
  title = {Online density estimation of heterogeneous data streams in higher dimensions},
  author = {Michael Geilke and Andreas Karwath and Stefan Kramer},
  url = {http://link.springer.com/chapter/10.1007/978-3-319-46128-1_5},
  doi = {10.1007/978-3-319-46128-1_5},
  year = {2016},
  date = {2016-01-01},
  urldate = {2016-01-01},
  booktitle = {Machine learning and knowledge discovery in databases : European Conference, ECML PKDD 2016, Riva del Garda, Italy, September 19-23, 2016 : Proceedings Part 1},
  pages = {65--80},
  abstract = {The joint density of a data stream is suitable for performing data mining tasks without having access to the original data. However, the methods proposed so far only target a small to medium number of variables, since their estimates rely on representing all the interdependencies between the variables of the data. High-dimensional data streams, which are becoming more and more frequent due to increasing numbers of interconnected devices, are, therefore, pushing these methods to their limits. To mitigate these limitations, we present an approach that projects the original data stream into a vector space and uses a set of representatives to provide an estimate. Due to the structure of the estimates, it enables the density estimation of higher-dimensional data and approaches the true density with increasing dimensionality of the vector space. Moreover, it is not only designed to estimate homogeneous data, i.e., where all variables are nominal or all variables are numeric, but it can also estimate heterogeneous data. The evaluation is conducted on synthetic and real-world data. The software related to this paper is available at https://github.com/geilke/mideo.},
  howpublished = {\url{https://publications.UB.Uni-Mainz.DE/opus/frontdoor.php?source_opus=54808}},
  keywords = {data mining, density estimation, stream mining},
  pubstate = {published},
  tppubtype = {conference}
}
2014
Gütlein, Martin; Karwath, Andreas; Kramer, Stefan
CheS-Mapper 2.0 for visual validation of (Q)SAR models Journal Article
In: J. Cheminformatics, vol. 6, no. 1, pp. 41, 2014.
Abstract | Links | BibTeX | Tags: cheminformatics, data mining, graph mining, validation, visualization
% gutlein2014: journal article; the pages field holds the article number (41).
% Braces in the title protect the tool name and the (Q)SAR acronym from
% style-driven lowercasing.
@article{gutlein2014,
  title = {{CheS-Mapper} 2.0 for visual validation of {(Q)SAR} models},
  author = {Martin Gütlein and Andreas Karwath and Stefan Kramer},
  url = {http://dx.doi.org/10.1186/s13321-014-0041-7},
  doi = {10.1186/s13321-014-0041-7},
  year = {2014},
  date = {2014-09-23},
  journal = {Journal of Cheminformatics},
  volume = {6},
  number = {1},
  pages = {41},
  abstract = {Background
Sound statistical validation is important to evaluate and compare the overall performance of (Q)SAR models. However, classical validation does not support the user in better understanding the properties of the model or the underlying data. Even though, a number of visualization tools for analyzing (Q)SAR information in small molecule datasets exist, integrated visualization methods that allow the investigation of model validation results are still lacking.
Results
We propose visual validation, as an approach for the graphical inspection of (Q)SAR model validation results. The approach applies the 3D viewer CheS-Mapper, an open-source application for the exploration of small molecules in virtual 3D space. The present work describes the new functionalities in CheS-Mapper 2.0, that facilitate the analysis of (Q)SAR information and allows the visual validation of (Q)SAR models. The tool enables the comparison of model predictions to the actual activity in feature space. The approach is generic: It is model-independent and can handle physico-chemical and structural input features as well as quantitative and qualitative endpoints.
Conclusions
Visual validation with CheS-Mapper enables analyzing (Q)SAR information in the data and indicates how this information is employed by the (Q)SAR model. It reveals, if the endpoint is modeled too specific or too generic and highlights common properties of misclassified compounds. Moreover, the researcher can use CheS-Mapper to inspect how the (Q)SAR model predicts activity cliffs. The CheS-Mapper software is freely available at http://ches-mapper.org.
Graphical abstract
Comparing actual and predicted activity values with CheS-Mapper.},
  keywords = {cheminformatics, data mining, graph mining, validation, visualization},
  pubstate = {published},
  tppubtype = {article}
}
Sound statistical validation is important to evaluate and compare the overall performance of (Q)SAR models. However, classical validation does not support the user in better understanding the properties of the model or the underlying data. Even though, a number of visualization tools for analyzing (Q)SAR information in small molecule datasets exist, integrated visualization methods that allow the investigation of model validation results are still lacking.
Results
We propose visual validation, as an approach for the graphical inspection of (Q)SAR model validation results. The approach applies the 3D viewer CheS-Mapper, an open-source application for the exploration of small molecules in virtual 3D space. The present work describes the new functionalities in CheS-Mapper 2.0, that facilitate the analysis of (Q)SAR information and allows the visual validation of (Q)SAR models. The tool enables the comparison of model predictions to the actual activity in feature space. The approach is generic: It is model-independent and can handle physico-chemical and structural input features as well as quantitative and qualitative endpoints.
Conclusions
Visual validation with CheS-Mapper enables analyzing (Q)SAR information in the data and indicates how this information is employed by the (Q)SAR model. It reveals, if the endpoint is modeled too specific or too generic and highlights common properties of misclassified compounds. Moreover, the researcher can use CheS-Mapper to inspect how the (Q)SAR model predicts activity cliffs. The CheS-Mapper software is freely available at http://ches-mapper.org.
Graphical abstract
Comparing actual and predicted activity values with CheS-Mapper.
2012
Seeland, Madeleine; Karwath, Andreas; Kramer, Stefan
A structural cluster kernel for learning on graphs Conference
The 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD 2012, ACM, New York, NY, USA, 2012, ISBN: 978-1-4503-1462-6.
Abstract | Links | BibTeX | Tags: cheminformatics, clustering, data mining, kernels, QSAR, support vector machines
% seeland2012: paper in the KDD 2012 proceedings (ACM).
% Inherits any missing fields from the crossref parent DBLP:conf/kdd/2012,
% which must be defined elsewhere in the bibliography.
@inproceedings{seeland2012,
  title = {A structural cluster kernel for learning on graphs},
  author = {Madeleine Seeland and Andreas Karwath and Stefan Kramer},
  url = {http://doi.acm.org/10.1145/2339530.2339614},
  doi = {10.1145/2339530.2339614},
  isbn = {978-1-4503-1462-6},
  year = {2012},
  date = {2012-08-12},
  booktitle = {The 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD 2012},
  pages = {516--524},
  publisher = {ACM},
  address = {New York, NY, USA},
  organization = {ACM},
  crossref = {DBLP:conf/kdd/2012},
  abstract = {In recent years, graph kernels have received considerable interest within the machine learning and data mining community. Here, we introduce a novel approach enabling kernel methods to utilize additional information hidden in the structural neighborhood of the graphs under consideration. Our novel structural cluster kernel (SCK) incorporates similarities induced by a structural clustering algorithm to improve state-of-the-art graph kernels. The approach taken is based on the idea that graph similarity can not only be described by the similarity between the graphs themselves, but also by the similarity they possess with respect to their structural neighborhood. We applied our novel kernel in a supervised and a semi-supervised setting to regression and classification problems on a number of real-world datasets of molecular graphs.
Our results show that the structural cluster similarity information can indeed leverage the prediction performance of the base kernel, particularly when the dataset is structurally sparse and consequently structurally diverse. By additionally taking into account a large number of unlabeled instances the performance of the structural cluster kernel can further be improved.},
  keywords = {cheminformatics, clustering, data mining, kernels, QSAR, support vector machines},
  pubstate = {published},
  tppubtype = {conference}
}
Our results show that the structural cluster similarity information can indeed leverage the prediction performance of the base kernel, particularly when the dataset is structurally sparse and consequently structurally diverse. By additionally taking into account a large number of unlabeled instances the performance of the structural cluster kernel can further be improved.
2010
Hardy, Barry J.; Douglas, Nicki; Helma, Christoph; Rautenberg, Micha; Jeliazkova, Nina; Jeliazkov, Vedrin; Nikolova, Ivelina; Benigni, Romualdo; Tcheremenskaia, Olga; Kramer, Stefan; Girschick, Tobias; Buchwald, Fabian; Wicker, Jörg; Karwath, Andreas; Gütlein, Martin; Maunz, Andreas; Sarimveis, Haralambos; Melagraki, Georgia; Afantitis, Antreas; Sopasakis, Pantelis; Gallagher, David; Poroikov, Vladimir; Filimonov, Dmitry; Zakharov, Alexey V.; Lagunin, Alexey; Gloriozova, Tatyana; Novikov, Sergey; Skvortsova, Natalia; Druzhilovsky, Dmitry; Chawla, Sunil; Ghosh, Indira; Ray, Surajit; Patel, Hitesh; Escher, Sylvia
Collaborative development of predictive toxicology applications Journal Article
In: J. Cheminformatics, vol. 2, pp. 7, 2010.
Abstract | Links | BibTeX | Tags: crossvalidation, data mining, QSAR, scientific knowledge, validation
% hardy2010: journal article (OpenTox framework); the pages field holds the
% article number (7). The journal name is stored in full, not abbreviated.
@article{hardy2010,
  title = {Collaborative development of predictive toxicology applications},
  author = {Barry J. Hardy and Nicki Douglas and Christoph Helma and Micha Rautenberg and Nina Jeliazkova and Vedrin Jeliazkov and Ivelina Nikolova and Romualdo Benigni and Olga Tcheremenskaia and Stefan Kramer and Tobias Girschick and Fabian Buchwald and Jörg Wicker and Andreas Karwath and Martin Gütlein and Andreas Maunz and Haralambos Sarimveis and Georgia Melagraki and Antreas Afantitis and Pantelis Sopasakis and David Gallagher and Vladimir Poroikov and Dmitry Filimonov and Alexey V. Zakharov and Alexey Lagunin and Tatyana Gloriozova and Sergey Novikov and Natalia Skvortsova and Dmitry Druzhilovsky and Sunil Chawla and Indira Ghosh and Surajit Ray and Hitesh Patel and Sylvia Escher},
  url = {http://dx.doi.org/10.1186/1758-2946-2-7},
  doi = {10.1186/1758-2946-2-7},
  year = {2010},
  date = {2010-08-31},
  urldate = {2010-08-31},
  journal = {Journal of Cheminformatics},
  volume = {2},
  pages = {7},
  abstract = {OpenTox provides an interoperable, standards-based Framework for the support of predictive toxicology data management, algorithms, modelling, validation and reporting. It is relevant to satisfying the chemical safety assessment requirements of the REACH legislation as it supports access to experimental data, (Quantitative) Structure-Activity Relationship models, and toxicological information through an integrating platform that adheres to regulatory requirements and OECD validation principles. Initial research defined the essential components of the Framework including the approach to data access, schema and management, use of controlled vocabularies and ontologies, architecture, web service and communications protocols, and selection and integration of algorithms for predictive modelling. OpenTox provides end-user oriented tools to non-computational specialists, risk assessors, and toxicological experts in addition to Application Programming Interfaces (APIs) for developers of new applications. OpenTox actively supports public standards for data representation, interfaces, vocabularies and ontologies, Open Source approaches to core platform components, and community-based collaboration approaches, so as to progress system interoperability goals.
The OpenTox Framework includes APIs and services for compounds, datasets, features, algorithms, models, ontologies, tasks, validation, and reporting which may be combined into multiple applications satisfying a variety of different user needs. OpenTox applications are based on a set of distributed, interoperable OpenTox API-compliant REST web services. The OpenTox approach to ontology allows for efficient mapping of complementary data coming from different datasets into a unifying structure having a shared terminology and representation.
Two initial OpenTox applications are presented as an illustration of the potential impact of OpenTox for high-quality and consistent structure-activity relationship modelling of REACH-relevant endpoints: ToxPredict which predicts and reports on toxicities for endpoints for an input chemical structure, and ToxCreate which builds and validates a predictive toxicity model based on an input toxicology dataset. Because of the extensible nature of the standardised Framework design, barriers of interoperability between applications and content are removed, as the user may combine data, models and validation from multiple sources in a dependable and time-effective way.},
  keywords = {crossvalidation, data mining, QSAR, scientific knowledge, validation},
  pubstate = {published},
  tppubtype = {article}
}
The OpenTox Framework includes APIs and services for compounds, datasets, features, algorithms, models, ontologies, tasks, validation, and reporting which may be combined into multiple applications satisfying a variety of different user needs. OpenTox applications are based on a set of distributed, interoperable OpenTox API-compliant REST web services. The OpenTox approach to ontology allows for efficient mapping of complementary data coming from different datasets into a unifying structure having a shared terminology and representation.
Two initial OpenTox applications are presented as an illustration of the potential impact of OpenTox for high-quality and consistent structure-activity relationship modelling of REACH-relevant endpoints: ToxPredict which predicts and reports on toxicities for endpoints for an input chemical structure, and ToxCreate which builds and validates a predictive toxicity model based on an input toxicology dataset. Because of the extensible nature of the standardised Framework design, barriers of interoperability between applications and content are removed, as the user may combine data, models and validation from multiple sources in a dependable and time-effective way.
2007
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Logic and the Automatic Acquisition of Scientific Knowledge: An Application to Functional Genomics Conference
Computational Discovery of Scientific Knowledge, Introduction, Techniques, and Applications in Environmental and Life Sciences, vol. 4660, Lecture Notes in Computer Science, Springer-Verlag, Berlin Heidelberg, Germany, 2007, ISBN: 978-3-540-73919-7.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge
% king2007: contribution to an LNCS volume (vol. 4660).
% Inherits any missing fields from the crossref parent DBLP:conf/dis/2007book,
% which must be defined elsewhere in the bibliography.
@inproceedings{king2007,
  title = {Logic and the Automatic Acquisition of Scientific Knowledge: An Application to Functional Genomics},
  author = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  url = {http://dx.doi.org/10.1007/978-3-540-73920-3_13},
  doi = {10.1007/978-3-540-73920-3_13},
  isbn = {978-3-540-73919-7},
  year = {2007},
  date = {2007-01-01},
  booktitle = {Computational Discovery of Scientific Knowledge, Introduction, Techniques, and Applications in Environmental and Life Sciences},
  volume = {4660},
  pages = {273--289},
  publisher = {Springer Verlag},
  address = {Berlin Heidelberg, Germany},
  organization = {Springer-Verlag Berlin Heidelberg},
  series = {Lecture Notes in Computer Science},
  crossref = {DBLP:conf/dis/2007book},
  abstract = {This paper is a manifesto aimed at computer scientists interested in developing and applying scientific discovery methods. It argues that: science is experiencing an unprecedented “explosion” in the amount of available data; traditional data analysis methods cannot deal with this increased quantity of data; there is an urgent need to automate the process of refining scientific data into scientific knowledge; inductive logic programming (ILP) is a data analysis framework well suited for this task; and exciting new scientific discoveries can be achieved using ILP scientific discovery methods. We describe an example of using ILP to analyse a large and complex bioinformatic database that has produced unexpected and interesting scientific results in functional genomics. We then point a possible way forward to integrating machine learning with scientific databases to form intelligent databases.},
  keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {conference}
}
2006
Clare, Amanda; Karwath, Andreas; Ougham, Helen; King, Ross D.
Functional bioinformatics for Arabidopsis thaliana Journal Article
In: Bioinformatics, vol. 22, no. 9, pp. 1130-1136, 2006.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge
% karwath06a: journal article in Bioinformatics 22(9).
% The genus name in the title is braced so sentence-casing styles keep its
% capital letter.
@article{karwath06a,
  title = {Functional bioinformatics for {Arabidopsis} thaliana},
  author = {Amanda Clare and Andreas Karwath and Helen Ougham and Ross D. King},
  url = {https://bioinformatics.oxfordjournals.org/content/22/9/1130.full.pdf+html},
  doi = {10.1093/bioinformatics/btl051},
  year = {2006},
  date = {2006-01-01},
  journal = {Bioinformatics},
  volume = {22},
  number = {9},
  pages = {1130--1136},
  abstract = {Motivation: The genome of Arabidopsis thaliana, which has the best understood plant genome, still has approximately one-third of its genes with no functional annotation at all from either MIPS or TAIR. We have applied our Data Mining Prediction (DMP) method to the problem of predicting the functional classes of these protein sequences. This method is based on using a hybrid machine-learning/data-mining method to identify patterns in the bioinformatic data about sequences that are predictive of function. We use data about sequence, predicted secondary structure, predicted structural domain, InterPro patterns, sequence similarity profile and expressions data.
Results: We predicted the functional class of a high percentage of the Arabidopsis genes with currently unknown function. These predictions are interpretable and have good test accuracies. We describe in detail seven of the rules produced.
Availability: Rulesets are available at http://www.aber.ac.uk/compsci/Research/bio/dss/arabpreds/ and predictions are available at http://www.genepredictions.org
Contact:afc@aber.ac.uk},
  keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {article}
}
Results: We predicted the functional class of a high percentage of the Arabidopsis genes with currently unknown function. These predictions are interpretable and have good test accuracies. We describe in detail seven of the rules produced.
Availability: Rulesets are available at http://www.aber.ac.uk/compsci/Research/bio/dss/arabpreds/ and predictions are available at http://www.genepredictions.org
Contact:afc@aber.ac.uk
Backofen, Rolf; Borrmann, Hans-Gunther; Deck, Werner; Dedner, Andreas; De Raedt, Luc; Desch, Klaus; Diesmann, Markus; Geier, Martin; Greiner, Andreas; Hess, Wolfgang R.; Honerkamp, Josef; Jankowski, Stefan; Krossing, Ingo; Liehr, Andreas W.; Karwath, Andreas; Klöfkorn, Robert; Pesché, Raphaël; Potjans, Tobias C.; Röttger, Michael C.; Schmidt-Thieme, Lars; Schneider, Gerhard; Voß, Björn; Wiebelt, Bernd; Wienemann, Peter; Winterer, Volker-Henning
A Bottom-up approach to Grid-Computing at a University: the Black-Forest-Grid Initiative Journal Article
In: Praxis der Informationsverarbeitung und Kommunikation, vol. 29, no. 2, pp. 81-87, 2006.
Abstract | Links | BibTeX | Tags: data mining, HPC
% backofen2006: journal article in Praxis der Informationsverarbeitung und
% Kommunikation 29(2). The initiative's proper name is braced in the title so
% sentence-casing styles do not lowercase it.
@article{backofen2006,
  title = {A Bottom-up approach to Grid-Computing at a University: the {Black-Forest-Grid Initiative}},
  author = {Rolf Backofen and Hans-Gunther Borrmann and Werner Deck and Andreas Dedner and De Raedt, Luc and Klaus Desch and Markus Diesmann and Martin Geier and Andreas Greiner and Wolfgang R. Hess and Josef Honerkamp and Stefan Jankowski and Ingo Krossing and Andreas W. Liehr and Andreas Karwath and Robert Klöfkorn and Raphaël Pesché and Tobias C. Potjans and Michael C. Röttger and Lars Schmidt-Thieme and Gerhard Schneider and Björn Voß and Bernd Wiebelt and Peter Wienemann and Volker-Henning Winterer},
  url = {http://dx.doi.org/10.1515/PIKO.2006.81},
  doi = {10.1515/PIKO.2006.81},
  year = {2006},
  date = {2006-01-01},
  journal = {Praxis der Informationsverarbeitung und Kommunikation},
  volume = {29},
  number = {2},
  pages = {81--87},
  abstract = {Recent years have seen a rapid increase in the need for high-performance computing. These demands come from disciplines such as particle physics traditionally relying on High Performance Computing (HPC) but lately also from the various branches of life science that have matured into quantitative disciplines. The classical infrastructure of university computer centres results to be unsuited to cope with the new requirements for a multitude of reasons. Here we discuss the causes of this failure and present a solution developed at the University of Freiburg in a collaborative effort of several faculties. We demonstrate that using state of the art grid computing technology the problem can now be addressed in a bottom-up approach. The organizational, technical, and financial components of our framework, the Black Forest Grid Initiative (BFG) are described and results of its implementation are presented. In the process, a number of new questions have emerged which the next phase of our project needs to address.},
  keywords = {data mining, HPC},
  pubstate = {published},
  tppubtype = {article}
}
2005
Stolle, Christian; Karwath, Andreas; De Raedt, Luc
CLASSIC'CL: an integrated ILP system Conference
Proc. 8th International Conference of Discovery Science, DS 2005, vol. 3735, Lecture Notes in Artificial Intelligence, Springer, 2005, ISBN: 978-3-540-29230-2, (Conference).
Abstract | Links | BibTeX | Tags: data mining, machine learning, relational learning
% karwath05a: paper in the Discovery Science 2005 proceedings (LNAI 3735).
% The url uses a literal "/" instead of "%2F" because "%" starts a comment
% when the field is typeset by LaTeX; system name and acronym are braced.
@inproceedings{karwath05a,
  title = {{CLASSIC'CL}: an integrated {ILP} system},
  author = {Christian Stolle and Andreas Karwath and De Raedt, Luc},
  url = {http://link.springer.com/chapter/10.1007/11563983_31},
  doi = {10.1007/11563983_31},
  isbn = {978-3-540-29230-2},
  year = {2005},
  date = {2005-10-08},
  booktitle = {Proc. 8th International Conference of Discovery Science, DS 2005},
  volume = {3735},
  pages = {354--362},
  publisher = {Springer},
  series = {Lecture Notes in Artificial Intelligence},
  abstract = {A novel inductive logic programming system, called Classic’cl is presented. Classic’cl integrates several settings for learning, in particular learning from interpretations and learning from satisfiability. Within these settings, it addresses descriptive and probabilistic modeling tasks. As such, Classic’cl (C-armr, cLAudien, icl-S(S)at, ICl, and CLlpad) integrates several well-known inductive logic programming systems such as Claudien, Warmr (and its extension C-armr), ICL, ICL-SAT, and LLPAD. We report on the implementation, the integration issues as well as on some experiments that compare Classic’cl with some of its predecessors.},
  note = {Conference},
  keywords = {data mining, machine learning, relational learning},
  pubstate = {published},
  tppubtype = {conference}
}
2002
Karwath, Andreas; King, Ross D.
Homology Induction: the use of machine learning to improve sequence similarity searches Journal Article
In: BMC Bioinformatics, vol. 3, no. 1, 2002.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning
% karwath02a: journal article in BMC Bioinformatics 3(1).
% pages holds the article number, taken from the DOI suffix "3-11"
% (NOTE(review): confirm against the publisher record).
@article{karwath02a,
  title = {{Homology Induction}: the use of machine learning to improve sequence similarity searches},
  author = {Andreas Karwath and Ross D. King},
  url = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-3-11},
  doi = {10.1186/1471-2105-3-11},
  year = {2002},
  date = {2002-04-23},
  journal = {BMC Bioinformatics},
  volume = {3},
  number = {1},
  pages = {11},
  abstract = {Background
The inference of homology between proteins is a key problem in molecular biology. The current best approaches only identify ~50% of homologies (with a false positive rate set at 1/1000).
Results
We present Homology Induction (HI), a new approach to inferring homology. HI uses machine learning to bootstrap from standard sequence similarity search methods. First a standard method is run, then HI learns rules which are true for sequences of high similarity to the target (assumed homologues) and not true for general sequences, these rules are then used to discriminate sequences in the twilight zone. To learn the rules HI describes the sequences in a novel way based on a bioinformatic knowledge base, and the machine learning method of inductive logic programming. To evaluate HI we used the PDB40D benchmark which lists sequences of known homology but low sequence similarity. We compared the HI methodology with PSI-BLAST alone and found HI performed significantly better. In addition, Receiver Operating Characteristic (ROC) curve analysis showed that these improvements were robust for all reasonable error costs. The predictive homology rules learnt by HI can be interpreted biologically to provide insight into conserved features of homologous protein families.
Conclusions
HI is a new technique for the detection of remote protein homology – a central bioinformatic problem. HI with PSI-BLAST is shown to outperform PSI-BLAST for all error costs. It is expected that similar improvements would be obtained using HI with any sequence similarity method.
},
  keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning},
  pubstate = {published},
  tppubtype = {article}
}
The inference of homology between proteins is a key problem in molecular biology. The current best approaches only identify ~50% of homologies (with a false positive rate set at 1/1000).
Results
We present Homology Induction (HI), a new approach to inferring homology. HI uses machine learning to bootstrap from standard sequence similarity search methods. First a standard method is run, then HI learns rules which are true for sequences of high similarity to the target (assumed homologues) and not true for general sequences, these rules are then used to discriminate sequences in the twilight zone. To learn the rules HI describes the sequences in a novel way based on a bioinformatic knowledge base, and the machine learning method of inductive logic programming. To evaluate HI we used the PDB40D benchmark which lists sequences of known homology but low sequence similarity. We compared the HI methodology with PSI-BLAST alone and found HI performed significantly better. In addition, Receiver Operating Characteristic (ROC) curve analysis showed that these improvements were robust for all reasonable error costs. The predictive homology rules learnt by HI can be interpreted biologically to provide insight into conserved features of homologous protein families.
Conclusions
HI is a new technique for the detection of remote protein homology – a central bioinformatic problem. HI with PSI-BLAST is shown to outperform PSI-BLAST for all error costs. It is expected that similar improvements would be obtained using HI with any sequence similarity method.
Karwath, Andreas
Large Logical Databases and their Applications to Molecular Biology PhD Thesis
University of Wales, Aberystwyth, 2002.
BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge
% karwath02b: PhD thesis, University of Wales, Aberystwyth (2002).
@phdthesis{karwath02b,
  title = {Large Logical Databases and their Applications to Molecular Biology},
  author = {Andreas Karwath},
  year = {2002},
  date = {2002-01-01},
  school = {University of Wales, Aberystwyth},
  keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning, scientific knowledge},
  pubstate = {published},
  tppubtype = {phdthesis}
}
2001
Karwath, Andreas; King, Ross D.
An automated ILP server in the field of bioinformatics Conference
The Eleventh International Conference on Inductive Logic Programming, ILP 2001, vol. 2157, Lecture Notes in Computer Science, Springer-Verlag, Berlin Heidelberg, Germany, 2001, ISBN: 978-3-540-42538-0.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, machine learning, relational learning
@conference{Karwath2001,
title = {An automated {ILP} server in the field of bioinformatics},
author = {Andreas Karwath and Ross D. King},
editor = {C{\'e}line Rouveirol and Mich{\`e}le Sebag},
url = {http://link.springer.com/chapter/10.1007%2F3-540-44797-0_8},
doi = {10.1007/3-540-44797-0_8},
isbn = {978-3-540-42538-0},
year = {2001},
date = {2001-09-09},
booktitle = {The Eleventh International Conference on Inductive Logic Programming, ILP 2001},
volume = {2157},
pages = {91--103},
publisher = {Springer Verlag},
address = {Berlin Heidelberg, Germany},
organization = {Springer-Verlag Berlin Heidelberg},
series = {Lecture Notes in Computer Science},
abstract = {The identification of evolutionary related (homologous) proteins is a key problem in molecular biology. Here we present an inductive logic programming based method, Homology Induction (HI), which acts as a filter for existing sequence similarity searches to improve their performance in the detection of remote protein homologies. HI performs a PSI-BLAST search to generate positive, negative, and uncertain examples, and collects descriptions of these examples. It then learns rules to discriminate the positive and negative examples. The rules are used to filter the uncertain examples in the “twilight zone”. HI uses a multitable database of 51,430,710 pre-fabricated facts from a variety of biological sources, and the inductive logic programming system Aleph to induce rules. HI was tested on an independent set of protein sequences with equal or less than 40 per cent sequence similarity (PDB40D). ROC analysis is performed showing that HI can significantly improve existing similarity searches. The method is automated and can be used via a web/mail interface.},
keywords = {bioinformatics, data mining, inductive logic programming, machine learning, relational learning},
pubstate = {published},
tppubtype = {conference}
}
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
The utility of different representations of protein sequence for predicting functional class Journal Article
In: Bioinformatics, vol. 17, no. 5, pp. 445-454, 2001.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge
@article{King2001a,
title = {The utility of different representations of protein sequence for predicting functional class},
author = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
url = {https://bioinformatics.oxfordjournals.org/content/17/5/445},
doi = {10.1093/bioinformatics/17.5.445},
year = {2001},
date = {2001-01-19},
journal = {Bioinformatics},
volume = {17},
number = {5},
pages = {445--454},
abstract = {Motivation: Data Mining Prediction (DMP) is a novel approach to predicting protein functional class from sequence. DMP works even in the absence of a homologous protein of known function. We investigate the utility of different ways of representing protein sequence in DMP (residue frequencies, phylogeny, predicted structure) using the Escherichia coli genome as a model.
Results: Using the different representations DMP learnt prediction rules that were more accurate than default at every level of function using every type of representation. The most effective way to represent sequence was using phylogeny (75% accuracy and 13% coverage of unassigned ORFs at the most general level of function: 69% accuracy and 7% coverage at the most detailed). We tested different methods for combining predictions from the different types of representation. These improved both the accuracy and coverage of predictions, e.g. 40% of all unassigned ORFs could be predicted at an estimated accuracy of 60% and 5% of unassigned ORFs could be predicted at an estimated accuracy of 86%.
Availability: The rules and data are freely available. Warmr is free to academics.
Contact: rdk@aber.ac.uk},
keywords = {bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge},
pubstate = {published},
tppubtype = {article}
}
Results: Using the different representations DMP learnt prediction rules that were more accurate than default at every level of function using every type of representation. The most effective way to represent sequence was using phylogeny (75% accuracy and 13% coverage of unassigned ORFs at the most general level of function: 69% accuracy and 7% coverage at the most detailed). We tested different methods for combining predictions from the different types of representation. These improved both the accuracy and coverage of predictions, e.g. 40% of all unassigned ORFs could be predicted at an estimated accuracy of 60% and 5% of unassigned ORFs could be predicted at an estimated accuracy of 86%.
Availability: The rules and data are freely available. Warmr is free to academics.
Contact: rdk@aber.ac.uk
2000
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Accurate prediction of protein functional class from sequence in the Mycobacterium tuberculosis and Escherichia coli genomes using data mining. Journal Article
In: Yeast (Comparative and Functional Genomics), vol. 17, pp. 283-293, 2000.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge
@article{king00a,
title = {Accurate prediction of protein functional class from sequence in the {Mycobacterium} tuberculosis and {Escherichia} coli genomes using data mining},
author = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
url = {http://onlinelibrary.wiley.com/doi/10.1002/1097-0061(200012)17:4%3C283::AID-YEA52%3E3.0.CO;2-F/abstract},
doi = {10.1002/1097-0061(200012)17:4<283::AID-YEA52>3.0.CO;2-F},
year = {2000},
date = {2000-12-08},
journal = {Yeast (Comparative and Functional Genomics)},
volume = {17},
number = {4},
pages = {283--293},
abstract = {The analysis of genomics data needs to become as automated as its generation. Here we present a novel data-mining approach to predicting protein functional class from sequence. This method is based on a combination of inductive logic programming clustering and rule learning. We demonstrate the effectiveness of this approach on the M. tuberculosis and E. coli genomes, and identify biologically interpretable rules which predict protein functional class from information only available from the sequence. These rules predict 65% of the ORFs with no assigned function in M. tuberculosis and 24% of those in E. coli, with an estimated accuracy of 60–80% (depending on the level of functional assignment). The rules are founded on a combination of detection of remote homology, convergent evolution and horizontal gene transfer. We identify rules that predict protein functional class even in the absence of detectable sequence or structural homology. These rules give insight into the evolutionary history of M. tuberculosis and E. coli.},
keywords = {bioinformatics, data mining, inductive logic programming, relational learning, scientific knowledge},
pubstate = {published},
tppubtype = {article}
}
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Logic and the Automatic Acquisition of Scientific Knowledge Journal Article
In: EACIS (Electronic Articles in Computer and Information Science), vol. 5, no. 031, 2000.
Abstract | Links | BibTeX | Tags: bioinformatics, data mining, scientific knowledge
@article{King2000c,
  author    = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
  title     = {Logic and the Automatic Acquisition of Scientific Knowledge},
  journal   = {EACIS (Electronic Articles in Computer and Information Science)},
  volume    = {5},
  number    = {031},
  year      = {2000},
  date      = {2000-12-01},
  url       = {http://www.ida.liu.se/ext/epa/cis/mi-17/02/orig.html},
  keywords  = {bioinformatics, data mining, scientific knowledge},
  pubstate  = {published},
  tppubtype = {article},
  abstract  = {This paper is a manifesto. It argues that:
Science is experiencing an unprecedented "explosion" in the amount of available data.
Traditional data analysis methods cannot deal with this increased quantity of data.
There is therefore an urgent need to automate the process of refining scientific data into scientific knowledge.
Inductive logic programming (ILP) is the data analysis framework best suited for this task.
We describe an example of using ILP to analyse a large and complex bioinformatic database which produced unexpected and interesting scientific results. We then point a possible way forward to integrating machine learning with scientific databases to form intelligent inductive databases.}
}
Science is experiencing an unprecedented "explosion" in the amount of available data.
Traditional data analysis methods cannot deal with this increased quantity of data.
There is therefore an urgent need to automate the process of refining scientific data into scientific knowledge.
Inductive logic programming (ILP) is the data analysis framework best suited for this task.
We describe an example of using ILP to analyse a large and complex bioinformatic database which produced unexpected and interesting scientific results. We then point a possible way forward to integrating machine learning with scientific databases to form intelligent inductive databases.
King, Ross D.; Karwath, Andreas; Clare, Amanda; Dehaspe, Luc
Genome scale prediction of protein functional class from sequence using data mining Conference
The Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD 2000, The Association for Computing Machinery, New York, USA, 2000.
Links | BibTeX | Tags: bioinformatics, data mining, inductive logic programming, relational learning
@conference{King2000,
title = {Genome scale prediction of protein functional class from sequence using data mining},
author = {Ross D. King and Andreas Karwath and Amanda Clare and Luc Dehaspe},
editor = {Raghu Ramakrishnan and S. Stolfo and R. Bayardo and I. Parsa},
url = {http://doi.acm.org/10.1145/347090.347172},
doi = {10.1145/347090.347172},
year = {2000},
date = {2000-01-01},
booktitle = {The Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD 2000},
pages = {384--389},
publisher = {The Association for Computing Machinery},
address = {New York, USA},
keywords = {bioinformatics, data mining, inductive logic programming, relational learning},
pubstate = {published},
tppubtype = {conference}
}