2018
Geilke, Michael; Karwath, Andreas; Frank, Eibe; Kramer, Stefan
Online estimation of discrete, continuous, and conditional joint densities using classifier chains Journal Article
In: Data Mining and Knowledge Discovery, vol. 32, no. 3, pp. 561–603, 2018, ISSN: 1384-5810.
Abstract | Links | BibTeX | Tags: artificial intelligence, data mining, density estimation, machine learning, stream mining
@article{geilke2018a,
  author    = {Geilke, Michael and Karwath, Andreas and Frank, Eibe and Kramer, Stefan},
  title     = {Online estimation of discrete, continuous, and conditional joint densities using classifier chains},
  journal   = {Data Mining and Knowledge Discovery},
  volume    = {32},
  number    = {3},
  pages     = {561--603},
  publisher = {Springer US},
  year      = {2018},
  date      = {2018-05-01},
  doi       = {10.1007/s10618-017-0546-6},
  issn      = {1384-5810},
  url       = {https://doi.org/10.1007/s10618-017-0546-6},
  urldate   = {2018-05-01},
  abstract  = {We address the problem of estimating discrete, continuous, and conditional joint densities online, i.e., the algorithm is only provided the current example and its current estimate for its update. The family of proposed online density estimators, estimation of densities online (EDO), uses classifier chains to model dependencies among features, where each classifier in the chain estimates the probability of one particular feature. Because a single chain may not provide a reliable estimate, we also consider ensembles of classifier chains and ensembles of weighted classifier chains. For all density estimators, we provide consistency proofs and propose algorithms to perform certain inference tasks. The empirical evaluation of the estimators is conducted in several experiments and on datasets of up to several millions of instances. In the discrete case, we compare our estimators to density estimates computed by Bayesian structure learners. In the continuous case, we compare them to a state-of-the-art online density estimator. Our experiments demonstrate that, even though designed to work online, EDO delivers estimators of competitive accuracy compared to other density estimators (batch Bayesian structure learners on discrete datasets and the state-of-the-art online density estimator on continuous datasets). Besides achieving similar performance in these cases, EDO is also able to estimate densities with mixed types of variables, i.e., discrete and continuous random variables.},
  keywords  = {artificial intelligence, data mining, density estimation, machine learning, stream mining},
  pubstate  = {published},
  tppubtype = {article}
}
2016
Geilke, Michael; Karwath, Andreas; Kramer, Stefan
Online density estimation of heterogeneous data streams in higher dimensions Conference
Machine learning and knowledge discovery in databases : European Conference, ECML PKDD 2016, Riva del Garda, Italy, September 19-23, 2016 : Proceedings Part 1, 2016.
Abstract | Links | BibTeX | Tags: data mining, density estimation, stream mining
@conference{geilke2016,
  author       = {Geilke, Michael and Karwath, Andreas and Kramer, Stefan},
  title        = {Online density estimation of heterogeneous data streams in higher dimensions},
  booktitle    = {Machine learning and knowledge discovery in databases : European Conference, ECML PKDD 2016, Riva del Garda, Italy, September 19-23, 2016 : Proceedings Part 1},
  pages        = {65--80},
  year         = {2016},
  date         = {2016-01-01},
  doi          = {10.1007/978-3-319-46128-1_5},
  url          = {http://link.springer.com/chapter/10.1007/978-3-319-46128-1_5},
  urldate      = {2016-01-01},
  howpublished = {\url{https://publications.UB.Uni-Mainz.DE/opus/frontdoor.php?source_opus=54808}},
  abstract     = {The joint density of a data stream is suitable for performing data mining tasks without having access to the original data. However, the methods proposed so far only target a small to medium number of variables, since their estimates rely on representing all the interdependencies between the variables of the data. High-dimensional data streams, which are becoming more and more frequent due to increasing numbers of interconnected devices, are, therefore, pushing these methods to their limits. To mitigate these limitations, we present an approach that projects the original data stream into a vector space and uses a set of representatives to provide an estimate. Due to the structure of the estimates, it enables the density estimation of higher-dimensional data and approaches the true density with increasing dimensionality of the vector space. Moreover, it is not only designed to estimate homogeneous data, i.e., where all variables are nominal or all variables are numeric, but it can also estimate heterogeneous data. The evaluation is conducted on synthetic and real-world data. The software related to this paper is available at https://github.com/geilke/mideo.},
  keywords     = {data mining, density estimation, stream mining},
  pubstate     = {published},
  tppubtype    = {conference}
}
2015
Geilke, Michael; Karwath, Andreas; Kramer, Stefan
Modeling recurrent distributions in streams using possible worlds Conference
2015 IEEE International Conference on Data Science and Advanced Analytics, DSAA 2015, IEEE, 2015, ISBN: 978-1-4673-8272-4.
Abstract | Links | BibTeX | Tags: density estimation, machine learning, possible worlds, stream mining
@conference{geilke2015,
  author    = {Geilke, Michael and Karwath, Andreas and Kramer, Stefan},
  title     = {Modeling recurrent distributions in streams using possible worlds},
  booktitle = {2015 IEEE International Conference on Data Science and Advanced Analytics, DSAA 2015},
  pages     = {1--9},
  publisher = {IEEE},
  year      = {2015},
  date      = {2015-10-19},
  doi       = {10.1109/DSAA.2015.7344814},
  isbn      = {978-1-4673-8272-4},
  url       = {https://doi.org/10.1109/DSAA.2015.7344814},
  crossref  = {DBLP:conf/dsaa/2015},
  abstract  = {Discovering changes in the data distribution of streams and discovering recurrent data distributions are challenging problems in data mining and machine learning. Both have received a lot of attention in the context of classification. With the ever increasing growth of data, however, there is a high demand of compact and universal representations of data streams that enable the user to analyze current as well as historic data without having access to the raw data. To make a first step towards this direction, we propose a condensed representation that captures the various - possibly recurrent - data distributions of the stream by extending the notion of possible worlds. The representation enables queries concerning the whole stream and can, hence, serve as a tool for supporting decision-making processes or serve as a basis for implementing data mining and machine learning algorithms on top of it. We evaluate this condensed representation on synthetic and real-world data.},
  keywords  = {density estimation, machine learning, possible worlds, stream mining},
  pubstate  = {published},
  tppubtype = {conference}
}
2014
Geilke, Michael; Karwath, Andreas; Kramer, Stefan
A probabilistic condensed representation of data for stream mining Conference
International Conference on Data Science and Advanced Analytics, DSAA 2014, IEEE, 2014.
Abstract | Links | BibTeX | Tags: density estimation, machine learning, stream mining
@conference{geilke2014,
  author    = {Geilke, Michael and Karwath, Andreas and Kramer, Stefan},
  title     = {A probabilistic condensed representation of data for stream mining},
  booktitle = {International Conference on Data Science and Advanced Analytics, DSAA 2014},
  pages     = {297--303},
  publisher = {IEEE},
  year      = {2014},
  date      = {2014-10-30},
  doi       = {10.1109/DSAA.2014.7058088},
  url       = {https://doi.org/10.1109/DSAA.2014.7058088},
  crossref  = {DBLP:conf/dsaa/2014},
  abstract  = {Data mining and machine learning algorithms usually operate directly on the data. However, if the data is not available at once or consists of billions of instances, these algorithms easily become infeasible with respect to memory and run-time concerns. As a solution to this problem, we propose a framework, called MiDEO (Mining Density Estimates inferred Online), in which algorithms are designed to operate on a condensed representation of the data. In particular, we propose to use density estimates, which are able to represent billions of instances in a compact form and can be updated when new instances arrive. As an example for an algorithm that operates on density estimates, we consider the task of mining association rules, which we consider as a form of simple statements about the data. The algorithm, called POEt (Pattern mining on Online density esTimates), is evaluated on synthetic and real-world data and is compared to state-of-the-art algorithms.},
  keywords  = {density estimation, machine learning, stream mining},
  pubstate  = {published},
  tppubtype = {conference}
}
2013
Geilke, Michael; Frank, Eibe; Karwath, Andreas; Kramer, Stefan
Online Estimation of Discrete Densities Conference
IEEE 13th International Conference on Data Mining, ICDM 2013, IEEE, 2013, ISSN: 1550-4786.
Abstract | Links | BibTeX | Tags: density estimation, machine learning, stream mining
@conference{geilke2013,
  author    = {Geilke, Michael and Frank, Eibe and Karwath, Andreas and Kramer, Stefan},
  title     = {Online Estimation of Discrete Densities},
  booktitle = {IEEE 13th International Conference on Data Mining, ICDM 2013},
  pages     = {191--200},
  publisher = {IEEE},
  year      = {2013},
  date      = {2013-12-07},
  doi       = {10.1109/ICDM.2013.91},
  issn      = {1550-4786},
  url       = {https://doi.org/10.1109/ICDM.2013.91},
  crossref  = {DBLP:conf/icdm/2013},
  abstract  = {We address the problem of estimating a discrete joint density online, that is, the algorithm is only provided the current example and its current estimate. The proposed online estimator of discrete densities, EDDO (Estimation of Discrete Densities Online), uses classifier chains to model dependencies among features. Each classifier in the chain estimates the probability of one particular feature. Because a single chain may not provide a reliable estimate, we also consider ensembles of classifier chains and ensembles of weighted classifier chains. For all density estimators, we provide consistency proofs and propose algorithms to perform certain inference tasks. The empirical evaluation of the estimators is conducted in several experiments and on data sets of up to several million instances: We compare them to density estimates computed from Bayesian structure learners, evaluate them under the influence of noise, measure their ability to deal with concept drift, and measure the run-time performance. Our experiments demonstrate that, even though designed to work online, EDDO delivers estimators of competitive accuracy compared to batch Bayesian structure learners and batch variants of EDDO.},
  keywords  = {density estimation, machine learning, stream mining},
  pubstate  = {published},
  tppubtype = {conference}
}