Code Examples
A repository of 155 code examples for BeepBeep
KmeansSymbolDistribution.java
1 /*
2  BeepBeep, an event stream processor
3  Copyright (C) 2008-2017 Sylvain Hallé
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as published
7  by the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17  */
18 package mining.extraction;
19 
20 import static ca.uqac.lif.cep.Connector.INPUT;
21 import static ca.uqac.lif.cep.Connector.OUTPUT;
22 
23 import java.util.Set;
24 
25 import ca.uqac.lif.cep.Connector;
26 import ca.uqac.lif.cep.GroupProcessor;
27 import ca.uqac.lif.cep.functions.StreamVariable;
28 import ca.uqac.lif.cep.functions.CumulativeFunction;
29 import ca.uqac.lif.cep.functions.Cumulate;
30 import ca.uqac.lif.cep.functions.FunctionException;
31 import ca.uqac.lif.cep.functions.ApplyFunction;
32 import ca.uqac.lif.cep.functions.FunctionTree;
33 import ca.uqac.lif.cep.functions.IdentityFunction;
34 import ca.uqac.lif.cep.functions.TurnInto;
35 import ca.uqac.lif.cep.peg.MapDistance.ToValueArray;
36 import ca.uqac.lif.cep.peg.Normalize;
37 import ca.uqac.lif.cep.peg.Sequence;
38 import ca.uqac.lif.cep.peg.ml.KMeansFunction;
39 import ca.uqac.lif.cep.peg.ml.ProcessorMiningFunction;
40 import ca.uqac.lif.cep.tmf.Slice;
41 import ca.uqac.lif.cep.util.Numbers;
42 import mining.SequenceReader;
43 
44 /**
45  * Create clusters over the distribution of symbols in a set of input
46  * streams.
47  * <p>
48  * In this example, input traces are made of symbols <tt>a</tt> and
49  * <tt>b</tt>. A set of seven such traces is located in the file
50  * <tt>strings-1.csv</tt>. For each of these traces, the pattern
51  * processor &beta; computes a <em>feature vector</em> made of two numbers,
52  * corresponding to the fraction of <tt>a</tt>'s and <tt>b</tt>'s in
53  * the trace. This is done by
54  * <ul>
55  * <li>slicing the input trace according to the current symbol</li>
56  * <li>counting the events in each slice</li>
57  * <li>taking the values of the resulting map into a list, and normalizing
58  * them (so that their sum is equal to 1)</li>
59  * </ul>
60  * For example, on the input sequence
61  * <pre>
62  * a, b, a, a, b, b
63  * </pre>
64  * the resulting feature vector would be (0.5, 0.5).
65  * <p>
66  * We then use the K-means clustering algorithm to find the centroids of
67  * two clusters based on those feature vectors.
68  * <p>
69  * The processor mining function is therefore parameterized as follows:
70  * <p>
71  * <table>
72  * <tr><th>Parameter</th><th>Value</th></tr>
73  * <tr>
74  * <td><img src="./doc-files/mining/trenddistance/BetaProcessor.png" alt="Processor graph"></td>
75  * <td><img src="./doc-files/mining/extraction/SymbolDistributionDoublePoint.png" alt="Processor graph"></td>
76  * </tr>
77  * <tr>
78  * <td><img src="./doc-files/mining/extraction/AlphaProcessor.png" alt="Processor graph"></td>
79  * <td><img src="./doc-files/mining/extraction/Kmeans-Function.png" alt="Processor graph"> (K-means with K=2)</td>
80  * </tr>
81  * </table>
82  * <p>
83  * The traces in the input CSV file either have an approximate 30%-70%
84  * distribution of <tt>a</tt>'s and <tt>b</tt>'s, or the reverse. The
85  * feature vectors can be plotted as follows, with each dot representing
86  * the a-b distribution of a single trace.
87  * <p>
88  * <img src="./doc-files/mining/extraction/ClusteringAB.png" alt="Clustering graph">
89  * <p>
90  * Applying the k-means algorithm, with k=2, will compute two cluster centers,
91  * represented by crosses in the above plot.
92  *
93  * @author Sylvain Hallé
94  *
95  */
97 {
98  public static void main(String[] args) throws FunctionException
99  {
100  /* First, we must get from somewhere a set of sequences. For the sake
101  * of this example, we just create a few dummy sequences of numbers
102  * from the contents of a file. */
103  Set<Sequence<String>> sequences = SequenceReader.readStringSequences("strings-1.csv");
104 
105  /* We then create a processor that computes the feature vector
106  * from an input trace. */
107  GroupProcessor vector = new GroupProcessor(1, 1);
108  {
109  GroupProcessor counter = new GroupProcessor(1, 1);
110  {
111  TurnInto one = new TurnInto(1);
112  counter.associateInput(INPUT, one, INPUT);
113  Cumulate sum_one = new Cumulate(new CumulativeFunction<Number>(Numbers.addition));
114  Connector.connect(one, sum_one);
115  counter.associateOutput(OUTPUT, sum_one, OUTPUT);
116  counter.addProcessors(one, sum_one);
117  }
118  Slice slicer = new Slice(new IdentityFunction(1), counter);
119  ApplyFunction to_normalized_vector = new ApplyFunction(
120  new FunctionTree(Normalize.instance,
121  new FunctionTree(ToValueArray.instance, StreamVariable.X)));
122  Connector.connect(slicer, to_normalized_vector);
123  vector.associateInput(INPUT, slicer, INPUT);
124  vector.associateOutput(OUTPUT, to_normalized_vector, OUTPUT);
125  vector.addProcessors(slicer, to_normalized_vector);
126  }
127 
128  /* Finally, we instantiate a processor mining function, using the
129  * pattern processor defined above, and the K-means clustering
130  * algorithm as the aggregate function. */
131  ProcessorMiningFunction<String, Set<?>> pmf = new ProcessorMiningFunction<String,Set<?>>(vector, new ApplyFunction(new KMeansFunction(2)));
132 
133  /* Let us see the clusters computed by this mining function on the set
134  * of input sequences. There should be two centroids, one roughly
135  * corresponding to a 30-70 distribution of a's vs. b's, and the other
136  * one corresponding to a 70-30 distribution. */
137  Set<?> centroids = (Set<?>) pmf.mine(sequences);
138  System.out.println(centroids);
139  }
140 }
Extract patterns from input streams using data mining and statistical algorithms. ...
Utility class that creates a set of sequences from a file.
Create clusters over the distribution of symbols in a set of input streams.