Code Examples
A repository of 155 code examples for BeepBeep
SymbolDistributionClusters.java
1 /*
2  BeepBeep, an event stream processor
3  Copyright (C) 2008-2017 Sylvain Hallé
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as published
7  by the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17  */
18 package mining.trenddistance;
19 
20 import static ca.uqac.lif.cep.Connector.INPUT;
21 import static ca.uqac.lif.cep.Connector.OUTPUT;
22 
23 import java.util.HashSet;
24 import java.util.Set;
25 
26 import org.apache.commons.math3.ml.clustering.DoublePoint;
27 import org.apache.commons.math3.ml.distance.EuclideanDistance;
28 
29 import ca.uqac.lif.cep.Connector;
30 import ca.uqac.lif.cep.GroupProcessor;
31 import ca.uqac.lif.cep.Pullable;
32 import ca.uqac.lif.cep.functions.StreamVariable;
33 import ca.uqac.lif.cep.functions.CumulativeFunction;
34 import ca.uqac.lif.cep.functions.Cumulate;
35 import ca.uqac.lif.cep.functions.ApplyFunction;
36 import ca.uqac.lif.cep.functions.FunctionTree;
37 import ca.uqac.lif.cep.functions.IdentityFunction;
38 import ca.uqac.lif.cep.functions.TurnInto;
39 import ca.uqac.lif.cep.io.ReadStringStream;
40 import ca.uqac.lif.cep.peg.TrendDistance;
41 import ca.uqac.lif.cep.peg.ml.DistanceToClosest;
42 import ca.uqac.lif.cep.peg.ml.DoublePointCast;
43 import ca.uqac.lif.cep.peg.MapDistance.ToValueArray;
44 import ca.uqac.lif.cep.peg.Normalize;
45 import ca.uqac.lif.cep.tmf.Slice;
46 import ca.uqac.lif.cep.util.Numbers;
47 import ca.uqac.lif.cep.util.FindPattern;
48 
49 /**
50  * Trend distance based on the statistical distribution of symbols in a
51  * stream. In this example, a feature vector is computed from an input
52  * trace by calculating the fraction of a's and b's that occur in a sliding
53  * window of width 9
54  * (see {@link mining.extraction.KmeansSymbolDistribution KmeansSymbolDistribution}
55  * for an explanation of how this is computed).
56  * The reference pattern is a set of two-dimensional points, corresponding
57  * to the centroids of two clusters. The distance function computes the
58  * Euclidean distance between the computed feature vector and the
59  * <em>closest</em> centroid of the reference set. If this distance is greater
60  * than <i>d</i>=0.25, an alarm is raised.
61  * <p>
62  * For example, suppose that the two centroids have coordinates (0.7, 0.3) and
63  * (0.3, 0.7); they are represented by two crosses in the 2D plot below.
64  * <p>
65  * <img src="./doc-files/mining/extraction/ClusteringAB.png" alt="Plot">
66  * <p>
67  * Consider the following window of 9 events:
68  * <pre>
69  * a, b, a, b, a, b, a, b, a
70  * </pre>
71  * The feature vector extracted from this window is (0.56, 0.44) (red dot
72  * in the plot above). The centroid closest to this point is (0.7, 0.3),
73  * at a distance of about 0.2, which is less than 0.25: the vector is close
74  * enough to an existing cluster and no alarm is raised. A window with eight
75  * a's and one b gives (0.89, 0.11), at distance 0.27: an alarm is raised.
76  * <p>
77  * The parameters of the <tt>TrendDistance</tt> processor in this example
78  * are as follows:
79  * <table>
80  * <tr><th>Parameter</th><th>Value</th></tr>
81  * <tr>
82  * <td><img src="./doc-files/mining/trenddistance/WidthParameter.png" alt="Window Width" title="The width of the window"></td>
83  * <td>9</td>
84  * </tr>
85  * <tr>
86  * <td><img src="./doc-files/mining/trenddistance/BetaProcessor.png" alt="Beta processor" title="The processor that computes the pattern over the current input stream"></td>
87  * <td><img src="./doc-files/mining/extraction/SymbolDistributionDoublePoint.png" alt="Processor chain"></td>
88  * </tr>
89  * <tr>
90  * <td><img src="./doc-files/mining/trenddistance/PatternParameter.png" alt="Reference Pattern" title="The reference pattern"></td>
91  * <td>{(0.7, 0.3), (0.3, 0.7)}</td>
92  * </tr>
93  * <tr>
94  * <td><img src="./doc-files/mining/trenddistance/DistanceFunction.png" alt="Distance Function" title="The function that computes the distance with respect to the reference pattern"></td>
95  * <td><img src="./doc-files/mining/trenddistance/DistanceToClosest.png" alt="Distance Function">
96  * ({@link ca.uqac.lif.cep.peg.ml.DistanceToClosest DistanceToClosest}
97  * using {@link org.apache.commons.math3.ml.distance.EuclideanDistance EuclideanDistance} metric)</td>
98  * </tr>
99  * <tr>
100  * <td><img src="./doc-files/mining/trenddistance/ComparisonFunction.png" alt="Comparison Function" title="The function that compares that distance with a given threshold"></td>
101  * <td><img src="./doc-files/mining/LessThanOrEqual.png" alt="&leq;"></td>
102  * </tr>
103  * <tr>
104  * <td><img src="./doc-files/mining/trenddistance/DistanceThreshold.png" alt="Distance Threshold" title="The distance threshold"></td>
105  * <td>¼</td>
106  * </tr>
107 
108  * </table>
109  *
110  * @author Sylvain Hallé
111  *
112  */
114 {
115  public static void main(String[] args)
116  {
117  ReadStringStream reader = new ReadStringStream(SymbolDistributionClusters.class.getResourceAsStream("SymbolDistribution-AB.txt"));
118  FindPattern feeder = new FindPattern("(.*?),");
119  Connector.connect(reader, feeder);
120  /* We then create a processor that computes the feature vector
121  * from an input trace. */
122  GroupProcessor vector = new GroupProcessor(1, 1);
123  {
124  GroupProcessor counter = new GroupProcessor(1, 1);
125  {
126  TurnInto one = new TurnInto(1);
127  counter.associateInput(INPUT, one, INPUT);
128  Cumulate sum_one = new Cumulate(new CumulativeFunction<Number>(Numbers.addition));
129  Connector.connect(one, sum_one);
130  counter.associateOutput(OUTPUT, sum_one, OUTPUT);
131  counter.addProcessors(one, sum_one);
132  }
133  Slice slicer = new Slice(new IdentityFunction(1), counter);
134  ApplyFunction to_normalized_vector = new ApplyFunction(
135  new FunctionTree(DoublePointCast.instance,
136  new FunctionTree(Normalize.instance,
137  new FunctionTree(ToValueArray.instance, StreamVariable.X))));
138  Connector.connect(slicer, to_normalized_vector);
139  vector.associateInput(INPUT, slicer, INPUT);
140  vector.associateOutput(OUTPUT, to_normalized_vector, OUTPUT);
141  vector.addProcessors(slicer, to_normalized_vector);
142  }
143  Connector.connect(feeder, vector);
144  Set<DoublePoint> pattern = new HashSet<DoublePoint>();
145  pattern.add(new DoublePoint(new double[]{0.7, 0.3}));
146  pattern.add(new DoublePoint(new double[]{0.3, 0.7}));
147  TrendDistance<Set<DoublePoint>,Set<DoublePoint>,Number> alarm = new TrendDistance<Set<DoublePoint>,Set<DoublePoint>,Number>(pattern, 9, vector, new FunctionTree(Numbers.absoluteValue,
148  new FunctionTree(new DistanceToClosest(new EuclideanDistance()), StreamVariable.X, StreamVariable.Y)), 0.25, Numbers.isLessThan);
149  Connector.connect(feeder, alarm);
150  Pullable p = alarm.getPullableOutput();
151  boolean b = true;
152  for (int i = 0; b && i < 10; i++)
153  {
154  b = (Boolean) p.pull();
155  System.out.println(b);
156  }
157  }
158 }
Trend distance based on the statistical distribution of symbols in a stream.