Bringing down time constraint to packet level so that we will exclude those pairs...
[pingpong.git] / Code / Projects / PacketLevelSignatureExtractor / src / main / java / edu / uci / iotproject / detection / layer3 / Layer3ClusterMatcher.java
1 package edu.uci.iotproject.detection.layer3;
2
3 import edu.uci.iotproject.detection.AbstractClusterMatcher;
4 import edu.uci.iotproject.detection.ClusterMatcherObserver;
5 import edu.uci.iotproject.trafficreassembly.layer3.Conversation;
6 import edu.uci.iotproject.trafficreassembly.layer3.TcpReassembler;
7 import edu.uci.iotproject.analysis.TcpConversationUtils;
8 import edu.uci.iotproject.io.PcapHandleReader;
9 import edu.uci.iotproject.util.PrintUtils;
10 import org.pcap4j.core.*;
11
12 import java.time.ZoneId;
13 import java.util.*;
14 import java.util.stream.Collectors;
15
16 import static edu.uci.iotproject.util.PcapPacketUtils.*;
17
18 /**
19  * Searches a traffic trace for sequences of packets that "belong to" a given cluster (in other words, attempts to
20  * classify traffic as pertaining to a given cluster).
21  *
22  * @author Janus Varmarken {@literal <jvarmark@uci.edu>}
23  * @author Rahmadi Trimananda {@literal <rtrimana@uci.edu>}
24  */
25 public class Layer3ClusterMatcher extends AbstractClusterMatcher implements PacketListener {
26
27     /**
28      * The ordered directions of packets in the sequences that make up {@link #mCluster}.
29      */
30     private final Conversation.Direction[] mClusterMemberDirections;
31
32     /**
33      * For reassembling the observed traffic into TCP connections.
34      */
35     private final TcpReassembler mTcpReassembler = new TcpReassembler();
36
37     /**
38      * IP of the router's WAN port (if analyzed traffic is captured at the ISP's point of view).
39      */
40     private final String mRouterWanIp;
41
42     /**
43      * Epsilon value used by the DBSCAN algorithm; it is used again for range-based matching here.
44      */
45     private final double mEps;
46
47     /**
48      * Create a {@link Layer3ClusterMatcher}.
49      * @param cluster The cluster that traffic is matched against.
50      * @param routerWanIp The router's WAN IP if examining traffic captured at the ISP's point of view (used for
51      *                    determining the direction of packets).
52      * @param eps The epsilon value used in the DBSCAN algorithm.
53      * @param isRangeBased The boolean that decides if it is range-based vs. strict matching.
54      * @param detectionObservers Client code that wants to get notified whenever the {@link Layer3ClusterMatcher} detects that
55      *                          (a subset of) the examined traffic is similar to the traffic that makes up
56      *                          {@code cluster}, i.e., when the examined traffic is classified as pertaining to
57      *                          {@code cluster}.
58      */
59     public Layer3ClusterMatcher(List<List<PcapPacket>> cluster, String routerWanIp, boolean isRangeBased, double eps,
60                                 ClusterMatcherObserver... detectionObservers) {
61         super(cluster, isRangeBased);
62         Objects.requireNonNull(detectionObservers, "detectionObservers cannot be null");
63         for (ClusterMatcherObserver obs : detectionObservers) {
64             addObserver(obs);
65         }
66         // Build the cluster members' direction sequence.
67         // Note: assumes that the provided cluster was captured within the local network (routerWanIp is set to null).
68         mClusterMemberDirections = getPacketDirections(cluster.get(0), null);
69         /*
70          * Enforce restriction on cluster members: all representatives must exhibit the same direction pattern and
71          * contain the same number of packets. Note that this is a somewhat heavy operation, so it may be disabled later
72          * on in favor of performance. However, it is only run once (at instantiation), so the overhead may be warranted
73          * in order to ensure correctness, especially during the development/debugging phase.
74          */
75         if (!isRangeBased) {    // Only when it is not range-based
76             if (mCluster.stream().
77                     anyMatch(inner -> !Arrays.equals(mClusterMemberDirections, getPacketDirections(inner, null)))) {
78                 throw new IllegalArgumentException(
79                         "cluster members must contain the same number of packets and exhibit the same packet direction " +
80                                 "pattern"
81                 );
82             }
83         }
84         mEps = eps;
85         mRouterWanIp = routerWanIp;
86     }
87
88     @Override
89     public void gotPacket(PcapPacket packet) {
90         // Present packet to TCP reassembler so that it can be mapped to a connection (if it is a TCP packet).
91         mTcpReassembler.gotPacket(packet);
92     }
93
94     /**
95      * Get the cluster that describes the packet sequence that this {@link Layer3ClusterMatcher} is searching for.
96      * @return the cluster that describes the packet sequence that this {@link Layer3ClusterMatcher} is searching for.
97      */
98     public List<List<PcapPacket>> getCluster() {
99         return mCluster;
100     }
101
102     public void performDetectionRangeBased() {
103         /*
104          * Let's start out simple by building a version that only works for signatures that do not span across multiple
105          * TCP conversations...
106          */
107         for (Conversation c : mTcpReassembler.getTcpConversations()) {
108             if (c.isTls() && c.getTlsApplicationDataPackets().isEmpty() || !c.isTls() && c.getPackets().isEmpty()) {
109                 // Skip empty conversations.
110                 continue;
111             }
112             List<PcapPacket> lowerBound = mCluster.get(0);
113             List<PcapPacket> upperBound = mCluster.get(1);
114             if (isTlsSequence(lowerBound) != c.isTls() || isTlsSequence(upperBound) != c.isTls()) {
115                 // We consider it a mismatch if one is a TLS application data sequence and the other is not.
116                 continue;
117             }
118             // Fetch set of packets to examine based on TLS or not.
119             List<PcapPacket> cPkts = c.isTls() ? c.getTlsApplicationDataPackets() : c.getPackets();
120             Optional<List<PcapPacket>> match;
121             while ((match = findSubsequenceInSequence(lowerBound, upperBound, cPkts, mClusterMemberDirections, null)).
122                     isPresent()) {
123                 List<PcapPacket> matchSeq = match.get();
124                 // Notify observers about the match.
125                 mObservers.forEach(o -> o.onMatch(Layer3ClusterMatcher.this, matchSeq));
126                 /*
127                  * Get the index in cPkts of the last packet in the sequence of packets that matches the searched
128                  * signature sequence.
129                  */
130                 int matchSeqEndIdx = cPkts.indexOf(matchSeq.get(matchSeq.size() - 1));
131                 // We restart the search for the signature sequence immediately after that index, so truncate cPkts.
132                 cPkts = cPkts.stream().skip(matchSeqEndIdx + 1).collect(Collectors.toList());
133             }
134         }
135     }
136
137     public void performDetectionConservative() {
138         /*
139          * Let's start out simple by building a version that only works for signatures that do not span across multiple
140          * TCP conversations...
141          */
142         for (Conversation c : mTcpReassembler.getTcpConversations()) {
143             if (c.isTls() && c.getTlsApplicationDataPackets().isEmpty() || !c.isTls() && c.getPackets().isEmpty()) {
144                 // Skip empty conversations.
145                 continue;
146             }
147             for (List<PcapPacket> signatureSequence : mCluster) {
148                 if (isTlsSequence(signatureSequence) != c.isTls()) {
149                     // We consider it a mismatch if one is a TLS application data sequence and the other is not.
150                     continue;
151                 }
152                 // Fetch set of packets to examine based on TLS or not.
153                 List<PcapPacket> cPkts = c.isTls() ? c.getTlsApplicationDataPackets() : c.getPackets();
154                 /*
155                  * Note: we embed the attempt to detect the signature sequence in a loop in order to capture those cases
156                  * where the same signature sequence appears multiple times in one Conversation.
157                  *
158                  * Note: since we expect all sequences that together make up the signature to exhibit the same direction
159                  * pattern, we can simply pass the precomputed direction array for the signature sequence so that it
160                  * won't have to be recomputed internally in each call to findSubsequenceInSequence().
161                  */
162                 Optional<List<PcapPacket>> match;
163                 while ((match = findSubsequenceInSequence(signatureSequence, cPkts, mClusterMemberDirections, null)).
164                         isPresent()) {
165                     List<PcapPacket> matchSeq = match.get();
166                     // Notify observers about the match.
167                     mObservers.forEach(o -> o.onMatch(Layer3ClusterMatcher.this, matchSeq));
168                     /*
169                      * Get the index in cPkts of the last packet in the sequence of packets that matches the searched
170                      * signature sequence.
171                      */
172                     int matchSeqEndIdx = cPkts.indexOf(matchSeq.get(matchSeq.size() - 1));
173                     // We restart the search for the signature sequence immediately after that index, so truncate cPkts.
174                     cPkts = cPkts.stream().skip(matchSeqEndIdx + 1).collect(Collectors.toList());
175                 }
176             }
177
178             /*
179              * TODO:
180              * if no item in cluster matches, also perform a distance-based matching to cover those cases where we did
181              * not manage to capture every single mutation of the sequence during training.
182              *
183              * Need to compute average/centroid of cluster to do so...? Compute within-cluster variance, then check if
184              * distance between input conversation and cluster average/centroid is smaller than or equal to the computed
185              * variance?
186              */
187         }
188     }
189
190     /**
191      * Checks if {@code sequence} is a sequence of TLS packets. Note: the current implementation relies on inspection
192      * of the port numbers when deciding between TLS vs. non-TLS. Therefore, only the first packet of {@code sequence}
193      * is examined as it is assumed that all packets in {@code sequence} pertain to the same {@link Conversation} and
194      * hence share the same set of two src/dst port numbers (albeit possibly alternating between which one is the src
195      * and which one is the dst, as packets in {@code sequence} may be in alternating directions).
196      * @param sequence The sequence of packets for which it is to be determined if it is a sequence of TLS packets or
197      *                 non-TLS packets.
198      * @return {@code true} if {@code sequence} is a sequence of TLS packets, {@code false} otherwise.
199      */
200     private boolean isTlsSequence(List<PcapPacket> sequence) {
201         // NOTE: Assumes ALL packets in sequence pertain to the same TCP connection!
202         PcapPacket firstPkt = sequence.get(0);
203         int srcPort = getSourcePort(firstPkt);
204         int dstPort = getDestinationPort(firstPkt);
205         return TcpConversationUtils.isTlsPort(srcPort) || TcpConversationUtils.isTlsPort(dstPort);
206     }
207
208     /**
209      * Examine if a given sequence of packets ({@code sequence}) contains a given shorter sequence of packets
210      * ({@code subsequence}). Note: the current implementation actually searches for a substring as it does not allow
211      * for interleaving packets in {@code sequence} that are not in {@code subsequence}; for example, if
212      * {@code subsequence} consists of packet lengths [2, 3, 5] and {@code sequence} consists of  packet lengths
213      * [2, 3, 4, 5], the result will be that there is no match (because of the interleaving 4). If we are to allow
214      * interleaving packets, we need a modified version of
215      * <a href="https://stackoverflow.com/a/20545604/1214974">this</a>.
216      *
217      * @param subsequence The sequence to search for.
218      * @param sequence The sequence to search.
219      * @param subsequenceDirections The directions of packets in {@code subsequence} such that for all {@code i},
220      *                              {@code subsequenceDirections[i]} is the direction of the packet returned by
221      *                              {@code subsequence.get(i)}. May be set to {@code null}, in which this call will
222      *                              internally compute the packet directions.
223      * @param sequenceDirections The directions of packets in {@code sequence} such that for all {@code i},
224      *                           {@code sequenceDirections[i]} is the direction of the packet returned by
225      *                           {@code sequence.get(i)}. May be set to {@code null}, in which this call will internally
226      *                           compute the packet directions.
227      *
228      * @return An {@link Optional} containing the part of {@code sequence} that matches {@code subsequence}, or an empty
229      *         {@link Optional} if no part of {@code sequence} matches {@code subsequence}.
230      */
231     private Optional<List<PcapPacket>> findSubsequenceInSequence(List<PcapPacket> subsequence,
232                                                                  List<PcapPacket> sequence,
233                                                                  Conversation.Direction[] subsequenceDirections,
234                                                                  Conversation.Direction[] sequenceDirections) {
235         if (sequence.size() < subsequence.size()) {
236             // If subsequence is longer, it cannot be contained in sequence.
237             return Optional.empty();
238         }
239         if (isTlsSequence(subsequence) != isTlsSequence(sequence)) {
240             // We consider it a mismatch if one is a TLS application data sequence and the other is not.
241             return Optional.empty();
242         }
243         // If packet directions have not been precomputed by calling code, we need to construct them.
244         if (subsequenceDirections == null) {
245             subsequenceDirections = getPacketDirections(subsequence, mRouterWanIp);
246         }
247         if (sequenceDirections == null) {
248             sequenceDirections = getPacketDirections(sequence, mRouterWanIp);
249         }
250         int subseqIdx = 0;
251         int seqIdx = 0;
252         while (seqIdx < sequence.size()) {
253             PcapPacket subseqPkt = subsequence.get(subseqIdx);
254             PcapPacket seqPkt = sequence.get(seqIdx);
255             // We only have a match if packet lengths and directions match.
256             if (subseqPkt.getOriginalLength() == seqPkt.getOriginalLength() &&
257                     subsequenceDirections[subseqIdx] == sequenceDirections[seqIdx]) {
258                 // A match; advance both indices to consider next packet in subsequence vs. next packet in sequence.
259                 subseqIdx++;
260                 seqIdx++;
261                 if (subseqIdx == subsequence.size()) {
262                     // We managed to match the entire subsequence in sequence.
263                     // Return the sublist of sequence that matches subsequence.
264                     /*
265                      * TODO:
266                      * ASSUMES THE BACKING LIST (i.e., 'sequence') IS _NOT_ STRUCTURALLY MODIFIED, hence may not work
267                      * for live traces!
268                      */
269                     return Optional.of(sequence.subList(seqIdx - subsequence.size(), seqIdx));
270                 }
271             } else {
272                 // Mismatch.
273                 if (subseqIdx > 0) {
274                     /*
275                      * If we managed to match parts of subsequence, we restart the search for subsequence in sequence at
276                      * the index of sequence where the current mismatch occurred. I.e., we must reset subseqIdx, but
277                      * leave seqIdx untouched.
278                      */
279                     subseqIdx = 0;
280                 } else {
281                     /*
282                      * First packet of subsequence didn't match packet at seqIdx of sequence, so we move forward in
283                      * sequence, i.e., we continue the search for subsequence in sequence starting at index seqIdx+1 of
284                      * sequence.
285                      */
286                     seqIdx++;
287                 }
288             }
289         }
290         return Optional.empty();
291     }
292
293     /**
294      * Overloading the method {@code findSubsequenceInSequence} for range-based matching. Instead of a sequence,
295      * we have sequences of lower and upper bounds.
296      *
297      * @param lowerBound The lower bound of the sequence we search for.
298      * @param upperBound The upper bound of the sequence we search for.
299      * @param subsequenceDirections The directions of packets in {@code subsequence} such that for all {@code i},
300      *                              {@code subsequenceDirections[i]} is the direction of the packet returned by
301      *                              {@code subsequence.get(i)}. May be set to {@code null}, in which this call will
302      *                              internally compute the packet directions.
303      * @param sequenceDirections The directions of packets in {@code sequence} such that for all {@code i},
304      *                           {@code sequenceDirections[i]} is the direction of the packet returned by
305      *                           {@code sequence.get(i)}. May be set to {@code null}, in which this call will internally
306      *                           compute the packet directions.
307      *
308      * @return An {@link Optional} containing the part of {@code sequence} that matches {@code subsequence}, or an empty
309      *         {@link Optional} if no part of {@code sequence} matches {@code subsequence}.
310      */
311     private Optional<List<PcapPacket>> findSubsequenceInSequence(List<PcapPacket> lowerBound,
312                                                                  List<PcapPacket> upperBound,
313                                                                  List<PcapPacket> sequence,
314                                                                  Conversation.Direction[] subsequenceDirections,
315                                                                  Conversation.Direction[] sequenceDirections) {
316         // Just do the checks for either lower or upper bound!
317         // TODO: For now we use just the lower bound
318         if (sequence.size() < lowerBound.size()) {
319             // If subsequence is longer, it cannot be contained in sequence.
320             return Optional.empty();
321         }
322         if (isTlsSequence(lowerBound) != isTlsSequence(sequence)) {
323             // We consider it a mismatch if one is a TLS application data sequence and the other is not.
324             return Optional.empty();
325         }
326         // If packet directions have not been precomputed by calling code, we need to construct them.
327         if (subsequenceDirections == null) {
328             subsequenceDirections = getPacketDirections(lowerBound, mRouterWanIp);
329         }
330         if (sequenceDirections == null) {
331             sequenceDirections = getPacketDirections(sequence, mRouterWanIp);
332         }
333         int subseqIdx = 0;
334         int seqIdx = 0;
335         while (seqIdx < sequence.size()) {
336             PcapPacket lowBndPkt = lowerBound.get(subseqIdx);
337             PcapPacket upBndPkt = upperBound.get(subseqIdx);
338             PcapPacket seqPkt = sequence.get(seqIdx);
339             // We only have a match if packet lengths and directions match.
340             // The packet lengths have to be in the range of [lowerBound - eps, upperBound+eps]
341             // TODO: Maybe we could do better here for the double to integer conversion?
342             int epsLowerBound = lowBndPkt.length() - (int) mEps;
343             int epsUpperBound = upBndPkt.length() + (int) mEps;
344             if (epsLowerBound <= seqPkt.getOriginalLength() &&
345                     seqPkt.getOriginalLength() <= epsUpperBound &&
346                     subsequenceDirections[subseqIdx] == sequenceDirections[seqIdx]) {
347                 // A match; advance both indices to consider next packet in subsequence vs. next packet in sequence.
348                 subseqIdx++;
349                 seqIdx++;
350                 if (subseqIdx == lowerBound.size()) {
351                     // We managed to match the entire subsequence in sequence.
352                     // Return the sublist of sequence that matches subsequence.
353                     /*
354                      * TODO:
355                      * ASSUMES THE BACKING LIST (i.e., 'sequence') IS _NOT_ STRUCTURALLY MODIFIED, hence may not work
356                      * for live traces!
357                      */
358                     return Optional.of(sequence.subList(seqIdx - lowerBound.size(), seqIdx));
359                 }
360             } else {
361                 // Mismatch.
362                 if (subseqIdx > 0) {
363                     /*
364                      * If we managed to match parts of subsequence, we restart the search for subsequence in sequence at
365                      * the index of sequence where the current mismatch occurred. I.e., we must reset subseqIdx, but
366                      * leave seqIdx untouched.
367                      */
368                     subseqIdx = 0;
369                 } else {
370                     /*
371                      * First packet of subsequence didn't match packet at seqIdx of sequence, so we move forward in
372                      * sequence, i.e., we continue the search for subsequence in sequence starting at index seqIdx+1 of
373                      * sequence.
374                      */
375                     seqIdx++;
376                 }
377             }
378         }
379         return Optional.empty();
380     }
381
382     /**
383      * Given a cluster, produces a pruned version of that cluster. In the pruned version, there are no duplicate cluster
384      * members. Two cluster members are considered identical if their packets lengths and packet directions are
385      * identical. The resulting pruned cluster is unmodifiable (this applies to both the outermost list as well as the
386      * nested lists) in order to preserve its integrity when exposed to external code (e.g., through
387      * {@link #getCluster()}).
388      *
389      * @param cluster A cluster to prune.
390      * @return The resulting pruned cluster.
391      */
392     @Override
393     protected List<List<PcapPacket>> pruneCluster(List<List<PcapPacket>> cluster) {
394         List<List<PcapPacket>> prunedCluster = new ArrayList<>();
395         for (List<PcapPacket> originalClusterSeq : cluster) {
396             boolean alreadyPresent = false;
397             for (List<PcapPacket> prunedClusterSeq : prunedCluster) {
398                 Optional<List<PcapPacket>> duplicate = findSubsequenceInSequence(originalClusterSeq, prunedClusterSeq,
399                         mClusterMemberDirections, mClusterMemberDirections);
400                 if (duplicate.isPresent()) {
401                     alreadyPresent = true;
402                     break;
403                 }
404             }
405             if (!alreadyPresent) {
406                 prunedCluster.add(Collections.unmodifiableList(originalClusterSeq));
407             }
408         }
409         return Collections.unmodifiableList(prunedCluster);
410     }
411
412     /**
413      * Given a {@code List<PcapPacket>}, generate a {@code Conversation.Direction[]} such that each entry in the
414      * resulting {@code Conversation.Direction[]} specifies the direction of the {@link PcapPacket} at the corresponding
415      * index in the input list.
416      * @param packets The list of packets for which to construct a corresponding array of packet directions.
417      * @param routerWanIp The IP of the router's WAN port. This is used for determining the direction of packets when
418      *                    the traffic is captured just outside the local network (at the ISP side of the router). Set to
419      *                    {@code null} if {@code packets} stem from traffic captured within the local network.
420      * @return A {@code Conversation.Direction[]} specifying the direction of the {@link PcapPacket} at the
421      *         corresponding index in {@code packets}.
422      */
423     private static Conversation.Direction[] getPacketDirections(List<PcapPacket> packets, String routerWanIp) {
424         Conversation.Direction[] directions = new Conversation.Direction[packets.size()];
425         for (int i = 0; i < packets.size(); i++) {
426             PcapPacket pkt = packets.get(i);
427             if (getSourceIp(pkt).equals(getDestinationIp(pkt))) {
428                 // Sanity check: we shouldn't be processing loopback traffic
429                 throw new AssertionError("loopback traffic detected");
430             }
431             if (isSrcIpLocal(pkt) || getSourceIp(pkt).equals(routerWanIp)) {
432                 directions[i] = Conversation.Direction.CLIENT_TO_SERVER;
433             } else if (isDstIpLocal(pkt) || getDestinationIp(pkt).equals(routerWanIp)) {
434                 directions[i] = Conversation.Direction.SERVER_TO_CLIENT;
435             } else {
436                 //throw new IllegalArgumentException("no local IP or router WAN port IP found, can't detect direction");
437             }
438         }
439         return directions;
440     }
441
442 }