165cdb3e57f68de4319504a3fee4e15cd4efc34e
[pingpong.git] / Code / Projects / PacketLevelSignatureExtractor / src / main / java / edu / uci / iotproject / detection / layer3 / Layer3ClusterMatcher.java
1 package edu.uci.iotproject.detection.layer3;
2
3 import edu.uci.iotproject.analysis.TriggerTrafficExtractor;
4 import edu.uci.iotproject.detection.AbstractClusterMatcher;
5 import edu.uci.iotproject.detection.ClusterMatcherObserver;
6 import edu.uci.iotproject.trafficreassembly.layer3.Conversation;
7 import edu.uci.iotproject.trafficreassembly.layer3.TcpReassembler;
8 import edu.uci.iotproject.analysis.TcpConversationUtils;
9 import edu.uci.iotproject.io.PcapHandleReader;
10 import edu.uci.iotproject.util.PrintUtils;
11 import org.pcap4j.core.*;
12
13 import java.time.ZoneId;
14 import java.util.*;
15 import java.util.stream.Collectors;
16
17 import static edu.uci.iotproject.util.PcapPacketUtils.*;
18
19 /**
20  * Searches a traffic trace for sequences of packets "belong to" a given cluster (in other words, attempts to classify
21  * traffic as pertaining to a given cluster).
22  *
23  * @author Janus Varmarken {@literal <jvarmark@uci.edu>}
24  * @author Rahmadi Trimananda {@literal <rtrimana@uci.edu>}
25  */
26 public class Layer3ClusterMatcher extends AbstractClusterMatcher implements PacketListener {
27
28     /**
29      * The ordered directions of packets in the sequences that make up {@link #mCluster}.
30      */
31     private final Conversation.Direction[] mClusterMemberDirections;
32
33     /**
34      * For reassembling the observed traffic into TCP connections.
35      */
36     private final TcpReassembler mTcpReassembler = new TcpReassembler();
37
38     /**
39      * IP of the router's WAN port (if analyzed traffic is captured at the ISP's point of view).
40      */
41     private final String mRouterWanIp;
42
43     /**
44      * Epsilon value used by the DBSCAN algorithm; it is used again for range-based matching here.
45      */
46     private final double mEps;
47
48     /**
49      * The packet inclusion time for signature.
50      */
51     private int mInclusionTimeMillis;
52
53     /**
54      * Create a {@link Layer3ClusterMatcher}.
55      * @param cluster The cluster that traffic is matched against.
56      * @param routerWanIp The router's WAN IP if examining traffic captured at the ISP's point of view (used for
57      *                    determining the direction of packets).
58      * @param inclusionTimeMillis The packet inclusion time for signature.
59      * @param isRangeBased The boolean that decides if it is range-based vs. strict matching.
60      * @param eps The epsilon value used in the DBSCAN algorithm.
61      * @param detectionObservers Client code that wants to get notified whenever the {@link Layer3ClusterMatcher} detects that
62      *                          (a subset of) the examined traffic is similar to the traffic that makes up
63      *                          {@code cluster}, i.e., when the examined traffic is classified as pertaining to
64      *                          {@code cluster}.
65      */
66     public Layer3ClusterMatcher(List<List<PcapPacket>> cluster, String routerWanIp, int inclusionTimeMillis,
67                                 boolean isRangeBased, double eps,
68                                 ClusterMatcherObserver... detectionObservers) {
69         super(cluster, isRangeBased);
70         Objects.requireNonNull(detectionObservers, "detectionObservers cannot be null");
71         for (ClusterMatcherObserver obs : detectionObservers) {
72             addObserver(obs);
73         }
74         // Build the cluster members' direction sequence.
75         // Note: assumes that the provided cluster was captured within the local network (routerWanIp is set to null).
76         mClusterMemberDirections = getPacketDirections(cluster.get(0), null);
77         /*
78          * Enforce restriction on cluster members: all representatives must exhibit the same direction pattern and
79          * contain the same number of packets. Note that this is a somewhat heavy operation, so it may be disabled later
80          * on in favor of performance. However, it is only run once (at instantiation), so the overhead may be warranted
81          * in order to ensure correctness, especially during the development/debugging phase.
82          */
83         if (!isRangeBased) {    // Only when it is not range-based
84             if (mCluster.stream().
85                     anyMatch(inner -> !Arrays.equals(mClusterMemberDirections, getPacketDirections(inner, null)))) {
86                 throw new IllegalArgumentException(
87                         "cluster members must contain the same number of packets and exhibit the same packet direction " +
88                                 "pattern"
89                 );
90             }
91         }
92         mEps = eps;
93         mRouterWanIp = routerWanIp;
94         mInclusionTimeMillis =
95                 inclusionTimeMillis == 0 ? TriggerTrafficExtractor.INCLUSION_WINDOW_MILLIS : inclusionTimeMillis;
96     }
97
98     @Override
99     public void gotPacket(PcapPacket packet) {
100         // Present packet to TCP reassembler so that it can be mapped to a connection (if it is a TCP packet).
101         mTcpReassembler.gotPacket(packet);
102     }
103
104     /**
105      * Get the cluster that describes the packet sequence that this {@link Layer3ClusterMatcher} is searching for.
106      * @return the cluster that describes the packet sequence that this {@link Layer3ClusterMatcher} is searching for.
107      */
108     public List<List<PcapPacket>> getCluster() {
109         return mCluster;
110     }
111
112     public void performDetectionRangeBased() {
113         /*
114          * Let's start out simple by building a version that only works for signatures that do not span across multiple
115          * TCP conversations...
116          */
117         for (Conversation c : mTcpReassembler.getTcpConversations()) {
118             if (c.isTls() && c.getTlsApplicationDataPackets().isEmpty() || !c.isTls() && c.getPackets().isEmpty()) {
119                 // Skip empty conversations.
120                 continue;
121             }
122             List<PcapPacket> lowerBound = mCluster.get(0);
123             List<PcapPacket> upperBound = mCluster.get(1);
124             if (isTlsSequence(lowerBound) != c.isTls() || isTlsSequence(upperBound) != c.isTls()) {
125                 // We consider it a mismatch if one is a TLS application data sequence and the other is not.
126                 continue;
127             }
128             // Fetch set of packets to examine based on TLS or not.
129             List<PcapPacket> cPkts = c.isTls() ? c.getTlsApplicationDataPackets() : c.getPackets();
130             Optional<List<PcapPacket>> match;
131             while ((match = findSubsequenceInSequence(lowerBound, upperBound, cPkts, mClusterMemberDirections, null)).
132                     isPresent()) {
133                 List<PcapPacket> matchSeq = match.get();
134                 // Notify observers about the match.
135                 // Max number of skipped packets in layer 3 is 0 (no skipped packets)
136                 mObservers.forEach(o -> o.onMatch(Layer3ClusterMatcher.this, matchSeq));
137                 /*
138                  * Get the index in cPkts of the last packet in the sequence of packets that matches the searched
139                  * signature sequence.
140                  */
141                 int matchSeqEndIdx = cPkts.indexOf(matchSeq.get(matchSeq.size() - 1));
142                 // We restart the search for the signature sequence immediately after that index, so truncate cPkts.
143                 cPkts = cPkts.stream().skip(matchSeqEndIdx + 1).collect(Collectors.toList());
144             }
145         }
146     }
147
148     public void performDetectionConservative() {
149         /*
150          * Let's start out simple by building a version that only works for signatures that do not span across multiple
151          * TCP conversations...
152          */
153         for (Conversation c : mTcpReassembler.getTcpConversations()) {
154             if (c.isTls() && c.getTlsApplicationDataPackets().isEmpty() || !c.isTls() && c.getPackets().isEmpty()) {
155                 // Skip empty conversations.
156                 continue;
157             }
158             for (List<PcapPacket> signatureSequence : mCluster) {
159                 if (isTlsSequence(signatureSequence) != c.isTls()) {
160                     // We consider it a mismatch if one is a TLS application data sequence and the other is not.
161                     continue;
162                 }
163                 // Fetch set of packets to examine based on TLS or not.
164                 List<PcapPacket> cPkts = c.isTls() ? c.getTlsApplicationDataPackets() : c.getPackets();
165                 /*
166                  * Note: we embed the attempt to detect the signature sequence in a loop in order to capture those cases
167                  * where the same signature sequence appears multiple times in one Conversation.
168                  *
169                  * Note: since we expect all sequences that together make up the signature to exhibit the same direction
170                  * pattern, we can simply pass the precomputed direction array for the signature sequence so that it
171                  * won't have to be recomputed internally in each call to findSubsequenceInSequence().
172                  */
173                 Optional<List<PcapPacket>> match;
174                 while ((match = findSubsequenceInSequence(signatureSequence, cPkts, mClusterMemberDirections, null)).
175                         isPresent()) {
176                     List<PcapPacket> matchSeq = match.get();
177                     // Notify observers about the match.
178                     // Max number of skipped packets in layer 3 is 0 (no skipped packets)
179                     mObservers.forEach(o -> o.onMatch(Layer3ClusterMatcher.this, matchSeq));
180                     /*
181                      * Get the index in cPkts of the last packet in the sequence of packets that matches the searched
182                      * signature sequence.
183                      */
184                     int matchSeqEndIdx = cPkts.indexOf(matchSeq.get(matchSeq.size() - 1));
185                     // We restart the search for the signature sequence immediately after that index, so truncate cPkts.
186                     cPkts = cPkts.stream().skip(matchSeqEndIdx + 1).collect(Collectors.toList());
187                 }
188             }
189
190             /*
191              * TODO:
192              * if no item in cluster matches, also perform a distance-based matching to cover those cases where we did
193              * not manage to capture every single mutation of the sequence during training.
194              *
195              * Need to compute average/centroid of cluster to do so...? Compute within-cluster variance, then check if
196              * distance between input conversation and cluster average/centroid is smaller than or equal to the computed
197              * variance?
198              */
199         }
200     }
201
202     /**
203      * Checks if {@code sequence} is a sequence of TLS packets. Note: the current implementation relies on inspection
204      * of the port numbers when deciding between TLS vs. non-TLS. Therefore, only the first packet of {@code sequence}
205      * is examined as it is assumed that all packets in {@code sequence} pertain to the same {@link Conversation} and
206      * hence share the same set of two src/dst port numbers (albeit possibly alternating between which one is the src
207      * and which one is the dst, as packets in {@code sequence} may be in alternating directions).
208      * @param sequence The sequence of packets for which it is to be determined if it is a sequence of TLS packets or
209      *                 non-TLS packets.
210      * @return {@code true} if {@code sequence} is a sequence of TLS packets, {@code false} otherwise.
211      */
212     private boolean isTlsSequence(List<PcapPacket> sequence) {
213         // NOTE: Assumes ALL packets in sequence pertain to the same TCP connection!
214         PcapPacket firstPkt = sequence.get(0);
215         int srcPort = getSourcePort(firstPkt);
216         int dstPort = getDestinationPort(firstPkt);
217         return TcpConversationUtils.isTlsPort(srcPort) || TcpConversationUtils.isTlsPort(dstPort);
218     }
219
220     /**
221      * Examine if a given sequence of packets ({@code sequence}) contains a given shorter sequence of packets
222      * ({@code subsequence}). Note: the current implementation actually searches for a substring as it does not allow
223      * for interleaving packets in {@code sequence} that are not in {@code subsequence}; for example, if
224      * {@code subsequence} consists of packet lengths [2, 3, 5] and {@code sequence} consists of  packet lengths
225      * [2, 3, 4, 5], the result will be that there is no match (because of the interleaving 4). If we are to allow
226      * interleaving packets, we need a modified version of
227      * <a href="https://stackoverflow.com/a/20545604/1214974">this</a>.
228      *
229      * @param subsequence The sequence to search for.
230      * @param sequence The sequence to search.
231      * @param subsequenceDirections The directions of packets in {@code subsequence} such that for all {@code i},
232      *                              {@code subsequenceDirections[i]} is the direction of the packet returned by
233      *                              {@code subsequence.get(i)}. May be set to {@code null}, in which this call will
234      *                              internally compute the packet directions.
235      * @param sequenceDirections The directions of packets in {@code sequence} such that for all {@code i},
236      *                           {@code sequenceDirections[i]} is the direction of the packet returned by
237      *                           {@code sequence.get(i)}. May be set to {@code null}, in which this call will internally
238      *                           compute the packet directions.
239      *
240      * @return An {@link Optional} containing the part of {@code sequence} that matches {@code subsequence}, or an empty
241      *         {@link Optional} if no part of {@code sequence} matches {@code subsequence}.
242      */
243     private Optional<List<PcapPacket>> findSubsequenceInSequence(List<PcapPacket> subsequence,
244                                                                  List<PcapPacket> sequence,
245                                                                  Conversation.Direction[] subsequenceDirections,
246                                                                  Conversation.Direction[] sequenceDirections) {
247         if (sequence.size() < subsequence.size()) {
248             // If subsequence is longer, it cannot be contained in sequence.
249             return Optional.empty();
250         }
251         if (isTlsSequence(subsequence) != isTlsSequence(sequence)) {
252             // We consider it a mismatch if one is a TLS application data sequence and the other is not.
253             return Optional.empty();
254         }
255         // If packet directions have not been precomputed by calling code, we need to construct them.
256         if (subsequenceDirections == null) {
257             subsequenceDirections = getPacketDirections(subsequence, mRouterWanIp);
258         }
259         if (sequenceDirections == null) {
260             sequenceDirections = getPacketDirections(sequence, mRouterWanIp);
261         }
262         int subseqIdx = 0;
263         int seqIdx = 0;
264         while (seqIdx < sequence.size()) {
265             PcapPacket subseqPkt = subsequence.get(subseqIdx);
266             PcapPacket seqPkt = sequence.get(seqIdx);
267             // We only have a match if packet lengths and directions match.
268             if (subseqPkt.getOriginalLength() == seqPkt.getOriginalLength() &&
269                     subsequenceDirections[subseqIdx] == sequenceDirections[seqIdx]) {
270                 // A match; advance both indices to consider next packet in subsequence vs. next packet in sequence.
271                 subseqIdx++;
272                 seqIdx++;
273                 if (subseqIdx == subsequence.size()) {
274                     // We managed to match the entire subsequence in sequence.
275                     // Return the sublist of sequence that matches subsequence.
276                     /*
277                      * TODO:
278                      * ASSUMES THE BACKING LIST (i.e., 'sequence') IS _NOT_ STRUCTURALLY MODIFIED, hence may not work
279                      * for live traces!
280                      */
281                     return Optional.of(sequence.subList(seqIdx - subsequence.size(), seqIdx));
282                 }
283             } else {
284                 // Mismatch.
285                 if (subseqIdx > 0) {
286                     /*
287                      * If we managed to match parts of subsequence, we restart the search for subsequence in sequence at
288                      * the index of sequence where the current mismatch occurred. I.e., we must reset subseqIdx, but
289                      * leave seqIdx untouched.
290                      */
291                     subseqIdx = 0;
292                 } else {
293                     /*
294                      * First packet of subsequence didn't match packet at seqIdx of sequence, so we move forward in
295                      * sequence, i.e., we continue the search for subsequence in sequence starting at index seqIdx+1 of
296                      * sequence.
297                      */
298                     seqIdx++;
299                 }
300             }
301         }
302         return Optional.empty();
303     }
304
305     /**
306      * Overloading the method {@code findSubsequenceInSequence} for range-based matching. Instead of a sequence,
307      * we have sequences of lower and upper bounds.
308      *
309      * @param lowerBound The lower bound of the sequence we search for.
310      * @param upperBound The upper bound of the sequence we search for.
311      * @param subsequenceDirections The directions of packets in {@code subsequence} such that for all {@code i},
312      *                              {@code subsequenceDirections[i]} is the direction of the packet returned by
313      *                              {@code subsequence.get(i)}. May be set to {@code null}, in which this call will
314      *                              internally compute the packet directions.
315      * @param sequenceDirections The directions of packets in {@code sequence} such that for all {@code i},
316      *                           {@code sequenceDirections[i]} is the direction of the packet returned by
317      *                           {@code sequence.get(i)}. May be set to {@code null}, in which this call will internally
318      *                           compute the packet directions.
319      *
320      * @return An {@link Optional} containing the part of {@code sequence} that matches {@code subsequence}, or an empty
321      *         {@link Optional} if no part of {@code sequence} matches {@code subsequence}.
322      */
323     private Optional<List<PcapPacket>> findSubsequenceInSequence(List<PcapPacket> lowerBound,
324                                                                  List<PcapPacket> upperBound,
325                                                                  List<PcapPacket> sequence,
326                                                                  Conversation.Direction[] subsequenceDirections,
327                                                                  Conversation.Direction[] sequenceDirections) {
328         // Just do the checks for either lower or upper bound!
329         // TODO: For now we use just the lower bound
330         if (sequence.size() < lowerBound.size()) {
331             // If subsequence is longer, it cannot be contained in sequence.
332             return Optional.empty();
333         }
334         if (isTlsSequence(lowerBound) != isTlsSequence(sequence)) {
335             // We consider it a mismatch if one is a TLS application data sequence and the other is not.
336             return Optional.empty();
337         }
338         // If packet directions have not been precomputed by calling code, we need to construct them.
339         if (subsequenceDirections == null) {
340             subsequenceDirections = getPacketDirections(lowerBound, mRouterWanIp);
341         }
342         if (sequenceDirections == null) {
343             sequenceDirections = getPacketDirections(sequence, mRouterWanIp);
344         }
345         int subseqIdx = 0;
346         int seqIdx = 0;
347         while (seqIdx < sequence.size()) {
348             PcapPacket lowBndPkt = lowerBound.get(subseqIdx);
349             PcapPacket upBndPkt = upperBound.get(subseqIdx);
350             PcapPacket seqPkt = sequence.get(seqIdx);
351             // We only have a match if packet lengths and directions match.
352             // The packet lengths have to be in the range of [lowerBound - eps, upperBound+eps]
353             // We initialize the lower and upper bounds first
354             int epsLowerBound = lowBndPkt.length();
355             int epsUpperBound = upBndPkt.length();
356             // Do strict matching if the lower and upper bounds are the same length
357             // Do range matching with eps otherwise
358             if (epsLowerBound != epsUpperBound) {
359                 // TODO: Maybe we could do better here for the double to integer conversion?
360                 epsLowerBound = epsLowerBound - (int) mEps;
361                 epsUpperBound = epsUpperBound + (int) mEps;
362             }
363             if (epsLowerBound <= seqPkt.getOriginalLength() &&
364                     seqPkt.getOriginalLength() <= epsUpperBound &&
365                     subsequenceDirections[subseqIdx] == sequenceDirections[seqIdx]) {
366                 // A match; advance both indices to consider next packet in subsequence vs. next packet in sequence.
367                 subseqIdx++;
368                 seqIdx++;
369                 if (subseqIdx == lowerBound.size()) {
370                     // We managed to match the entire subsequence in sequence.
371                     // Return the sublist of sequence that matches subsequence.
372                     /*
373                      * TODO:
374                      * ASSUMES THE BACKING LIST (i.e., 'sequence') IS _NOT_ STRUCTURALLY MODIFIED, hence may not work
375                      * for live traces!
376                      */
377                     return Optional.of(sequence.subList(seqIdx - lowerBound.size(), seqIdx));
378                 }
379             } else {
380                 // Mismatch.
381                 if (subseqIdx > 0) {
382                     /*
383                      * If we managed to match parts of subsequence, we restart the search for subsequence in sequence at
384                      * the index of sequence where the current mismatch occurred. I.e., we must reset subseqIdx, but
385                      * leave seqIdx untouched.
386                      */
387                     subseqIdx = 0;
388                 } else {
389                     /*
390                      * First packet of subsequence didn't match packet at seqIdx of sequence, so we move forward in
391                      * sequence, i.e., we continue the search for subsequence in sequence starting at index seqIdx+1 of
392                      * sequence.
393                      */
394                     seqIdx++;
395                 }
396             }
397         }
398         return Optional.empty();
399     }
400
401     /**
402      * Given a cluster, produces a pruned version of that cluster. In the pruned version, there are no duplicate cluster
403      * members. Two cluster members are considered identical if their packets lengths and packet directions are
404      * identical. The resulting pruned cluster is unmodifiable (this applies to both the outermost list as well as the
405      * nested lists) in order to preserve its integrity when exposed to external code (e.g., through
406      * {@link #getCluster()}).
407      *
408      * @param cluster A cluster to prune.
409      * @return The resulting pruned cluster.
410      */
411     @Override
412     protected List<List<PcapPacket>> pruneCluster(List<List<PcapPacket>> cluster) {
413         List<List<PcapPacket>> prunedCluster = new ArrayList<>();
414         for (List<PcapPacket> originalClusterSeq : cluster) {
415             boolean alreadyPresent = false;
416             for (List<PcapPacket> prunedClusterSeq : prunedCluster) {
417                 Optional<List<PcapPacket>> duplicate = findSubsequenceInSequence(originalClusterSeq, prunedClusterSeq,
418                         mClusterMemberDirections, mClusterMemberDirections);
419                 if (duplicate.isPresent()) {
420                     alreadyPresent = true;
421                     break;
422                 }
423             }
424             if (!alreadyPresent) {
425                 prunedCluster.add(Collections.unmodifiableList(originalClusterSeq));
426             }
427         }
428         return Collections.unmodifiableList(prunedCluster);
429     }
430
431     /**
432      * Given a {@code List<PcapPacket>}, generate a {@code Conversation.Direction[]} such that each entry in the
433      * resulting {@code Conversation.Direction[]} specifies the direction of the {@link PcapPacket} at the corresponding
434      * index in the input list.
435      * @param packets The list of packets for which to construct a corresponding array of packet directions.
436      * @param routerWanIp The IP of the router's WAN port. This is used for determining the direction of packets when
437      *                    the traffic is captured just outside the local network (at the ISP side of the router). Set to
438      *                    {@code null} if {@code packets} stem from traffic captured within the local network.
439      * @return A {@code Conversation.Direction[]} specifying the direction of the {@link PcapPacket} at the
440      *         corresponding index in {@code packets}.
441      */
442     private static Conversation.Direction[] getPacketDirections(List<PcapPacket> packets, String routerWanIp) {
443         Conversation.Direction[] directions = new Conversation.Direction[packets.size()];
444         for (int i = 0; i < packets.size(); i++) {
445             PcapPacket pkt = packets.get(i);
446             if (getSourceIp(pkt).equals(getDestinationIp(pkt))) {
447                 // Sanity check: we shouldn't be processing loopback traffic
448                 throw new AssertionError("loopback traffic detected");
449             }
450             if (isSrcIpLocal(pkt) || getSourceIp(pkt).equals(routerWanIp)) {
451                 directions[i] = Conversation.Direction.CLIENT_TO_SERVER;
452             } else if (isDstIpLocal(pkt) || getDestinationIp(pkt).equals(routerWanIp)) {
453                 directions[i] = Conversation.Direction.SERVER_TO_CLIENT;
454             } else {
455                 //throw new IllegalArgumentException("no local IP or router WAN port IP found, can't detect direction");
456             }
457         }
458         return directions;
459     }
460
461 }