c6119684cb270d192f8c1c4e6ba3135f5c3f6f2c
[pingpong.git] / Code / Projects / SmartPlugDetector / src / main / java / edu / uci / iotproject / comparison / seqalignment / SequenceExtraction.java
1 package edu.uci.iotproject.comparison.seqalignment;
2
3 import edu.uci.iotproject.Conversation;
4 import edu.uci.iotproject.analysis.TcpConversationUtils;
5
6 import java.util.Comparator;
7 import java.util.List;
8 import java.util.Map;
9 import java.util.stream.Collectors;
10
11 /**
12  * TODO add class documentation.
13  *
14  * @author Janus Varmarken
15  */
16 public class SequenceExtraction {
17
18
19     private final SequenceAlignment<Integer> mAlignmentAlg;
20
21
22     public SequenceExtraction() {
23         mAlignmentAlg = new SequenceAlignment<>(new AlignmentPricer<>((i1,i2) -> Math.abs(i1-i2), i -> 10));
24     }
25
26
27     public SequenceExtraction(SequenceAlignment<Integer> alignmentAlgorithm) {
28         mAlignmentAlg = alignmentAlgorithm;
29     }
30
31     /**
32      * Gets the {@link SequenceAlignment} used to perform the sequence extraction.
33      * @return the {@link SequenceAlignment} used to perform the sequence extraction.
34      */
35     public SequenceAlignment<Integer> getAlignmentAlgorithm() {
36         return mAlignmentAlg;
37     }
38
39     // Initial
40 //    /**
41 //     *
42 //     * @param convsForAction A set of {@link Conversation}s known to be associated with a single type of user action.
43 //     */
44 //    public void extract(List<Conversation> convsForAction) {
45 //        int maxDifference = 0;
46 //
47 //        for (int i = 0; i < convsForAction.size(); i++) {
48 //            for (int j = i+1; j < convsForAction.size(); i++) {
49 //                Integer[] sequence1 = getPacketLengthSequence(convsForAction.get(i));
50 //                Integer[] sequence2 = getPacketLengthSequence(convsForAction.get(j));
51 //                int alignmentCost = mAlignmentAlg.calculateAlignment(sequence1, sequence2);
52 //                if (alignmentCost > maxDifference) {
53 //                    maxDifference = alignmentCost;
54 //                }
55 //            }
56 //        }
57 //
58 //    }
59
60
61 //    public void extract(Map<String, List<Conversation>> hostnameToConvs) {
62 //        int maxDifference = 0;
63 //
64 //        for (int i = 0; i < convsForAction.size(); i++) {
65 //            for (int j = i+1; j < convsForAction.size(); i++) {
66 //                Integer[] sequence1 = getPacketLengthSequence(convsForAction.get(i));
67 //                Integer[] sequence2 = getPacketLengthSequence(convsForAction.get(j));
68 //                int alignmentCost = mAlignmentAlg.calculateAlignment(sequence1, sequence2);
69 //                if (alignmentCost > maxDifference) {
70 //                    maxDifference = alignmentCost;
71 //                }
72 //            }
73 //        }
74 //
75 //    }
76
77     // Building signature from entire sequence
78     public ExtractedSequence extract(List<Conversation> convsForActionForHostname) {
79         // First group conversations by packet sequences.
80         // TODO: the introduction of SYN/SYNACK, FIN/FINACK and RST as part of the sequence ID may be undesirable here
81         // as it can potentially result in sequences that are equal in terms of payload packets to be considered
82         // different due to differences in how they are terminated.
83         Map<String, List<Conversation>> groupedBySequence =
84                 TcpConversationUtils.groupConversationsByPacketSequence(convsForActionForHostname, false);
85
86         // Then get a hold of one of the conversations that gave rise to the most frequent sequence.
87         Conversation mostFrequentConv = null;
88         int maxFrequency = 0;
89         for (Map.Entry<String, List<Conversation>> seqMapEntry : groupedBySequence.entrySet()) {
90             if (seqMapEntry.getValue().size() > maxFrequency) {
91                 // Found a more frequent sequence
92                 maxFrequency = seqMapEntry.getValue().size();
93                 // We just pick the first conversation as the representative conversation for this sequence type.
94                 mostFrequentConv = seqMapEntry.getValue().get(0);
95             } else if (seqMapEntry.getValue().size() == maxFrequency) {
96                 // This sequence has the same frequency as the max frequency seen so far.
97                 // Break ties by choosing the longest sequence.
98                 // First get an arbitrary representative of currently examined sequence; we just pick the first.
99                 Conversation c = seqMapEntry.getValue().get(0);
100                 mostFrequentConv = c.getPackets().size() > mostFrequentConv.getPackets().size() ? c : mostFrequentConv;
101             }
102         }
103         // Now find the maximum cost of aligning the most frequent (or, alternatively longest) conversation with the
104         // each of the rest of the conversations also associated with this action and hostname.
105         int maxCost = 0;
106         final Integer[] mostFrequentConvSeq = TcpConversationUtils.getPacketLengthSequence(mostFrequentConv);
107         for (Conversation c : convsForActionForHostname) {
108             if (c == mostFrequentConv) {
109                 // Don't compute distance to self.
110                 continue;
111             }
112             Integer[] cSeq = TcpConversationUtils.getPacketLengthSequence(c);
113             int alignmentCost = mAlignmentAlg.calculateAlignment(mostFrequentConvSeq, cSeq);
114             if (alignmentCost > maxCost) {
115                 maxCost = alignmentCost;
116             }
117         }
118         return new ExtractedSequence(mostFrequentConv, maxCost, false);
119     }
120
121     // Building signature from only TLS Application Data packets
122     public ExtractedSequence extractByTlsAppData(List<Conversation> convsForActionForHostname) {
123         // TODO: temporary hack to avoid 97-only conversations for dlink plug. We need some preprocessing/data cleaning.
124         convsForActionForHostname = convsForActionForHostname.stream().filter(c -> c.getTlsApplicationDataPackets().size() > 1).collect(Collectors.toList());
125
126         Map<String, List<Conversation>> groupedByTlsAppDataSequence =
127                 TcpConversationUtils.groupConversationsByTlsApplicationDataPacketSequence(convsForActionForHostname);
128         // Get a Conversation representing the most frequent TLS application data sequence.
129         Conversation mostFrequentConv = groupedByTlsAppDataSequence.values().stream().max((l1, l2) -> {
130             // The frequency of a conversation with a specific packet sequence is the list size as that represents how
131             // many conversations exhibit that packet sequence.
132             // Hence, the difference between the list sizes can be used directly as the return value of the Comparator.
133             // Note: we break ties by choosing the one with the most TLS application data packets (i.e., the longest
134             // sequence) in case the frequencies are equal.
135             int diff = l1.size() - l2.size();
136             return diff != 0 ? diff : l1.get(0).getTlsApplicationDataPackets().size() - l2.get(0).getTlsApplicationDataPackets().size();
137         }).get().get(0); // Just pick the first as a representative of the most frequent sequence.
138         // Lengths of TLS Application Data packets in the most frequent (or most frequent and longest) conversation.
139         Integer[] mostFreqSeq = TcpConversationUtils.getPacketLengthSequenceTlsAppDataOnly(mostFrequentConv);
140         // Now find the maximum cost of aligning the most frequent (or, alternatively longest) conversation with the
141         // each of the rest of the conversations also associated with this action and hostname.
142         int maxCost = 0;
143         for (Conversation c : convsForActionForHostname) {
144             if (c == mostFrequentConv) continue;
145             int cost = mAlignmentAlg.calculateAlignment(mostFreqSeq, TcpConversationUtils.getPacketLengthSequenceTlsAppDataOnly(c));
146             maxCost = cost > maxCost ? cost : maxCost;
147         }
148         return new ExtractedSequence(mostFrequentConv, maxCost, true);
149         // Now find the maximum cost of aligning the most frequent (or, alternatively longest) conversation with the
150         // each of the rest of the conversations also associated with this action and hostname.
151     }
152
153 }