Checking in a new script that allows specifying device identification to run experime...
[pingpong.git] / Code / Projects / PacketLevelSignatureExtractor / src / main / java / edu / uci / iotproject / SignatureGenerator.java
1 package edu.uci.iotproject;
2
3 import static edu.uci.iotproject.analysis.UserAction.Type;
4
5 import edu.uci.iotproject.analysis.*;
6 import edu.uci.iotproject.io.PrintWriterUtils;
7 import edu.uci.iotproject.io.TriggerTimesFileReader;
8 import edu.uci.iotproject.trafficreassembly.layer3.Conversation;
9 import edu.uci.iotproject.trafficreassembly.layer3.TcpReassembler;
10 import edu.uci.iotproject.util.PcapPacketUtils;
11 import edu.uci.iotproject.util.PrintUtils;
12 import org.apache.commons.math3.stat.clustering.Cluster;
13 import org.apache.commons.math3.stat.clustering.DBSCANClusterer;
14 import org.pcap4j.core.*;
15 import org.pcap4j.packet.namednumber.DataLinkType;
16
17 import java.io.*;
18 import java.net.UnknownHostException;
19 import java.time.Duration;
20 import java.time.Instant;
21 import java.util.*;
22 import java.util.concurrent.TimeoutException;
23 import java.util.stream.Collectors;
24 import java.util.stream.Stream;
25
26 /**
27  * This is a system that reads PCAP files to compare
28  * patterns of DNS hostnames, packet sequences, and packet
29  * lengths with training data to determine certain events
30  * or actions for smart home devices.
31  *
32  * @author Janus Varmarken
33  * @author Rahmadi Trimananda (rtrimana@uci.edu)
34  * @version 0.1
35  */
36 public class SignatureGenerator {
37
38     /**
39      * If set to {@code true}, output written to the results file is also dumped to standard out.
40      */
41     private static boolean DUPLICATE_OUTPUT_TO_STD_OUT = true;
42     /**
43      * File name for logging.
44      */
45     private static String LOG_EXTENSION = "_signature-generation.log";
46     /**
47      * Directory for logging.
48      */
49     private static String LOG_DIRECTORY = "./";
50
51     public static void main(String[] args) throws PcapNativeException, NotOpenException, EOFException,
52             TimeoutException, UnknownHostException, IOException {
53         // -------------------------------------------------------------------------------------------------------------
54         // ------------ # Code for extracting traffic generated by a device within x seconds of a trigger # ------------
55         if (args.length < 11) {
56             String errMsg = String.format("SPECTO version 1.0\n" +
57                             "Copyright (C) 2018-2019 Janus Varmarken and Rahmadi Trimananda.\n" +
58                             "University of California, Irvine.\n" +
59                             "All rights reserved.\n\n" +
60                             "Usage: %s inputPcapFile outputPcapFile triggerTimesFile deviceIp" +
61                             " onSignatureFile offSignatureFile onClusterAnalysisFile offClusterAnalysisFile epsilon" +
62                             " deletedSequencesOn deletedSequencesOff" +
63                             "\n  inputPcapFile: the target of the detection" +
64                             "\n  outputPcapFile: the processed PCAP file through 15-second window filtering" +
65                             "\n  triggerTimesFile: the trigger timestamps" +
66                             "\n  deviceIp: the IP address of the device we want to generate a signature for" +
67                             "\n  onSignatureFile: name of the ON signature file" +
68                             "\n  offSignatureFile: name of the OFF signature file" +
69                             "\n  onClusterAnalysisFile: name of the ON signature cluster analysis file" +
70                             "\n  offClusterAnalysisFile: name of the OFF signature cluster analysis file" +
71                             "\n  epsilon: epsilon value of the DBSCAN algorithm" +
72                             "\n  deletedSequencesOn: sequences to be deleted from the final ON signature" +
73                             " (please separate with commas, e.g., 0,1,2, or put '-1' if not needed)" +
74                             "\n  deletedSequencesOff: sequences to be deleted from the final OFF signature" +
75                             " (please separate with commas, e.g., 0,1,2, or put '-1' if not needed)",
76                     SignatureGenerator.class.getSimpleName());
77             System.out.println(errMsg);
78             return;
79         }
80         boolean verbose = true;
81         final String inputPcapFile = args[0];
82         final String outputPcapFile = args[1];
83         final String triggerTimesFile = args[2];
84         final String deviceIp = args[3];
85         final String onSignatureFile = args[4];
86         final String offSignatureFile = args[5];
87         final String onClusterAnalysisFile = args[6];
88         final String offClusterAnalysisFile = args[7];
89         final double eps = Double.parseDouble(args[8]);
90         final String deletedSequencesOn = args[9];
91         final String deletedSequencesOff = args[10];
92         final String logFile = inputPcapFile + LOG_EXTENSION;
93
94         // Prepare file outputter.
95         File outputFile = new File(logFile);
96         outputFile.getParentFile().mkdirs();
97         final PrintWriter resultsWriter = new PrintWriter(new FileWriter(outputFile));
98
99         // =========================================== TRAFFIC FILTERING ============================================
100
101         TriggerTimesFileReader ttfr = new TriggerTimesFileReader();
102         List<Instant> triggerTimes = ttfr.readTriggerTimes(triggerTimesFile, false);
103         // Tag each trigger with "ON" or "OFF", assuming that the first trigger is an "ON" and that they alternate.
104         List<UserAction> userActions = new ArrayList<>();
105         for (int i = 0; i < triggerTimes.size(); i++) {
106             userActions.add(new UserAction(i % 2 == 0 ? Type.TOGGLE_ON : Type.TOGGLE_OFF, triggerTimes.get(i)));
107         }
108         TriggerTrafficExtractor tte = new TriggerTrafficExtractor(inputPcapFile, triggerTimes, deviceIp);
109         final PcapDumper outputter = Pcaps.openDead(DataLinkType.EN10MB, 65536).dumpOpen(outputPcapFile);
110         DnsMap dnsMap = new DnsMap();
111         TcpReassembler tcpReassembler = new TcpReassembler();
112         TrafficLabeler trafficLabeler = new TrafficLabeler(userActions);
113         tte.performExtraction(pkt -> {
114             try {
115                 outputter.dump(pkt);
116             } catch (NotOpenException e) {
117                 e.printStackTrace();
118             }
119         }, dnsMap, tcpReassembler, trafficLabeler);
120         outputter.flush();
121         outputter.close();
122
123         if (tte.getPacketsIncludedCount() != trafficLabeler.getTotalPacketCount()) {
124             // Sanity/debug check
125             throw new AssertionError(String.format("mismatch between packet count in %s and %s",
126                     TriggerTrafficExtractor.class.getSimpleName(), TrafficLabeler.class.getSimpleName()));
127         }
128
129         // Extract all conversations present in the filtered trace.
130         List<Conversation> allConversations = tcpReassembler.getTcpConversations();
131         // Group conversations by hostname.
132         Map<String, List<Conversation>> convsByHostname =
133                 TcpConversationUtils.groupConversationsByHostname(allConversations, dnsMap);
134         PrintWriterUtils.println("Grouped conversations by hostname.", resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
135         // For each hostname, count the frequencies of packet lengths exchanged with that hostname.
136         final Map<String, Map<Integer, Integer>> pktLenFreqsByHostname = new HashMap<>();
137         convsByHostname.forEach((host, convs) -> pktLenFreqsByHostname.put(host,
138                 TcpConversationUtils.countPacketLengthFrequencies(convs)));
139         PrintWriterUtils.println("Counted frequencies of packet lengths exchanged with each hostname.",
140                 resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
141         // For each hostname, count the frequencies of packet sequences (i.e., count how many
142         // conversations exchange a sequence of packets of some specific lengths).
143         final Map<String, Map<String, Integer>> pktSeqFreqsByHostname = new HashMap<>();
144         convsByHostname.forEach((host, convs) -> pktSeqFreqsByHostname.put(host,
145                 TcpConversationUtils.countPacketSequenceFrequencies(convs)));
146         PrintWriterUtils.println("Counted frequencies of packet sequences exchanged with each hostname.",
147                 resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
148         // For each hostname, count frequencies of packet pairs exchanged
149         // with that hostname across all conversations
150         final Map<String, Map<String, Integer>> pktPairFreqsByHostname =
151                 TcpConversationUtils.countPacketPairFrequenciesByHostname(allConversations, dnsMap);
152         PrintWriterUtils.println("Counted frequencies of packet pairs per hostname.",
153                 resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
154         // For each user action, reassemble the set of TCP connections occurring shortly after
155         final Map<UserAction, List<Conversation>> userActionToConversations =
156                 trafficLabeler.getLabeledReassembledTcpTraffic();
157         final Map<UserAction, Map<String, List<Conversation>>> userActionsToConvsByHostname =
158                 trafficLabeler.getLabeledReassembledTcpTraffic(dnsMap);
159         PrintWriterUtils.println("Reassembled TCP conversations occurring shortly after each user event.",
160                 resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
161
162         /*
163          * NOTE: no need to generate these more complex on/off maps that also contain mappings from hostname and
164          * sequence identifiers as we do not care about hostnames and sequences during clustering.
165          * We can simply use the UserAction->List<Conversation> map to generate ON/OFF groupings of conversations.
166          */
167         // Contains all ON events: hostname -> sequence identifier -> list of conversations with that sequence
168         Map<String, Map<String, List<Conversation>>> ons = new HashMap<>();
169         // Contains all OFF events: hostname -> sequence identifier -> list of conversations with that sequence
170         Map<String, Map<String, List<Conversation>>> offs = new HashMap<>();
171         userActionsToConvsByHostname.forEach((ua, hostnameToConvs) -> {
172             Map<String, Map<String, List<Conversation>>> outer = ua.getType() == Type.TOGGLE_ON ? ons : offs;
173             hostnameToConvs.forEach((host, convs) -> {
174                 Map<String, List<Conversation>> seqsToConvs = TcpConversationUtils.
175                         groupConversationsByPacketSequence(convs, verbose);
176                 outer.merge(host, seqsToConvs, (oldMap, newMap) -> {
177                     newMap.forEach((sequence, cs) -> oldMap.merge(sequence, cs, (list1, list2) -> {
178                         list1.addAll(list2);
179                         return list1;
180                     }));
181                     return oldMap;
182                 });
183             });
184         });
185
186         // ============================================== PAIR CLUSTERING ============================================
187         // TODO: No need to use the more convoluted on/off maps; Can simply use the UserAction->List<Conversation> map
188         // TODO: when don't care about hostnames and sequences (see comment earlier).
189         // ===========================================================================================================
190         List<Conversation> onConversations = userActionToConversations.entrySet().stream().
191                 filter(e -> e.getKey().getType() == Type.TOGGLE_ON). // drop all OFF events from stream
192                 map(e -> e.getValue()). // no longer interested in the UserActions
193                 flatMap(List::stream). // flatten List<List<T>> to a List<T>
194                 collect(Collectors.toList());
195         List<Conversation> offConversations = userActionToConversations.entrySet().stream().
196                 filter(e -> e.getKey().getType() == Type.TOGGLE_OFF).
197                 map(e -> e.getValue()).
198                 flatMap(List::stream).
199                 collect(Collectors.toList());
200         //Collections.sort(onConversations, (c1, c2) -> c1.getPackets().)
201
202         List<PcapPacketPair> onPairs = onConversations.stream().
203                 map(c -> c.isTls() ? TcpConversationUtils.extractTlsAppDataPacketPairs(c) :
204                         TcpConversationUtils.extractPacketPairs(c)).
205                 flatMap(List::stream). // flatten List<List<>> to List<>
206                 collect(Collectors.toList());
207         List<PcapPacketPair> offPairs = offConversations.stream().
208                 map(c -> c.isTls() ? TcpConversationUtils.extractTlsAppDataPacketPairs(c) :
209                         TcpConversationUtils.extractPacketPairs(c)).
210                 flatMap(List::stream). // flatten List<List<>> to List<>
211                 collect(Collectors.toList());
212         // Note: need to update the DnsMap of all PcapPacketPairs if we want to use the IP/hostname-sensitive distance.
213         Stream.concat(Stream.of(onPairs), Stream.of(offPairs)).flatMap(List::stream).forEach(p -> p.setDnsMap(dnsMap));
214         // Perform clustering on conversation logged as part of all ON events.
215         // Calculate number of events per type (only ON/only OFF), which means half of the number of all timestamps.
216         int numberOfEventsPerType = triggerTimes.size() / 2;
217         int lowerBound = numberOfEventsPerType - (int)(numberOfEventsPerType * 0.1);
218         int upperBound = numberOfEventsPerType + (int)(numberOfEventsPerType * 0.1);
219         int minPts = lowerBound;
220         DBSCANClusterer<PcapPacketPair> onClusterer = new DBSCANClusterer<>(eps, minPts);
221         List<Cluster<PcapPacketPair>> onClusters = onClusterer.cluster(onPairs);
222         // Perform clustering on conversation logged as part of all OFF events.
223         DBSCANClusterer<PcapPacketPair> offClusterer = new DBSCANClusterer<>(eps, minPts);
224         List<Cluster<PcapPacketPair>> offClusters = offClusterer.cluster(offPairs);
225         // Sort the conversations as reference
226         List<Conversation> sortedAllConversation = TcpConversationUtils.sortConversationList(allConversations);
227         // Output clusters
228         PrintWriterUtils.println("========================================", resultsWriter,
229                 DUPLICATE_OUTPUT_TO_STD_OUT);
230         PrintWriterUtils.println("       Clustering results for ON        ", resultsWriter,
231                 DUPLICATE_OUTPUT_TO_STD_OUT);
232         PrintWriterUtils.println("       Number of clusters: " + onClusters.size(), resultsWriter,
233                 DUPLICATE_OUTPUT_TO_STD_OUT);
234         int count = 0;
235         List<List<List<PcapPacket>>> ppListOfListReadOn = new ArrayList<>();
236         List<List<List<PcapPacket>>> ppListOfListListOn = new ArrayList<>();
237         List<List<List<PcapPacket>>> corePointRangeSignatureOn = new ArrayList<>();
238         for (Cluster<PcapPacketPair> c : onClusters) {
239             PrintWriterUtils.println(String.format("<<< Cluster #%02d (%03d points) >>>", ++count, c.getPoints().size()),
240                     resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
241             PrintWriterUtils.println(PrintUtils.toSummaryString(c), resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
242             if(c.getPoints().size() > lowerBound && c.getPoints().size() < upperBound) {
243                 // Print to file
244                 List<List<PcapPacket>> ppListOfList = PcapPacketUtils.clusterToListOfPcapPackets(c);
245                 // Check for overlaps and decide whether to do range-based or conservative checking
246                 corePointRangeSignatureOn.add(PcapPacketUtils.extractRangeCorePoints(ppListOfList, eps, minPts));
247                 ppListOfListListOn.add(ppListOfList);
248             }
249         }
250         PrintWriterUtils.println("========================================", resultsWriter,
251                 DUPLICATE_OUTPUT_TO_STD_OUT);
252         PrintWriterUtils.println("       Clustering results for OFF        ", resultsWriter,
253                 DUPLICATE_OUTPUT_TO_STD_OUT);
254         PrintWriterUtils.println("       Number of clusters: " + offClusters.size(), resultsWriter,
255                 DUPLICATE_OUTPUT_TO_STD_OUT);
256         count = 0;
257         List<List<List<PcapPacket>>> ppListOfListReadOff = new ArrayList<>();
258         List<List<List<PcapPacket>>> ppListOfListListOff = new ArrayList<>();
259         List<List<List<PcapPacket>>> corePointRangeSignatureOff = new ArrayList<>();
260         for (Cluster<PcapPacketPair> c : offClusters) {
261             PrintWriterUtils.println(String.format("<<< Cluster #%03d (%06d points) >>>", ++count, c.getPoints().size()),
262                     resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
263             PrintWriterUtils.println(PrintUtils.toSummaryString(c), resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
264             if(c.getPoints().size() > lowerBound && c.getPoints().size() < upperBound) {
265                 // Print to file
266                 List<List<PcapPacket>> ppListOfList = PcapPacketUtils.clusterToListOfPcapPackets(c);
267                 // Check for overlaps and decide whether to do range-based or conservative checking
268                 corePointRangeSignatureOff.add(PcapPacketUtils.extractRangeCorePoints(ppListOfList, eps, minPts));
269                 ppListOfListListOff.add(ppListOfList);
270             }
271         }
272
273         // =========================================== SIGNATURE CREATION ===========================================
274         // Concatenate
275         ppListOfListListOn = PcapPacketUtils.concatSequences(ppListOfListListOn, sortedAllConversation);
276         // Remove sequences in the list that have overlap
277         StringTokenizer stringTokenizerOn = new StringTokenizer(deletedSequencesOn, ",");
278         while(stringTokenizerOn.hasMoreTokens()) {
279             int sequenceToDelete = Integer.parseInt(stringTokenizerOn.nextToken());
280             if (sequenceToDelete == -1) { // '-1' means there is no removal
281                 break;
282             }
283             PcapPacketUtils.removeSequenceFromSignature(ppListOfListListOn, sequenceToDelete);
284         }
285         ppListOfListListOn = PcapPacketUtils.sortSequences(ppListOfListListOn);
286
287         // Concatenate
288         ppListOfListListOff = PcapPacketUtils.concatSequences(ppListOfListListOff, sortedAllConversation);
289         // Remove sequences in the list that have overlap
290         StringTokenizer stringTokenizerOff = new StringTokenizer(deletedSequencesOff, ",");
291         while(stringTokenizerOff.hasMoreTokens()) {
292             int sequenceToDelete = Integer.parseInt(stringTokenizerOff.nextToken());
293             if (sequenceToDelete == -1) { // '-1' means there is no removal
294                 break;
295             }
296             PcapPacketUtils.removeSequenceFromSignature(ppListOfListListOff, sequenceToDelete);
297         }
298         ppListOfListListOff = PcapPacketUtils.sortSequences(ppListOfListListOff);
299         // Write the signatures into the screen
300         PrintWriterUtils.println("========================================", resultsWriter,
301                 DUPLICATE_OUTPUT_TO_STD_OUT);
302         PrintWriterUtils.println("              ON Signature              ", resultsWriter,
303                 DUPLICATE_OUTPUT_TO_STD_OUT);
304         PrintWriterUtils.println("========================================", resultsWriter,
305                 DUPLICATE_OUTPUT_TO_STD_OUT);
306         PcapPacketUtils.printSignatures(ppListOfListListOn, resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
307         PrintWriterUtils.println("========================================", resultsWriter,
308                 DUPLICATE_OUTPUT_TO_STD_OUT);
309         PrintWriterUtils.println("              OFF Signature             ", resultsWriter,
310                 DUPLICATE_OUTPUT_TO_STD_OUT);
311         PrintWriterUtils.println("========================================", resultsWriter,
312                 DUPLICATE_OUTPUT_TO_STD_OUT);
313         PcapPacketUtils.printSignatures(ppListOfListListOff, resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
314         // Printing signatures into files
315         PrintUtils.serializeIntoFile(onSignatureFile, ppListOfListListOn);
316         PrintUtils.serializeIntoFile(offSignatureFile, ppListOfListListOff);
317         // Printing cluster analyses into files
318         PrintUtils.serializeIntoFile(onClusterAnalysisFile, corePointRangeSignatureOn);
319         PrintUtils.serializeIntoFile(offClusterAnalysisFile, corePointRangeSignatureOff);
320
321         // =========================================== SIGNATURE DURATION ===========================================
322         List<Instant> firstSignatureTimestamps = new ArrayList<>();
323         List<Instant> lastSignatureTimestamps = new ArrayList<>();
324         if (!ppListOfListListOn.isEmpty()) {
325             List<List<PcapPacket>> firstListOnSign = ppListOfListListOn.get(0);
326             List<List<PcapPacket>> lastListOnSign = ppListOfListListOn.get(ppListOfListListOn.size() - 1);
327             // Load ON signature first and last packet's timestamps
328             for (List<PcapPacket> list : firstListOnSign) {
329                 // Get timestamp Instant from the last packet
330                 firstSignatureTimestamps.add(list.get(0).getTimestamp());
331             }
332             for (List<PcapPacket> list : lastListOnSign) {
333                 // Get timestamp Instant from the last packet
334                 int lastPacketIndex = list.size() - 1;
335                 lastSignatureTimestamps.add(list.get(lastPacketIndex).getTimestamp());
336             }
337         }
338
339         if (!ppListOfListListOn.isEmpty()) {
340             List<List<PcapPacket>> firstListOffSign = ppListOfListListOff.get(0);
341             List<List<PcapPacket>> lastListOffSign = ppListOfListListOff.get(ppListOfListListOff.size() - 1);
342             // Load OFF signature first and last packet's timestamps
343             for (List<PcapPacket> list : firstListOffSign) {
344                 // Get timestamp Instant from the last packet
345                 firstSignatureTimestamps.add(list.get(0).getTimestamp());
346             }
347             for (List<PcapPacket> list : lastListOffSign) {
348                 // Get timestamp Instant from the last packet
349                 int lastPacketIndex = list.size() - 1;
350                 lastSignatureTimestamps.add(list.get(lastPacketIndex).getTimestamp());
351             }
352         }
353         // Sort the timestamps
354         firstSignatureTimestamps.sort(Comparator.comparing(Instant::toEpochMilli));
355         lastSignatureTimestamps.sort(Comparator.comparing(Instant::toEpochMilli));
356
357         Iterator<Instant> iterFirst = firstSignatureTimestamps.iterator();
358         Iterator<Instant> iterLast = lastSignatureTimestamps.iterator();
359         long duration;
360         long maxDuration = Long.MIN_VALUE;
361         PrintWriterUtils.println("========================================", resultsWriter,
362                 DUPLICATE_OUTPUT_TO_STD_OUT);
363         PrintWriterUtils.println("           Signature Durations          ", resultsWriter,
364                 DUPLICATE_OUTPUT_TO_STD_OUT);
365         PrintWriterUtils.println("========================================", resultsWriter,
366                 DUPLICATE_OUTPUT_TO_STD_OUT);
367         while (iterFirst.hasNext() && iterLast.hasNext()) {
368             Instant firstInst = iterFirst.next();
369             Instant lastInst = iterLast.next();
370             Duration dur = Duration.between(firstInst, lastInst);
371             duration = dur.toMillis();
372             // Check duration --- should be below 15 seconds
373             if (duration > TriggerTrafficExtractor.INCLUSION_WINDOW_MILLIS) {
374                 while (duration > TriggerTrafficExtractor.INCLUSION_WINDOW_MILLIS && iterFirst.hasNext()) {
375                     // that means we have to move to the next trigger
376                     firstInst = iterFirst.next();
377                     dur = Duration.between(firstInst, lastInst);
378                     duration = dur.toMillis();
379                 }
380             } else { // Below 0/Negative --- that means we have to move to the next signature
381                 while (duration < 0 && iterLast.hasNext()) {
382                     // that means we have to move to the next trigger
383                     lastInst = iterLast.next();
384                     dur = Duration.between(firstInst, lastInst);
385                     duration = dur.toMillis();
386                 }
387             }
388             PrintWriterUtils.println(duration, resultsWriter, DUPLICATE_OUTPUT_TO_STD_OUT);
389             // Update duration if this bigger than the max value and still less than the window inclusion time
390             maxDuration = maxDuration < duration && duration <= TriggerTrafficExtractor.INCLUSION_WINDOW_MILLIS ?
391                     duration : maxDuration;
392         }
393         // Just assign the value 0 if there is no signature
394         if (maxDuration == Long.MIN_VALUE) {
395             maxDuration = 0;
396         }
397         PrintWriterUtils.println("========================================", resultsWriter,
398                 DUPLICATE_OUTPUT_TO_STD_OUT);
399         PrintWriterUtils.println("Max signature duration: " + maxDuration, resultsWriter,
400                 DUPLICATE_OUTPUT_TO_STD_OUT);
401         PrintWriterUtils.println("========================================", resultsWriter,
402                 DUPLICATE_OUTPUT_TO_STD_OUT);
403         resultsWriter.flush();
404         resultsWriter.close();
405         // ==========================================================================================================
406     }
407 }