4 Script that constructs a graph in which hosts are nodes.
5 An edge between two hosts indicates that the hosts communicate.
6 Hosts are labeled and identified by their IPs.
7 The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi.
9 The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba.
11 This script is a simplification of Milad Asgari's parser_data_to_gephi.py script.
12 It serves as a baseline for future scripts that want to include more information in the graph.
24 import parser.parse_dns
# CSV files listing the smart home devices and the devices to leave out of
# the graph, and the column names used when reading them.
DEVICE_MAC_LIST = "devicelist.dat"
EXCLUSION_MAC_LIST = "exclusion.dat"
COLUMN_MAC = "MAC_address"
COLUMN_DEVICE_NAME = "device_name"

# Keys used to drill down into each packet object of the JSON input.
JSON_KEY_SOURCE = "_source"
JSON_KEY_LAYERS = "layers"

# Frame layer and the fields read from it.
JSON_KEY_FRAME = "frame"
JSON_KEY_FRAME_PROTOCOLS = "frame.protocols"
JSON_KEY_FRAME_TIME_EPOCH = "frame.time_epoch"
JSON_KEY_FRAME_LENGTH = "frame.len"

# NOTE(review): JSON_KEY_ETH, JSON_KEY_IP, JSON_KEY_UDP and JSON_KEY_TCP are
# referenced by parse_json()/place_in_graph() but had no visible definition.
# The values follow the layer names tshark uses in its JSON output - confirm.

# Ethernet layer and its source/destination MAC fields.
JSON_KEY_ETH = "eth"
JSON_KEY_ETH_SRC = "eth.src"
JSON_KEY_ETH_DST = "eth.dst"

# IP layer and its source/destination address fields.
JSON_KEY_IP = "ip"
JSON_KEY_IP_SRC = "ip.src"
JSON_KEY_IP_DST = "ip.dst"

# Transport layers; packets carrying neither are skipped by parse_json().
JSON_KEY_UDP = "udp"
JSON_KEY_TCP = "tcp"
47 # List of checked protocols
48 listchkprot = [ "arp",
def create_device_list(dev_list_file):
    """ Create a MAC-address-to-device-name dictionary from a CSV file

        Args:
            dev_list_file: CSV file path that contains list of device MAC
                addresses; the first column is read as the MAC address and
                the second column as the device name

        Returns:
            dict that maps each MAC address to its device name; when a MAC
            address appears on several rows the last row wins
    """
    with open(dev_list_file) as csvfile:
        # The field names are passed explicitly, so every row of the file
        # (including the first one) is treated as data, not as a header.
        mac_list = csv.DictReader(csvfile, (COLUMN_MAC, COLUMN_DEVICE_NAME))
        return {item[COLUMN_MAC]: item[COLUMN_DEVICE_NAME] for item in mac_list}
def traverse_and_merge_nodes(G, dev_list_file):
    """ Merge nodes that have similar properties, e.g. same protocols

        Only leaves (outer nodes) are merged, never nodes that are in the
        middle/have many neighbors. A node is a candidate when it
        (1) has exactly one neighbor, and
        (2) is not a smarthome device.
        The first candidate seen for a given (neighbor, protocols) pair is
        kept. Every later candidate with the same pair is collapsed into it:
        its name is appended to the comma-separated 'Merged' attribute of
        that first node.

        Args:
            G: a complete networkx graph
            dev_list_file: CSV file path that contains list of device MAC addresses

        Returns:
            the same graph object G, modified in place
    """
    # Smarthome devices are never merged
    dev_list = create_device_list(dev_list_file)
    # (neighbor, protocols) -> first leaf node seen with that combination
    node_to_merge = dict()
    # Iterate over a snapshot because nodes are removed while merging
    for node in list(G.nodes()):
        # Skip if the node is a smarthome device
        if node in dev_list:
            continue
        # list() so that len() and indexing work even when neighbors() is an iterator
        neighbors = list(G.neighbors(node))
        # Skip if the node has many neighbors (non-leaf) or no neighbor at all
        if len(neighbors) != 1:
            continue
        neighbor = neighbors[0]
        protocols = G[node][neighbor]['Protocol']
        neigh_proto = (neighbor, protocols)
        if neigh_proto not in node_to_merge:
            # First leaf with this combination: later ones are merged into it
            node_to_merge[neigh_proto] = node
            continue
        node_to_merge_with = node_to_merge[neigh_proto]
        # NOTE(review): the statement that actually collapses the node is not
        # visible in the reviewed copy of this function; removing the node
        # follows the docstring ("we collapse that node") - confirm.
        G.remove_node(node)
        merged_nodes = G.node[node_to_merge_with]['Merged']
        if merged_nodes == '':
            # This is the first node merged into node_to_merge_with
            merged_nodes = node
        else:
            # Put comma if there is already one or more nodes
            merged_nodes += ", " + node
        # Then attach as attribute
        G.node[node_to_merge_with]['Merged'] = merged_nodes

    return G
def _add_endpoint_node(G, mac, peer_mac, ip, is_local, device_dns_mappings,
                       dev_list, timestamp):
    """ Add the node for one end of a packet and return the node's identifier

        A local end is identified by its MAC address and labeled with its
        device name. A non-local end is identified by its hostname, looked up
        in the DNS mappings stored under the MAC address of the other end
        (peer_mac), or by its IP address when no hostname is found.
    """
    if is_local:
        G.add_node(mac, Name=dev_list[mac])
        return mac
    hostname = None
    # Check first if the peer's MAC exists in the DNS mappings
    if peer_mac in device_dns_mappings:
        hostname = device_dns_mappings[peer_mac].hostname_for_ip_at_time(ip, timestamp)
    # NOTE(review): assumes hostname_for_ip_at_time() returns None (or an
    # empty value) when it has no mapping - confirm.
    if not hostname:
        # Use IP if no hostname mapping
        hostname = ip
    # Non-smarthome devices can be merged later
    G.add_node(hostname, Merged='')
    return hostname


def place_in_graph(G, eth_src, eth_dst, device_dns_mappings, dev_list, layers,
                   edge_to_prot, edge_to_vol):
    """ Place nodes and edges on the graph

        Args:
            G: the complete graph
            eth_src: MAC address of source
            eth_dst: MAC address of destination
            device_dns_mappings: device to DNS mappings (data structure)
            dev_list: list of existing smarthome devices
            layers: layers of JSON file structure
            edge_to_prot: edge to protocols mappings, keyed by
                "<src-mac-address>-<dst-mac-address>"; updated in place
            edge_to_vol: edge to traffic volume mappings, same keys;
                updated in place

        Raises:
            KeyError: if the packet has no IP layer, or if a local end's MAC
                address is not in dev_list
    """
    frame = layers[JSON_KEY_FRAME]
    # Get timestamp of packet (router's timestamp)
    timestamp = Decimal(frame[JSON_KEY_FRAME_TIME_EPOCH])
    # Get packet length
    packet_len = Decimal(frame[JSON_KEY_FRAME_LENGTH])
    # Get the protocol and strip just the name of it: the last entry of the
    # protocol chain when it has fewer than five entries, otherwise the
    # fourth and fifth entries joined by ':'
    split_protocol = frame[JSON_KEY_FRAME_PROTOCOLS].split(':')
    if len(split_protocol) < 5:
        protocol = split_protocol[-1]
    else:
        protocol = split_protocol[3] + ":" + split_protocol[4]
    # Single-argument print(...) is valid on both Python 2 and Python 3
    print("timestamp:  %s  - new protocol added:  %s \n" % (timestamp, protocol))
    # Key to search in the dictionary is <src-mac-address>-<dst-mac_address>
    dict_key = eth_src + "-" + eth_dst
    protocols = edge_to_prot.setdefault(dict_key, set())
    protocols.add(protocol)
    # Sorted so that equal protocol sets always produce the same label;
    # traverse_and_merge_nodes() compares these labels for equality
    protocols_str = ', '.join(sorted(protocols))
    print("protocols:  %s \n" % protocols_str)
    # Check packet length and accumulate to get traffic volume
    edge_to_vol[dict_key] = edge_to_vol.get(dict_key, 0) + packet_len
    volume = str(edge_to_vol[dict_key])
    # And source and destination IPs
    ip_src = layers[JSON_KEY_IP][JSON_KEY_IP_SRC]
    ip_dst = layers[JSON_KEY_IP][JSON_KEY_IP_DST]
    # Categorize source and destination IP addresses: local vs. non-local.
    # Only addresses that start with 192.168. are local.
    src_is_local = ip_src.startswith("192.168.")
    dst_is_local = ip_dst.startswith("192.168.")
    print("ip.src = %s ip.dst = %s \n" % (ip_src, ip_dst))
    # Place nodes and edges.
    # If the source is not local, then it's inbound traffic, and hence the
    # eth_dst is the MAC of the IoT device whose DNS mappings are consulted.
    src_node = _add_endpoint_node(G, eth_src, eth_dst, ip_src, src_is_local,
                                  device_dns_mappings, dev_list, timestamp)
    # If the destination is not local, then it's outbound traffic, and hence
    # the eth_src is the MAC of the IoT device.
    dst_node = _add_endpoint_node(G, eth_dst, eth_src, ip_dst, dst_is_local,
                                  device_dns_mappings, dev_list, timestamp)
    G.add_edge(src_node, dst_node, Protocol=protocols_str, Volume=volume)
def parse_json(file_path):
    """ Parse JSON file and create graph

        The file is read twice: once by parser.parse_dns.parse_json_dns() to
        collect the devices' DNS resolutions, and once here to add a node
        pair and an edge for every UDP/TCP packet.

        Args:
            file_path: path to the JSON file

        Returns:
            the graph of device<->server and device<->device communication
    """
    # NOTE(review): imported locally because no top-of-file "import json" is
    # visible in the reviewed copy of this file - move it to the import
    # block if one exists.
    import json

    # Create a smart home device list
    dev_list = create_device_list(DEVICE_MAC_LIST)
    # Create an exclusion list
    exc_list = create_device_list(EXCLUSION_MAC_LIST)
    # First parse the file once, constructing a map that contains information
    # about individual devices' DNS resolutions.
    device_dns_mappings = parser.parse_dns.parse_json_dns(file_path)
    # NOTE(review): the statement that creates the graph is not visible in
    # the reviewed copy; a directed graph is assumed because edges are
    # tracked per <src>-<dst> direction - confirm.
    G = nx.DiGraph()
    # Mapping from edge to a set of protocols
    edge_to_prot = dict()
    # Mapping from edge to traffic volume
    edge_to_vol = dict()
    # Parse file again, this time constructing a graph of device<->server and
    # device<->device communication.
    with open(file_path) as jf:
        # data becomes reference to root JSON object (in our case a JSON array)
        data = json.load(jf)
    # Loop through json objects (packets) in data
    for p in data:
        # p is a JSON object, not an index - drill down to object containing
        # data from the different layers
        layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS]
        # NOTE(review): the body of the listchkprot check is not visible in
        # the reviewed copy; presumably packets that carry any of the listed
        # protocols are skipped - confirm.
        if any(prot in layers for prot in listchkprot):
            continue
        # Skip any non udp/non tcp traffic
        if JSON_KEY_UDP not in layers and JSON_KEY_TCP not in layers:
            continue
        # Fetch source and destination MACs
        eth = layers.get(JSON_KEY_ETH, None)
        if eth is None:
            print("[ WARNING: eth data not found ]")
            continue
        eth_src = eth.get(JSON_KEY_ETH_SRC, None)
        eth_dst = eth.get(JSON_KEY_ETH_DST, None)
        # place_in_graph() concatenates both MACs, so neither may be missing
        if eth_src is None or eth_dst is None:
            print("[ WARNING: eth data not found ]")
            continue
        # Exclude devices in the exclusion list
        if eth_src in exc_list:
            print("[ WARNING: Source  %s  is excluded from graph! ]" % eth_src)
            continue
        if eth_dst in exc_list:
            print("[ WARNING: Destination  %s  is excluded from graph! ]" % eth_dst)
            continue
        # Place nodes and edges in graph
        place_in_graph(G, eth_src, eth_dst, device_dns_mappings, dev_list, layers,
                       edge_to_prot, edge_to_vol)

    return G
288 # ------------------------------------------------------
289 # Not currently used.
290 # Might be useful later on if we wish to resolve IPs.
def get_domain(host):
    """ Return the domain of a host together with its public suffix

        Args:
            host: the host to analyze; it is converted with str() first

        Returns:
            "<domain>.<suffix>" as extracted by tldextract, or just the
            domain part when tldextract reports no suffix
    """
    ext_result = tldextract.extract(str(host))
    # Be consistent with ReCon and keep suffix. Empty parts are left out so
    # that a host without a suffix does not end in a dangling '.'.
    domain = ".".join(part for part in (ext_result.domain, ext_result.suffix) if part)
    return domain
299 socket.inet_aton(addr)
303 # ------------------------------------------------------
if __name__ == '__main__':
    # Both the input and the output path are required
    if len(sys.argv) < 3:
        # Single-argument print(...) is valid on both Python 2 and Python 3
        print("Usage: %s input_file output_file" % sys.argv[0])
        print("output_file should end in .gexf")
        # NOTE(review): the exit call is not visible in the reviewed copy of
        # this block; a non-zero status is used to signal the usage error.
        sys.exit(1)
    # Input file: Path to JSON file generated from tshark JSON output using
    # Anastasia's script (extract_from_tshark.py).
    input_file = sys.argv[1]
    print("[ input_file = %s ]" % input_file)
    # Output file: Path to file where the Gephi XML should be written.
    output_file = sys.argv[2]
    print("[ output_file = %s ]" % output_file)
    # Construct graph from JSON
    G = parse_json(input_file)
    # Contract nodes that have the same properties, i.e. same protocols
    G = traverse_and_merge_nodes(G, DEVICE_MAC_LIST)
    # Write Graph in Graph Exchange XML format
    nx.write_gexf(G, output_file)