4 Script that constructs a graph in which hosts are nodes.
5 An edge between two hosts indicates that the hosts communicate.
6 Hosts are labeled and identified by their IPs.
7 The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi.
9 The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba.
11 This script is a simplification of Milad Asgari's parser_data_to_gephi.py script.
12 It serves as a baseline for future scripts that want to include more information in the graph.
24 import parser.parse_dns
27 DEVICE_MAC_LIST = "devicelist.dat"
28 EXCLUSION_MAC_LIST = "exclusion.dat"
29 COLUMN_MAC = "MAC_address"
30 COLUMN_DEVICE_NAME = "device_name"
32 JSON_KEY_SOURCE = "_source"
33 JSON_KEY_LAYERS = "layers"
34 JSON_KEY_FRAME = "frame"
35 JSON_KEY_FRAME_PROTOCOLS = "frame.protocols"
36 JSON_KEY_FRAME_TIME_EPOCH = "frame.time_epoch"
38 JSON_KEY_ETH_SRC = "eth.src"
39 JSON_KEY_ETH_DST = "eth.dst"
41 JSON_KEY_IP_SRC = "ip.src"
42 JSON_KEY_IP_DST = "ip.dst"
46 # List of checked protocols
47 listchkprot = [ "arp",
def create_device_list(dev_list_file):
    """ Create a MAC-address-to-device-name dictionary from a CSV file

        Every row is read positionally: the first column is stored under
        COLUMN_MAC and the second under COLUMN_DEVICE_NAME. Because the field
        names are supplied explicitly, the first row of the file is treated
        as data, not as a header.

        Args:
            dev_list_file: CSV file path that contains list of device MAC addresses
        Returns:
            dict that maps MAC address to device name; if a MAC address is
            listed more than once, the last row wins
    """
    dev_list = dict()
    # Open the device MAC list file
    with open(dev_list_file) as csvfile:
        mac_list = csv.DictReader(csvfile, (COLUMN_MAC, COLUMN_DEVICE_NAME))
        # Create key-value dictionary directly from the rows
        for item in mac_list:
            dev_list[item[COLUMN_MAC]] = item[COLUMN_DEVICE_NAME]
    return dev_list
def traverse_and_merge_nodes(G, dev_list_file):
    """ Merge nodes that have similar properties, e.g. same protocols
        But, we only do this for leaves (outer nodes), and not for
        nodes that are in the middle/have many neighbors.
        The pre-condition is that the node:
        (1) only has one neighbor, and
        (2) not a smarthome device.
        then we compare the edges, whether they use the same protocols
        or not. If yes, then we collapse that node and we attach
        it to the very first node that uses that set of protocols.
        The names of the collapsed nodes are accumulated, comma-separated,
        in the 'Merged' attribute of that first node.

        Args:
            G: a complete networkx graph
            dev_list_file: CSV file path that contains list of device MAC addresses
        Returns:
            G (the same graph object, modified in place)
    """
    # Maps "<neighbor>-<protocols>" to the first leaf seen with that key
    node_to_merge = dict()
    # Create list of smarthome devices
    dev_list = create_device_list(dev_list_file)
    # Iterate over a snapshot, since nodes are removed during the traversal
    for node in list(G.nodes()):
        # Skip if the node is a smarthome device
        if node in dev_list:
            continue
        # Materialize the neighbors so that len() and indexing work whether
        # the graph returns a list or an iterator
        neighbors = list(G.neighbors(node))
        # Skip if the node has many neighbors (non-leaf) or no neighbor at all
        if len(neighbors) != 1:
            continue
        neighbor = neighbors[0]
        protocols = G[node][neighbor]['Protocol']
        # Store neighbor-protocol as key in dictionary
        neigh_proto = neighbor + "-" + protocols
        if neigh_proto not in node_to_merge:
            # First leaf with this neighbor/protocol pair: later ones merge into it
            node_to_merge[neigh_proto] = node
            continue
        # Merge this node if there is already an entry
        # NOTE(review): the statements directly after the comment above are
        # missing from the reviewed listing; removing the collapsed leaf is
        # reconstructed from the docstring ("we collapse that node") - confirm
        # against the complete file.
        G.remove_node(node)
        node_to_merge_with = node_to_merge[neigh_proto]
        merged_nodes = G.node[node_to_merge_with].get('Merged', '')
        # Check if this is the first node
        if merged_nodes == '':
            merged_nodes = node
        else:
            # Put comma if there is already one or more nodes
            merged_nodes += ", " + node
        # Then attach as attribute
        G.node[node_to_merge_with]['Merged'] = merged_nodes

    return G
135 def parse_json(file_path):
136 """ Parse JSON file and create graph
138 file_path: path to the JSON file
140 # Create a smart home device list
141 dev_list = create_device_list(DEVICE_MAC_LIST)
142 # Create an exclusion list
143 exc_list = create_device_list(EXCLUSION_MAC_LIST)
145 # First parse the file once, constructing a map that contains information about individual devices' DNS resolutions.
146 device_dns_mappings = parser.parse_dns.parse_json_dns(file_path) # "./json/eth1.dump.json"
150 # Mapping from node to a set of protocols
151 edge_to_prot = dict()
153 # Parse file again, this time constructing a graph of device<->server and device<->device communication.
154 with open(file_path) as jf:
156 # data becomes reference to root JSON object (or in our case json array)
159 # Loop through json objects (packets) in data
161 # p is a JSON object, not an index
162 # Drill down to object containing data from the different layers
163 layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS]
166 for prot in listchkprot:
172 # Skip any non udp/non tcp traffic
173 if JSON_KEY_UDP not in layers and JSON_KEY_TCP not in layers:
176 # Fetch source and destination MACs
177 eth = layers.get(JSON_KEY_ETH, None)
179 print "[ WARNING: eth data not found ]"
181 eth_src = eth.get(JSON_KEY_ETH_SRC, None)
182 eth_dst = eth.get(JSON_KEY_ETH_DST, None)
183 # Exclude devices in the exclusion list
184 if eth_src in exc_list:
185 print "[ WARNING: Source ", eth_src, " is excluded from graph! ]"
187 if eth_dst in exc_list:
188 print "[ WARNING: Destination ", eth_dst, " is excluded from graph! ]"
191 # Fetch timestamp of packet (router's timestamp)
192 timestamp = Decimal(layers[JSON_KEY_FRAME][JSON_KEY_FRAME_TIME_EPOCH])
193 # Get the protocol and strip just the name of it
194 long_protocol = layers[JSON_KEY_FRAME][JSON_KEY_FRAME_PROTOCOLS]
195 # Split once starting from the end of the string and get it
196 #protocol = long_protocol.rsplit(':', 1)[1]
197 split_protocol = long_protocol.split(':')
199 if len(split_protocol) < 5:
200 last_index = len(split_protocol) - 1
201 protocol = split_protocol[last_index]
203 protocol = split_protocol[3] + ":" + split_protocol[4]
204 print "timestamp: ", timestamp, " - new protocol added: ", protocol, "\n"
206 # Store protocol into the set (source)
208 # Key to search for protocol list in the dictionary is
209 # <src-mac-address>-<dst-mac_address>
210 protocol_key = eth_src + "-" + eth_dst
211 if protocol_key not in edge_to_prot:
212 edge_to_prot[protocol_key] = set()
213 protocols = edge_to_prot[protocol_key]
214 protocols.add(protocol)
215 protocols_str = ', '.join(protocols)
216 print "protocols: ", protocols_str, "\n"
217 # And source and destination IPs
218 ip_src = layers[JSON_KEY_IP][JSON_KEY_IP_SRC]
219 ip_dst = layers[JSON_KEY_IP][JSON_KEY_IP_DST]
221 # Categorize source and destination IP addresses: local vs. non-local
222 ipre = re.compile(r'\b192.168.[0-9.]+')
223 src_is_local = ipre.search(ip_src)
224 dst_is_local = ipre.search(ip_dst)
225 print "ip.src =", ip_src, "ip.dst =", ip_dst, "\n"
230 G.add_node(eth_src, Name=dev_list[eth_src])
234 # Check first if the key (eth_dst) exists in the dictionary
235 if eth_dst in device_dns_mappings:
236 # If the source is not local, then it's inbound traffic, and hence the eth_dst is the MAC of the IoT device.
237 hostname = device_dns_mappings[eth_dst].hostname_for_ip_at_time(ip_src, timestamp)
239 # Use IP if no hostname mapping
241 # Non-smarthome devices can be merged later
242 G.add_node(hostname, Merged='')
246 G.add_node(eth_dst, Name=dev_list[eth_dst])
250 # Check first if the key (eth_src) exists in the dictionary
251 if eth_src in device_dns_mappings:
252 # If the destination is not local, then it's outbound traffic, and hence the eth_src is the MAC of the IoT device.
253 hostname = device_dns_mappings[eth_src].hostname_for_ip_at_time(ip_dst, timestamp)
255 # Use IP if no hostname mapping
257 # Non-smarthome devices can be merged later
258 G.add_node(hostname, Merged='')
260 G.add_edge(src_node, dst_node, Protocol=protocols_str)
262 # Print DNS mapping for reference
263 #for mac in device_dns_mappings:
264 # ddm = device_dns_mappings[mac]
265 # ddm.print_mappings()
270 # ------------------------------------------------------
271 # Not currently used.
272 # Might be useful later on if we wish to resolve IPs.
def get_domain(host):
    """ Reduce a host name to its domain and suffix parts

        Args:
            host: host name; it is converted with str() before extraction
        Returns:
            "<domain>.<suffix>", using the domain and suffix fields of the
            tldextract result
    """
    ext_result = tldextract.extract(str(host))
    # Be consistent with ReCon and keep suffix
    domain = ext_result.domain + "." + ext_result.suffix
    return domain
281 socket.inet_aton(addr)
285 # ------------------------------------------------------
if __name__ == '__main__':
    # Script entry point: build the host graph from the JSON input, collapse
    # equivalent leaf nodes, and write the result as GEXF.
    if len(sys.argv) < 3:
        print("Usage: %s input_file output_file" % sys.argv[0])
        print("output_file should end in .gexf")
        # Without both paths there is nothing to process; stop before
        # sys.argv is indexed below
        sys.exit(1)
    # Input file: Path to JSON file generated from tshark JSON output using Anastasia's script (extract_from_tshark.py).
    input_file = sys.argv[1]
    print("[ input_file = %s ]" % input_file)
    # Output file: Path to file where the Gephi XML should be written.
    output_file = sys.argv[2]
    print("[ output_file = %s ]" % output_file)
    # Construct graph from JSON
    G = parse_json(input_file)
    # Contract nodes that have the same properties, i.e. same protocols
    G = traverse_and_merge_nodes(G, DEVICE_MAC_LIST)
    # Write Graph in Graph Exchange XML format
    nx.write_gexf(G, output_file)