4 Script that constructs a graph in which hosts are nodes.
5 An edge between two hosts indicates that the hosts communicate.
6 Hosts are labeled and identified by their IPs.
7 The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi.
9 The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba.
11 This script is a simplification of Milad Asgari's parser_data_to_gephi.py script.
12 It serves as a baseline for future scripts that want to include more information in the graph.
24 import parser.parse_dns
# CSV file listing the known local devices, and the column names applied to
# its two fields when it is read.
DEVICE_MAC_LIST = "devicelist.dat"
COLUMN_MAC = "MAC_address"
COLUMN_DEVICE_NAME = "device_name"

# Keys used to navigate a packet object in the input JSON.
JSON_KEY_SOURCE = "_source"
JSON_KEY_LAYERS = "layers"
JSON_KEY_FRAME = "frame"
JSON_KEY_FRAME_TIME_EPOCH = "frame.time_epoch"

# Ethernet layer and its source/destination MAC fields.
JSON_KEY_ETH = "eth"
JSON_KEY_ETH_SRC = "eth.src"
JSON_KEY_ETH_DST = "eth.dst"

# IPv4 layer and its source/destination address fields.
JSON_KEY_IP = "ip"
JSON_KEY_IP_SRC = "ip.src"
JSON_KEY_IP_DST = "ip.dst"

# Transport layers; packets carrying neither are ignored by parse_json.
JSON_KEY_UDP = "udp"
JSON_KEY_TCP = "tcp"

# Discovery/configuration protocols that parse_json skips.
JSON_KEY_MDNS = "mdns"
JSON_KEY_BOOTP = "bootp"
JSON_KEY_SSDP = "ssdp"
JSON_KEY_DHCPV6 = "dhcpv6"
JSON_KEY_LLMNR = "llmnr"
def _load_device_names():
    """Read DEVICE_MAC_LIST and return a dict mapping MAC address -> device name."""
    with open(DEVICE_MAC_LIST) as csvfile:
        # Field names are supplied explicitly, so every row (including the
        # first one) is treated as data.
        rows = csv.DictReader(csvfile, (COLUMN_MAC, COLUMN_DEVICE_NAME))
        return {row[COLUMN_MAC]: row[COLUMN_DEVICE_NAME] for row in rows}


def _remote_host_label(device_dns_mappings, device_mac, remote_ip, timestamp):
    """Return the hostname device_mac resolved for remote_ip, or remote_ip itself."""
    hostname = None
    # A device that never performed a DNS lookup has no entry in the map.
    if device_mac in device_dns_mappings:
        hostname = device_dns_mappings[device_mac].hostname_for_ip_at_time(remote_ip, timestamp)
    # Use IP if no hostname mapping
    return remote_ip if hostname is None else hostname


def parse_json(file_path):
    """Build a graph of communicating hosts from a JSON packet dump.

    Local hosts (IP starting with "192.168.") become nodes keyed by MAC
    address and carry a ``Name`` attribute taken from DEVICE_MAC_LIST (the MAC
    itself when the device is not listed). Remote hosts become nodes keyed by
    the hostname the local device resolved for that IP at the packet's
    timestamp, or by the IP when no such resolution is known. One edge
    source -> destination is added per processed packet.

    MDNS, LLMNR, SSDP, BOOTP and DHCPv6 packets are skipped, as are packets
    that are neither UDP nor TCP or that lack an Ethernet or IPv4 layer.

    Args:
        file_path: Path to the JSON file to parse. It is read twice: once by
            parser.parse_dns.parse_json_dns and once here.

    Returns:
        The constructed NetworkX graph.
    """
    # Imported locally so the function does not depend on a module-level
    # import of json being present.
    import json

    devlist = _load_device_names()

    # First parse the file once, constructing a map that contains information
    # about individual devices' DNS resolutions.
    device_dns_mappings = parser.parse_dns.parse_json_dns(file_path)

    # We don't care about discovery/configuration traffic, only the actual
    # communication.
    skipped_layers = (JSON_KEY_MDNS, JSON_KEY_LLMNR, JSON_KEY_SSDP,
                      JSON_KEY_BOOTP, JSON_KEY_DHCPV6)

    # NOTE(review): a directed graph is assumed because edges are added as
    # (source, destination) -- confirm against the intended Gephi view.
    G = nx.DiGraph()

    # Parse file again, this time constructing a graph of device<->server and
    # device<->device communication.
    with open(file_path) as jf:
        # data becomes reference to root JSON object (in our case a JSON array)
        data = json.load(jf)

    for p in data:
        # Drill down to object containing data from the different layers
        layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS]

        if any(key in layers for key in skipped_layers):
            continue
        # Skip any non udp/non tcp traffic
        if JSON_KEY_UDP not in layers and JSON_KEY_TCP not in layers:
            continue

        # Fetch timestamp of packet (router's timestamp)
        packet_timestamp = Decimal(layers[JSON_KEY_FRAME][JSON_KEY_FRAME_TIME_EPOCH])
        print("timestamp %s" % packet_timestamp)

        # Fetch source and destination MACs
        eth = layers.get(JSON_KEY_ETH, None)
        if eth is None:
            print("[ WARNING: eth data not found ]")
            continue
        eth_src = eth.get(JSON_KEY_ETH_SRC, None)
        eth_dst = eth.get(JSON_KEY_ETH_DST, None)

        # And source and destination IPs. TCP/UDP packets without an IPv4
        # layer cannot be classified by the prefix check below.
        ip = layers.get(JSON_KEY_IP, None)
        if ip is None:
            print("[ WARNING: ip data not found ]")
            continue
        ip_src = ip[JSON_KEY_IP_SRC]
        ip_dst = ip[JSON_KEY_IP_DST]
        print("ip.src = %s ip.dst = %s" % (ip_src, ip_dst))

        src_is_local = ip_src.startswith("192.168.")
        dst_is_local = ip_dst.startswith("192.168.")

        if src_is_local:
            src_node = eth_src
            G.add_node(src_node, Name=devlist.get(eth_src, eth_src))
        else:
            # If the source is not local, then it's inbound traffic, and hence
            # the eth_dst is the MAC of the IoT device.
            src_node = _remote_host_label(device_dns_mappings, eth_dst, ip_src, packet_timestamp)
            G.add_node(src_node)

        if dst_is_local:
            dst_node = eth_dst
            # Label the destination with its own name, not the source's.
            G.add_node(dst_node, Name=devlist.get(eth_dst, eth_dst))
        else:
            # If the destination is not local, then it's outbound traffic, and
            # hence the eth_src is the MAC of the IoT device.
            dst_node = _remote_host_label(device_dns_mappings, eth_src, ip_dst, packet_timestamp)
            G.add_node(dst_node)

        # No need to check whether nodes/edges already exist: adding an
        # existing node or edge to a (non-multi) NetworkX graph is a no-op.
        G.add_edge(src_node, dst_node)

        # NOTE: a disabled earlier variant identified the IoT device by
        # looking up eth_src/eth_dst in the DNS map instead of classifying by
        # IP prefix. Its own TODOs observed that this cannot distinguish local
        # device<->device traffic from a device talking to a hard-coded server
        # IP, and suggested checking the IPs early in the loop.

    return G
209 # ------------------------------------------------------
210 # Not currently used.
211 # Might be useful later on if we wish to resolve IPs.
def get_domain(host):
    """Return the registered domain of host in the form "<domain>.<suffix>".

    Args:
        host: Hostname (or any object whose str() is one) to reduce.

    Returns:
        The domain and public suffix reported by tldextract, joined by a dot.
    """
    ext_result = tldextract.extract(str(host))
    # Be consistent with ReCon and keep suffix
    # NOTE(review): if tldextract reports an empty suffix the result ends in a
    # trailing dot -- confirm whether that is the desired form.
    return ext_result.domain + "." + ext_result.suffix
220 socket.inet_aton(addr)
224 # ------------------------------------------------------
if __name__ == '__main__':
    # Both an input and an output path are required.
    if len(sys.argv) < 3:
        print("Usage: %s input_file output_file" % sys.argv[0])
        print("output_file should end in .gexf")
        # Stop here; without the arguments the lookups below would fail.
        sys.exit(1)
    # Input file: Path to JSON file generated from tshark JSON output using
    # Anastasia's script (extract_from_tshark.py).
    input_file = sys.argv[1]
    print("[ input_file = %s ]" % input_file)
    # Output file: Path to file where the Gephi XML should be written.
    output_file = sys.argv[2]
    print("[ output_file = %s ]" % output_file)
    # Construct graph from JSON
    G = parse_json(input_file)
    # Write Graph in Graph Exchange XML format
    nx.write_gexf(G, output_file)