4 Script that constructs a graph in which hosts are nodes.
5 An edge between two hosts indicates that the hosts communicate.
6 Hosts are labeled and identified by their IPs.
7 The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi.
9 The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba.
11 This script is a simplification of Milad Asgari's parser_data_to_gephi.py script.
12 It serves as a baseline for future scripts that want to include more information in the graph.
23 import parser.parse_dns
# Path of the CSV file that parse_json opens to read the MAC address / device name pairs.
25 DEVICE_MAC_LIST = "devicelist.dat"
# Field names handed to csv.DictReader for the two columns of that file.
26 COLUMN_MAC = "MAC_address"
27 COLUMN_DEVICE_NAME = "device_name"
# Keys used to read the Ethernet source and destination addresses from each packet entry of the input JSON.
29 JSON_KEY_ETH_SRC = "eth.src"
30 JSON_KEY_ETH_DST = "eth.dst"
32 def parse_json(file_path):
# Build the host communication graph from the packet JSON at `file_path`.
#
# Steps visible in this block:
#   1. Read DEVICE_MAC_LIST as CSV and fill `devlist`, mapping each MAC address to its device name.
#   2. Load the per-device DNS mappings from "./json/dns.json" via parser.parse_dns.parse_json_dns.
#   3. For each packet entry `data[k]`, read its timestamp and Ethernet source/destination,
#      check which of the two MACs has a DNS mapping, resolve the remote IP to a hostname and
#      add an edge between the device node and the hostname node.
#
# Parameters:
#   file_path -- path of the JSON file to open and process.
#
# NOTE(review): the statements that create `crudelist`, `devlist`, `data`, `G` and `iot_device`,
# the loop header over the packets, and the return statement are not visible in this block --
# confirm against the complete file. The __main__ block passes the result to nx.write_gexf,
# so presumably the graph `G` is returned.
34 # Open the device MAC list file
35 with open(DEVICE_MAC_LIST) as csvfile:
36 maclist = csv.DictReader(csvfile, (COLUMN_MAC, COLUMN_DEVICE_NAME))
39 crudelist.append(item)
41 # Create key-value dictionary
43 for item in crudelist:
44 devlist[item[COLUMN_MAC]] = item[COLUMN_DEVICE_NAME]
45 #print item["MAC_address"] + " => " + item["device_name"]
46 #for key, value in devlist.iteritems():
47 # print key + " => " + value
# NOTE(review): the DNS mapping path is hard-coded relative to the working directory.
49 device_dns_mappings = parser.parse_dns.parse_json_dns("./json/dns.json")
53 with open(file_path) as jf:
55 # data becomes reference to root JSON object (or in our case json array)
57 # Loop through json objects in data
59 # Fetch timestamp of packet
60 packet_timestamp = Decimal(data[k]["ts"])
61 # Fetch eth source and destination info
62 eth_src = data[k][JSON_KEY_ETH_SRC]
63 eth_dst = data[k][JSON_KEY_ETH_DST]
64 # Traffic can be both outbound and inbound.
65 # Determine which one of the two by looking up device MAC in DNS map.
# Node labels have the form "<MAC>-<device name>". Indexing devlist with a MAC that is not in
# the device list raises KeyError (assuming devlist is a plain dict -- TODO confirm).
67 src = eth_src + "-" + devlist[eth_src]
68 dst = eth_dst + "-" + devlist[eth_dst]
69 if eth_src in device_dns_mappings:
71 elif eth_dst in device_dns_mappings:
74 # print "[ WARNING: DNS mapping not found for device with MAC", eth_src, "OR", eth_dst, "]"
75 # This must be local communication between two IoT devices OR an IoT device talking to a hardcoded IP.
76 # For now let's assume local communication.
77 # Add a node for each device and an edge between them.
81 # TODO add regex check on src+dst IP to figure out if hardcoded server IP (e.g. check if one of the two are NOT a 192.168.x.y IP)
83 # It is outbound traffic if iot_device matches src, otherwise it must be inbound traffic.
84 outbound_traffic = iot_device == eth_src
86 ''' Graph construction '''
87 # No need to check if the Nodes and/or Edges we add already exist:
88 # NetworkX won't add already existing nodes/edges (except in the case of a MultiGraph or MultiDiGraph (see NetworkX doc)).
90 # Add a node for each host.
91 # First add node for IoT device.
92 device = iot_device + "-" + devlist[iot_device]
94 # Then add node for the server.
95 # For this we need to distinguish between outbound and inbound traffic so that we look up the proper IP in our DNS map.
96 # For outbound traffic, the server's IP is the destination IP.
97 # For inbound traffic, the server's IP is the source IP.
98 server_ip = data[k]["dst_ip"] if outbound_traffic else data[k]["src_ip"]
# hostname_for_ip_at_time is defined outside this block; presumably it returns the hostname that
# server_ip resolved to at packet_timestamp -- verify in parser.parse_dns.
99 hostname = device_dns_mappings[iot_device].hostname_for_ip_at_time(server_ip, packet_timestamp)
101 # TODO this can occur when two local devices communicate OR if IoT device has hardcoded server IP.
102 # However, we only get here for the DNS that have not performed any DNS lookups
103 # We should use a regex check early in the loop to see if it is two local devices communicating.
104 # This way we would not have to consider these corner cases later on.
105 # print "[ WARNING: no ip-hostname mapping found for ip", server_ip, " -- adding eth.src->eth.dst edge, but note that this may be incorrect if IoT device has hardcoded server IP ]"
111 # Connect the two nodes we just added.
# NOTE(review): the two add_edge calls below use opposite directions; presumably one is taken
# for outbound traffic and the other for inbound -- the selecting condition is not visible here.
113 G.add_edge(device, hostname)
115 G.add_edge(hostname, device)
118 # ------------------------------------------------------
119 # Not currently used.
120 # Might be useful later on if we wish to resolve IPs.
121 def get_domain(host):
# Build the string "<domain>.<suffix>" from the domain and suffix fields of
# tldextract.extract(str(host)). `host` is passed through str() before extraction.
# NOTE(review): no return statement is visible in this block -- confirm that the computed
# `domain` is returned to the caller.
122 ext_result = tldextract.extract(str(host))
123 # Be consistent with ReCon and keep suffix
124 domain = ext_result.domain + "." + ext_result.suffix
129 socket.inet_aton(addr)
133 # ------------------------------------------------------
if __name__ == '__main__':
    # Command-line entry point: build the host communication graph from the
    # JSON input file and write it to the output file in GEXF format.
    if len(sys.argv) < 3:
        # Both positional arguments are required. Stop here so that the
        # sys.argv lookups below can never raise IndexError.
        print("Usage: %s input_file output_file" % sys.argv[0])
        print("output_file should end in .gexf")
        sys.exit(1)
    # Input file: Path to JSON file generated from tshark JSON output using
    # Anastasia's script (extract_from_tshark.py).
    input_file = sys.argv[1]
    print("[ input_file = %s ]" % input_file)
    # Output file: Path to file where the Gephi XML should be written.
    output_file = sys.argv[2]
    print("[ output_file = %s ]" % output_file)
    # Construct graph from JSON
    G = parse_json(input_file)
    # Write Graph in Graph Exchange XML format
    nx.write_gexf(G, output_file)