"""
Script that constructs a graph in which hosts are nodes.
An edge between two hosts indicates that the hosts communicate.
Hosts are labeled and identified by their IPs.
The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi.

The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba.

This script is a simplification of Milad Asgari's parser_data_to_gephi.py script.
It serves as a baseline for future scripts that want to include more information in the graph.
"""
23 import parser.parse_dns
# Relative path of the CSV file that lists the known devices; it is opened
# by parse_json() and read with the two column names below.
DEVICE_MAC_LIST = "devicelist.dat"
# Field names applied, in this order, to every row of DEVICE_MAC_LIST.
COLUMN_MAC = "MAC_address"
COLUMN_DEVICE_NAME = "device_name"

# Keys of the Ethernet source/destination addresses in each packet's
# JSON object.
JSON_KEY_ETH_SRC = "eth.src"
JSON_KEY_ETH_DST = "eth.dst"
def _load_device_names(mac_list_path):
    """Return a dict mapping each device MAC address to its device name.

    The file is read with csv.DictReader using the explicit field names
    (COLUMN_MAC, COLUMN_DEVICE_NAME), so every row -- including the first --
    is treated as data. If a MAC occurs more than once, the last row wins.
    """
    with open(mac_list_path) as csvfile:
        reader = csv.DictReader(csvfile, (COLUMN_MAC, COLUMN_DEVICE_NAME))
        return {row[COLUMN_MAC]: row[COLUMN_DEVICE_NAME] for row in reader}


def _add_device_node(graph, devlist, mac):
    """Add (or update) the node for *mac*, labeled with that device's name."""
    # Label the node with the name of the device it actually represents.
    # Hosts that are missing from the device list are labeled with their MAC
    # rather than aborting the whole run with a KeyError.
    graph.add_node(mac, Name=devlist.get(mac, mac))


def _add_local_edge(graph, devlist, eth_src, eth_dst):
    """Add both Ethernet endpoints as nodes plus an eth_src -> eth_dst edge."""
    _add_device_node(graph, devlist, eth_src)
    _add_device_node(graph, devlist, eth_dst)
    graph.add_edge(eth_src, eth_dst)


def parse_json(file_path):
    """Build the host communication graph from a packet JSON file.

    Args:
        file_path: Path to the JSON file to read. Each entry is expected to
            provide "ts", "src_ip", "dst_ip" and the JSON_KEY_ETH_SRC /
            JSON_KEY_ETH_DST keys.

    Returns:
        The NetworkX graph. IoT devices are nodes keyed by MAC address with a
        "Name" attribute; servers are nodes keyed by the hostname resolved
        from the device's DNS map. Edges point from sender to receiver.

    Side effects:
        Reads DEVICE_MAC_LIST and "./json/dns.json" relative to the current
        working directory.
    """
    # Imported locally so the function does not depend on a module-level
    # import that is not visible here; harmless if json is already imported.
    import json

    devlist = _load_device_names(DEVICE_MAC_LIST)
    device_dns_mappings = parser.parse_dns.parse_json_dns("./json/dns.json")

    # NOTE(review): the graph constructor line was not visible; a directed
    # graph is assumed because edge direction encodes outbound vs. inbound
    # traffic below -- confirm.
    graph = nx.DiGraph()

    with open(file_path) as jf:
        # data becomes a reference to the root JSON object.
        data = json.load(jf)

    # No need to check whether nodes/edges already exist: NetworkX does not
    # duplicate them (except for MultiGraph/MultiDiGraph, see NetworkX doc).
    for k in data:
        packet = data[k]
        packet_timestamp = Decimal(packet["ts"])
        eth_src = packet[JSON_KEY_ETH_SRC]
        eth_dst = packet[JSON_KEY_ETH_DST]

        # Traffic can be both outbound and inbound. Determine which of the
        # two endpoints is the IoT device by looking up its MAC in the DNS map.
        if eth_src in device_dns_mappings:
            iot_device = eth_src
        elif eth_dst in device_dns_mappings:
            iot_device = eth_dst
        else:
            # No DNS mapping for either endpoint: this must be local
            # communication between two IoT devices OR an IoT device talking
            # to a hardcoded IP. For now assume local communication.
            # TODO add regex check on src+dst IP to figure out if hardcoded
            # server IP (e.g. check if one of the two is NOT a 192.168.x.y IP)
            _add_local_edge(graph, devlist, eth_src, eth_dst)
            continue

        # It is outbound traffic if iot_device matches src, otherwise it must
        # be inbound traffic.
        outbound_traffic = iot_device == eth_src

        # First add the node for the IoT device itself.
        _add_device_node(graph, devlist, iot_device)

        # Then resolve the server. For outbound traffic the server's IP is
        # the destination IP; for inbound traffic it is the source IP.
        server_ip = packet["dst_ip"] if outbound_traffic else packet["src_ip"]
        hostname = device_dns_mappings[iot_device].hostname_for_ip_at_time(
            server_ip, packet_timestamp)

        # NOTE(review): assumes hostname_for_ip_at_time returns None when no
        # ip-hostname mapping exists -- confirm against parser.parse_dns.
        if hostname is None:
            # TODO this can occur when two local devices communicate OR if
            # the IoT device has a hardcoded server IP. A regex check early
            # in the loop for local-to-local traffic would remove this corner
            # case. The eth.src -> eth.dst edge added here may be incorrect
            # if the IoT device has a hardcoded server IP.
            _add_local_edge(graph, devlist, eth_src, eth_dst)
            continue

        # Connect device and server; the edge points from sender to receiver.
        if outbound_traffic:
            graph.add_edge(iot_device, hostname)
        else:
            graph.add_edge(hostname, iot_device)

    return graph
# ------------------------------------------------------
# Not currently used.
# Might be useful later on if we wish to resolve IPs.
def get_domain(host):
    """Return the registered domain plus public suffix of *host*.

    Args:
        host: Host name (any object; it is converted with str()).

    Returns:
        The string "<domain>.<suffix>" built from tldextract's result.
    """
    ext_result = tldextract.extract(str(host))
    # Be consistent with ReCon and keep suffix
    domain = ext_result.domain + "." + ext_result.suffix
    return domain
126 socket.inet_aton(addr)
130 # ------------------------------------------------------
if __name__ == '__main__':
    # Script entry point: read the packet JSON named by the first argument
    # and write the resulting graph, in GEXF format, to the second.
    if len(sys.argv) < 3:
        # Single-argument parenthesized prints produce the same output under
        # the Python 2 print statement and the Python 3 print function.
        print("Usage: " + sys.argv[0] + " input_file output_file")
        print("output_file should end in .gexf")
        # Stop here: without both arguments the lookups below would fail.
        sys.exit(1)
    # Input file: Path to JSON file generated from tshark JSON output using
    # Anastasia's script (extract_from_tshark.py).
    input_file = sys.argv[1]
    print("[ input_file = " + input_file + " ]")
    # Output file: Path to file where the Gephi XML should be written.
    output_file = sys.argv[2]
    print("[ output_file = " + output_file + " ]")
    # Construct graph from JSON
    G = parse_json(input_file)
    # Write Graph in Graph Exchange XML format
    nx.write_gexf(G, output_file)