4 Script that constructs a graph in which hosts are nodes.
5 An edge between two hosts indicates that the hosts communicate.
6 Hosts are labeled and identified by their IPs.
7 The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi.
9 The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba.
11 This script is a simplification of Milad Asgari's parser_data_to_gephi.py script.
12 It serves as a baseline for future scripts that want to include more information in the graph.
24 import parser.parse_dns
26 DEVICE_MAC_LIST = "devicelist.dat"
27 COLUMN_MAC = "MAC_address"
28 COLUMN_DEVICE_NAME = "device_name"
31 JSON_KEY_SOURCE = "_source"
32 JSON_KEY_LAYERS = "layers"
33 JSON_KEY_FRAME = "frame"
34 JSON_KEY_FRAME_TIME_EPOCH = "frame.time_epoch"
36 JSON_KEY_ETH_SRC = "eth.src"
37 JSON_KEY_ETH_DST = "eth.dst"
39 JSON_KEY_IP_SRC = "ip.src"
40 JSON_KEY_IP_DST = "ip.dst"
44 # List of checked protocols
45 listchkprot = [ "bootp",
53 def parse_json(file_path):
55 # Open the device MAC list file
56 with open(DEVICE_MAC_LIST) as csvfile:
57 maclist = csv.DictReader(csvfile, (COLUMN_MAC, COLUMN_DEVICE_NAME))
60 crudelist.append(item)
62 # Create key-value dictionary
64 for item in crudelist:
65 devlist[item[COLUMN_MAC]] = item[COLUMN_DEVICE_NAME]
66 #print item["MAC_address"] + " => " + item["device_name"]
67 #for key, value in devlist.iteritems():
68 # print key + " => " + value
70 # First parse the file once, constructing a map that contains information about individual devices' DNS resolutions.
71 device_dns_mappings = parser.parse_dns.parse_json_dns(file_path) # "./json/eth1.dump.json"
75 # Parse file again, this time constructing a graph of device<->server and device<->device communication.
76 with open(file_path) as jf:
78 # data becomes reference to root JSON object (or in our case json array)
81 # Loop through json objects (packets) in data
83 # p is a JSON object, not an index
84 # Drill down to object containing data from the different layers
85 layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS]
88 for prot in listchkprot:
94 # Skip any non udp/non tcp traffic
95 if JSON_KEY_UDP not in layers and JSON_KEY_TCP not in layers:
98 # Fetch timestamp of packet (router's timestamp)
99 packet_timestamp = Decimal(layers[JSON_KEY_FRAME][JSON_KEY_FRAME_TIME_EPOCH])
100 print "timestamp", packet_timestamp
101 # Fetch source and destination MACs
102 eth = layers.get(JSON_KEY_ETH, None)
104 print "[ WARNING: eth data not found ]"
106 eth_src = eth.get(JSON_KEY_ETH_SRC, None)
107 eth_dst = eth.get(JSON_KEY_ETH_DST, None)
108 # And source and destination IPs
109 ip_src = layers[JSON_KEY_IP][JSON_KEY_IP_SRC]
110 ip_dst = layers[JSON_KEY_IP][JSON_KEY_IP_DST]
112 # Categorize source and destination IP addresses: local vs. non-local
113 ipre = re.compile(r'\b192.168.[0-9.]+')
114 src_is_local = ipre.search(ip_src)
115 dst_is_local = ipre.search(ip_dst)
116 print "ip.src =", ip_src, "ip.dst =", ip_dst
121 G.add_node(eth_src, Name=devlist[eth_src])
125 # Check first if the key (eth_dst) exists in the dictionary
126 if eth_dst in device_dns_mappings:
127 # If the source is not local, then it's inbound traffic, and hence the eth_dst is the MAC of the IoT device.
128 hostname = device_dns_mappings[eth_dst].hostname_for_ip_at_time(ip_src, packet_timestamp)
130 # Use IP if no hostname mapping
135 G.add_node(eth_dst, Name=devlist[eth_src])
139 # Check first if the key (eth_src) exists in the dictionary
140 if eth_src in device_dns_mappings:
141 # If the destination is not local, then it's outbound traffic, and hence the eth_src is the MAC of the IoT device.
142 hostname = device_dns_mappings[eth_src].hostname_for_ip_at_time(ip_dst, packet_timestamp)
144 # Use IP if no hostname mapping
148 G.add_edge(src_node, dst_node)
150 # Print DNS mapping for reference
151 for mac in device_dns_mappings:
152 ddm = device_dns_mappings[mac]
157 # ------------------------------------------------------
158 # Not currently used.
159 # Might be useful later on if we wish to resolve IPs.
160 def get_domain(host):
161 ext_result = tldextract.extract(str(host))
162 # Be consistent with ReCon and keep suffix
163 domain = ext_result.domain + "." + ext_result.suffix
168 socket.inet_aton(addr)
172 # ------------------------------------------------------
174 if __name__ == '__main__':
175 if len(sys.argv) < 3:
176 print "Usage:", sys.argv[0], "input_file output_file"
177 print "outfile_file should end in .gexf"
179 # Input file: Path to JSON file generated from tshark JSON output using Anastasia's script (extract_from_tshark.py).
180 input_file = sys.argv[1]
181 print "[ input_file =", input_file, "]"
182 # Output file: Path to file where the Gephi XML should be written.
183 output_file = sys.argv[2]
184 print "[ output_file =", output_file, "]"
185 # Construct graph from JSON
186 G = parse_json(input_file)
187 # Write Graph in Graph Exchange XML format
188 nx.write_gexf(G, output_file)