"""
Script used to extract only the needed information from JSON packet traces generated by
tshark from PCAPNG format
"""
import json
import os
import sys
import uuid
from collections import OrderedDict
# JSON keys as produced by tshark's PCAPNG-to-JSON export.
json_key_source = "_source"
json_key_layers = "layers"
# NOTE(review): not visible in the mangled source, but referenced throughout
# change_file() below, so it must exist at module level.
json_key_ip = "ip"
json_key_tcp = "tcp"
json_key_http = "http"
json_key_method = "method"
json_key_uri = "uri"
json_key_headers = "headers"
json_key_host = "host"

# Composite keys for HTTP request fields, e.g. "http.request.method".
json_key_http_req = json_key_http + ".request."
json_key_http_req_method = json_key_http_req + json_key_method
json_key_http_req_uri = json_key_http_req + json_key_uri
json_key_http_req_line = json_key_http_req + "line"

# Per-packet comment container emitted by tshark.
json_key_pkt_comment = "pkt_comment"

# Frame-level keys, e.g. "frame.number".
json_key_frame = "frame"
json_key_frame_num = json_key_frame + ".number"
json_key_frame_comment = json_key_frame + ".comment"
json_key_frame_ts = json_key_frame + ".time_epoch"
def make_unique(key, dct):
    """Return a variant of *key* that does not collide with keys in *dct*.

    Appends an increasing numeric suffix ("key_1", "key_2", ...) until the
    candidate is absent from *dct*; returns *key* unchanged if it is already
    unique.
    """
    counter = 0
    unique_key = key
    while unique_key in dct:
        counter += 1
        unique_key = '{}_{}'.format(key, counter)
    return unique_key
def parse_object_pairs(pairs):
    """object_pairs_hook for json.JSONDecoder that preserves duplicate keys.

    A plain json.load() keeps only the last value for a repeated key; here
    duplicates are renamed via make_unique() so no data point is lost.
    Returns an OrderedDict preserving the original key order.
    """
    dct = OrderedDict()
    for key, value in pairs:
        if key in dct:
            key = make_unique(key, dct)
        dct[key] = value
    return dct
def change_file(fpath):
    """Recursively re-format every tshark JSON trace file under *fpath*.

    Each file is decoded while preserving duplicate JSON keys, HTTP request
    packets are distilled into a flat structure (dst ip/port, headers,
    method, uri, host, timestamp, packet-comment fields) keyed by a fresh
    UUID, and the result is written back over the original file.

    NOTE(review): several interior lines of the original were lost in the
    mangled source; the `continue`/`else:` placements and the seek/truncate
    before rewriting are reconstructions — confirm against the original.
    """
    for fn in os.listdir(fpath):
        full_path = fpath + '/' + fn

        # Recursively go through all directories
        if os.path.isdir(full_path):
            change_file(full_path)
            continue

        with open(full_path, "r+") as jf:
            # Since certain json 'keys' appear multiple times in our data, we have to make them
            # unique first (we can't use regular json.load() or we lose some data points). From:
            # https://stackoverflow.com/questions/29321677/python-json-parser-allow-duplicate-keys
            decoder = json.JSONDecoder(object_pairs_hook=parse_object_pairs)
            pcap_data = decoder.decode(jf.read())

            # Prepare new data structure for re-formatted JSON storage
            data = {}
            for packet in pcap_data:
                layers = packet[json_key_source][json_key_layers]

                # All captured traffic should have a frame + frame number, but check anyway
                frame_num = " Frame: "
                if json_key_frame not in layers or json_key_frame_num not in layers[json_key_frame]:
                    print("WARNING: could not find frame number! Using -1...")
                    frame_num = frame_num + "-1"
                else:
                    # Save frame number for error-reporting
                    frame_num = frame_num + layers[json_key_frame][json_key_frame_num]

                # All captured traffic should be IP, but check anyway
                if json_key_ip not in layers:
                    print("WARNING: Non-IP traffic detected!" + frame_num)
                    continue

                # For now, focus on HTTP only
                if json_key_tcp not in layers or json_key_http not in layers:
                    continue

                # Fill our new JSON packet with TCP/IP info
                new_packet = {}
                new_packet["dst_ip"] = layers[json_key_ip][json_key_ip + ".dst"]
                new_packet["dst_port"] = int(layers[json_key_tcp][json_key_tcp + ".dstport"])

                # Go through all HTTP fields and extract the ones that are needed
                http_data = layers[json_key_http]
                for http_key in http_data:
                    http_value = http_data[http_key]

                    if http_key.startswith(json_key_http_req_line):
                        header_line = http_value.split(":", 1)
                        if len(header_line) != 2:
                            print("WARNING: could not parse header '" + str(header_line) + "'"
                                  + frame_num)
                            continue

                        # Prepare container for HTTP headers
                        if json_key_headers not in new_packet:
                            new_packet[json_key_headers] = {}

                        # Use lower case for header keys to stay consistent with our other data
                        header_key = header_line[0].lower()

                        # Remove the trailing carriage return
                        header_val = header_line[1].strip()

                        # Save the header key-value pair
                        new_packet[json_key_headers][header_key] = header_val

                        # If this is the host header, we also save it to the main object
                        if header_key == json_key_host:
                            new_packet[json_key_host] = header_val

                    # The request sub-object carries the method and URI;
                    # for header-line string values this is a substring test
                    # that simply never matches.
                    if json_key_http_req_method in http_value:
                        new_packet[json_key_method] = http_value[json_key_http_req_method]
                    if json_key_http_req_uri in http_value:
                        new_packet[json_key_uri] = http_value[json_key_http_req_uri]
                # End of HTTP parsing

                # Check that we found the minimum needed HTTP headers
                if (json_key_uri not in new_packet or json_key_method not in new_packet or
                        json_key_host not in new_packet):
                    print("Missing some HTTP Headers!" + frame_num)
                    continue

                # Extract timestamp
                if json_key_frame_ts not in layers[json_key_frame]:
                    print("WARNING: could not find timestamp!" + frame_num)
                else:
                    new_packet["ts"] = layers[json_key_frame][json_key_frame_ts]

                # Now extract and parse the packet comment
                if (json_key_pkt_comment not in layers or
                        json_key_frame_comment not in layers[json_key_pkt_comment]):
                    print("WARNING: no packet comment found!" + frame_num)
                else:
                    # The comment itself is a JSON object; flatten its
                    # key/value pairs into the new packet as strings.
                    comment = layers[json_key_pkt_comment][json_key_frame_comment]
                    comment_data = json.loads(comment)
                    for key in comment_data:
                        new_packet[str(key)] = str(comment_data[key])

                # Create a unique key for each packet to keep consistent with ReCon
                # Also good in case packets end up in different files
                data[str(uuid.uuid4())] = new_packet

            # Write the new data back over the original file
            #print json.dumps(data, sort_keys=True, indent=4)
            jf.seek(0)
            jf.write(json.dumps(data, sort_keys=True, indent=4))
            jf.truncate()
if __name__ == '__main__':
    # Usage: <script> <directory containing tshark JSON trace files>
    # Recursively rewrites every file under the given directory in place.
    change_file(sys.argv[1])