net/netfilter/xt_qtaguid.c (firefly-linux-kernel-4.4.55.git, commit ea716b31e2af0a648af578a358f1d0abd607fcc0)
1 /*
2  * Kernel iptables module to track stats for packets based on user tags.
3  *
4  * (C) 2011 Google, Inc
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 /*
12  * There are run-time debug flags enabled via the debug_mask module param, or
13  * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h.
14  */
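/*
 * Because debug_mask is exported further down with S_IRUGO | S_IWUSR, root
 * can also adjust it at runtime via
 * /sys/module/xt_qtaguid/parameters/debug_mask; the individual mask bits are
 * defined in xt_qtaguid_internal.h.
 */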
15 #define DEBUG
16
17 #include <linux/file.h>
18 #include <linux/inetdevice.h>
19 #include <linux/module.h>
20 #include <linux/netfilter/x_tables.h>
21 #include <linux/netfilter/xt_qtaguid.h>
22 #include <linux/skbuff.h>
23 #include <linux/workqueue.h>
24 #include <net/addrconf.h>
25 #include <net/sock.h>
26 #include <net/tcp.h>
27 #include <net/udp.h>
28
29 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
30 #include <linux/netfilter_ipv6/ip6_tables.h>
31 #endif
32
33 #include <linux/netfilter/xt_socket.h>
34 #include "xt_qtaguid_internal.h"
35 #include "xt_qtaguid_print.h"
36
37 /*
38  * We only use the xt_socket funcs within a similar context to avoid unexpected
39  * return values.
40  */
41 #define XT_SOCKET_SUPPORTED_HOOKS \
42         ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN))
43
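/*
 * Sketch of how the mask above is intended to be used (illustrative only;
 * the actual check lives in the match code further down, outside this
 * excerpt):
 *
 *	if ((1 << par->hooknum) & XT_SOCKET_SUPPORTED_HOOKS)
 *		... it is safe to consult the xt_socket helpers ...
 */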
44
45 static const char *module_procdirname = "xt_qtaguid";
46 static struct proc_dir_entry *xt_qtaguid_procdir;
47
48 static unsigned int proc_iface_perms = S_IRUGO;
49 module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR);
50
51 static struct proc_dir_entry *xt_qtaguid_stats_file;
52 static unsigned int proc_stats_perms = S_IRUGO;
53 module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR);
54
55 static struct proc_dir_entry *xt_qtaguid_ctrl_file;
56 #ifdef CONFIG_ANDROID_PARANOID_NETWORK
57 static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUGO;
58 #else
59 static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUSR;
60 #endif
61 module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR);
62
63 #ifdef CONFIG_ANDROID_PARANOID_NETWORK
64 #include <linux/android_aid.h>
65 static gid_t proc_stats_readall_gid = AID_NET_BW_STATS;
66 static gid_t proc_ctrl_write_gid = AID_NET_BW_ACCT;
67 #else
68 /* 0 means don't limit anybody */
69 static gid_t proc_stats_readall_gid;
70 static gid_t proc_ctrl_write_gid;
71 #endif
72 module_param_named(stats_readall_gid, proc_stats_readall_gid, uint,
73                    S_IRUGO | S_IWUSR);
74 module_param_named(ctrl_write_gid, proc_ctrl_write_gid, uint,
75                    S_IRUGO | S_IWUSR);
76
77 /*
78  * Limit the number of active tags (via socket tags) for a given UID.
79  * Multiple processes could share the UID.
80  */
81 static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS;
82 module_param(max_sock_tags, int, S_IRUGO | S_IWUSR);
83
84 /*
85  * After the kernel has initialized this module, it is still possible
86  * to make it passive.
87  * Setting passive to Y:
88  *  - the iface stats handling will not act on notifications.
89  *  - iptables matches will never match.
90  *  - ctrl commands silently succeed.
91  *  - stats are always empty.
92  * This is mostly useful when a bug is suspected.
93  */
94 static bool module_passive;
95 module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR);
96
97 /*
98  * Control how qtaguid data is tracked per proc/uid.
99  * Setting tag_tracking_passive to Y:
100  *  - don't create proc specific structs to track tags
101  *  - don't check that active tag stats exceed some limits.
102  *  - don't clean up socket tags on process exits.
103  * This is mostly useful when a bug is suspected.
104  */
105 static bool qtu_proc_handling_passive;
106 module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool,
107                    S_IRUGO | S_IWUSR);
108
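/*
 * Name of the qtaguid misc char device (typically visible to userspace as
 * /dev/xt_qtaguid) served by the qtudev_open()/qtudev_release() handlers
 * mentioned in the locking notes below.
 */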
109 #define QTU_DEV_NAME "xt_qtaguid"
110
111 uint qtaguid_debug_mask = DEFAULT_DEBUG_MASK;
112 module_param_named(debug_mask, qtaguid_debug_mask, uint, S_IRUGO | S_IWUSR);
113
114 /*---------------------------------------------------------------------------*/
115 static const char *iface_stat_procdirname = "iface_stat";
116 static struct proc_dir_entry *iface_stat_procdir;
117 /*
118  * The iface_stat_all* will go away once userspace gets used to the new fields
119  * that have a format line.
120  */
121 static const char *iface_stat_all_procfilename = "iface_stat_all";
122 static struct proc_dir_entry *iface_stat_all_procfile;
123 static const char *iface_stat_fmt_procfilename = "iface_stat_fmt";
124 static struct proc_dir_entry *iface_stat_fmt_procfile;
125
126
127 /*
128  * Ordering of locks:
129  *  outer locks:
130  *    iface_stat_list_lock
131  *    sock_tag_list_lock
132  *  inner locks:
133  *    uid_tag_data_tree_lock
134  *    tag_counter_set_list_lock
135  * Notice how sock_tag_list_lock is held sometimes when uid_tag_data_tree_lock
136  * is acquired.
137  *
138  * Call tree with all lock holders as of 2012-04-27:
139  *
140  * iface_stat_fmt_proc_read()
141  *   iface_stat_list_lock
142  *     (struct iface_stat)
143  *
144  * qtaguid_ctrl_proc_read()
145  *   sock_tag_list_lock
146  *     (sock_tag_tree)
147  *     (struct proc_qtu_data->sock_tag_list)
148  *   prdebug_full_state()
149  *     sock_tag_list_lock
150  *       (sock_tag_tree)
151  *     uid_tag_data_tree_lock
152  *       (uid_tag_data_tree)
153  *       (proc_qtu_data_tree)
154  *     iface_stat_list_lock
155  *
156  * qtaguid_stats_proc_read()
157  *   iface_stat_list_lock
158  *     struct iface_stat->tag_stat_list_lock
159  *
160  * qtudev_open()
161  *   uid_tag_data_tree_lock
162  *
163  * qtudev_release()
164  *   sock_tag_data_list_lock
165  *     uid_tag_data_tree_lock
166  *   prdebug_full_state()
167  *     sock_tag_list_lock
168  *     uid_tag_data_tree_lock
169  *     iface_stat_list_lock
170  *
171  * iface_netdev_event_handler()
172  *   iface_stat_create()
173  *     iface_stat_list_lock
174  *   iface_stat_update()
175  *     iface_stat_list_lock
176  *
177  * iface_inetaddr_event_handler()
178  *   iface_stat_create()
179  *     iface_stat_list_lock
180  *   iface_stat_update()
181  *     iface_stat_list_lock
182  *
183  * iface_inet6addr_event_handler()
184  *   iface_stat_create_ipv6()
185  *     iface_stat_list_lock
186  *   iface_stat_update()
187  *     iface_stat_list_lock
188  *
189  * qtaguid_mt()
190  *   account_for_uid()
191  *     if_tag_stat_update()
192  *       get_sock_stat()
193  *         sock_tag_list_lock
194  *       struct iface_stat->tag_stat_list_lock
195  *         tag_stat_update()
196  *           get_active_counter_set()
197  *             tag_counter_set_list_lock
198  *         tag_stat_update()
199  *           get_active_counter_set()
200  *             tag_counter_set_list_lock
201  *
202  *
203  * qtaguid_ctrl_parse()
204  *   ctrl_cmd_delete()
205  *     sock_tag_list_lock
206  *     tag_counter_set_list_lock
207  *     iface_stat_list_lock
208  *       struct iface_stat->tag_stat_list_lock
209  *     uid_tag_data_tree_lock
210  *   ctrl_cmd_counter_set()
211  *     tag_counter_set_list_lock
212  *   ctrl_cmd_tag()
213  *     sock_tag_list_lock
214  *       (sock_tag_tree)
215  *       get_tag_ref()
216  *         uid_tag_data_tree_lock
217  *           (uid_tag_data_tree)
218  *       uid_tag_data_tree_lock
219  *         (proc_qtu_data_tree)
220  *   ctrl_cmd_untag()
221  *     sock_tag_list_lock
222  *     uid_tag_data_tree_lock
223  *
224  */
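/*
 * Minimal nesting sketch of the ordering documented above (illustrative
 * only, not an actual code path of this module): when both trees must be
 * updated together, the outer lock is taken first and released last:
 *
 *	spin_lock_bh(&sock_tag_list_lock);
 *	spin_lock_bh(&uid_tag_data_tree_lock);
 *	... look up / update sock_tag_tree and uid_tag_data_tree ...
 *	spin_unlock_bh(&uid_tag_data_tree_lock);
 *	spin_unlock_bh(&sock_tag_list_lock);
 */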
225 static LIST_HEAD(iface_stat_list);
226 static DEFINE_SPINLOCK(iface_stat_list_lock);
227
228 static struct rb_root sock_tag_tree = RB_ROOT;
229 static DEFINE_SPINLOCK(sock_tag_list_lock);
230
231 static struct rb_root tag_counter_set_tree = RB_ROOT;
232 static DEFINE_SPINLOCK(tag_counter_set_list_lock);
233
234 static struct rb_root uid_tag_data_tree = RB_ROOT;
235 static DEFINE_SPINLOCK(uid_tag_data_tree_lock);
236
237 static struct rb_root proc_qtu_data_tree = RB_ROOT;
238 /* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */
239
240 static struct qtaguid_event_counts qtu_events;
241 /*----------------------------------------------*/
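/*
 * Permission helpers: a caller may act on other UIDs when it is root
 * (current_fsuid() == 0), when the corresponding gid module parameter is 0
 * (i.e. unrestricted, see above), or when its effective groups include that
 * gid.
 */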
242 static bool can_manipulate_uids(void)
243 {
244         /* root pwnd */
245         return unlikely(!current_fsuid()) || unlikely(!proc_ctrl_write_gid)
246                 || in_egroup_p(proc_ctrl_write_gid);
247 }
248
249 static bool can_impersonate_uid(uid_t uid)
250 {
251         return uid == current_fsuid() || can_manipulate_uids();
252 }
253
254 static bool can_read_other_uid_stats(uid_t uid)
255 {
256         /* root pwnd */
257         return unlikely(!current_fsuid()) || uid == current_fsuid()
258                 || unlikely(!proc_stats_readall_gid)
259                 || in_egroup_p(proc_stats_readall_gid);
260 }
261
262 static inline void dc_add_byte_packets(struct data_counters *counters, int set,
263                                   enum ifs_tx_rx direction,
264                                   enum ifs_proto ifs_proto,
265                                   int bytes,
266                                   int packets)
267 {
268         counters->bpc[set][direction][ifs_proto].bytes += bytes;
269         counters->bpc[set][direction][ifs_proto].packets += packets;
270 }
271
272 static inline uint64_t dc_sum_bytes(struct data_counters *counters,
273                                     int set,
274                                     enum ifs_tx_rx direction)
275 {
276         return counters->bpc[set][direction][IFS_TCP].bytes
277                 + counters->bpc[set][direction][IFS_UDP].bytes
278                 + counters->bpc[set][direction][IFS_PROTO_OTHER].bytes;
279 }
280
281 static inline uint64_t dc_sum_packets(struct data_counters *counters,
282                                       int set,
283                                       enum ifs_tx_rx direction)
284 {
285         return counters->bpc[set][direction][IFS_TCP].packets
286                 + counters->bpc[set][direction][IFS_UDP].packets
287                 + counters->bpc[set][direction][IFS_PROTO_OTHER].packets;
288 }
289
290 static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag)
291 {
292         struct rb_node *node = root->rb_node;
293
294         while (node) {
295                 struct tag_node *data = rb_entry(node, struct tag_node, node);
296                 int result;
297                 RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
298                          " node=%p data=%p\n", tag, node, data);
299                 result = tag_compare(tag, data->tag);
300                 RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
301                          " data.tag=0x%llx (uid=%u) res=%d\n",
302                          tag, data->tag, get_uid_from_tag(data->tag), result);
303                 if (result < 0)
304                         node = node->rb_left;
305                 else if (result > 0)
306                         node = node->rb_right;
307                 else
308                         return data;
309         }
310         return NULL;
311 }
312
313 static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root)
314 {
315         struct rb_node **new = &(root->rb_node), *parent = NULL;
316
317         /* Figure out where to put new node */
318         while (*new) {
319                 struct tag_node *this = rb_entry(*new, struct tag_node,
320                                                  node);
321                 int result = tag_compare(data->tag, this->tag);
322                 RB_DEBUG("qtaguid: %s(): tag=0x%llx"
323                          " (uid=%u)\n", __func__,
324                          this->tag,
325                          get_uid_from_tag(this->tag));
326                 parent = *new;
327                 if (result < 0)
328                         new = &((*new)->rb_left);
329                 else if (result > 0)
330                         new = &((*new)->rb_right);
331                 else
332                         BUG();
333         }
334
335         /* Add new node and rebalance tree. */
336         rb_link_node(&data->node, parent, new);
337         rb_insert_color(&data->node, root);
338 }
339
340 static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root)
341 {
342         tag_node_tree_insert(&data->tn, root);
343 }
344
345 static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag)
346 {
347         struct tag_node *node = tag_node_tree_search(root, tag);
348         if (!node)
349                 return NULL;
350         return rb_entry(&node->node, struct tag_stat, tn.node);
351 }
352
353 static void tag_counter_set_tree_insert(struct tag_counter_set *data,
354                                         struct rb_root *root)
355 {
356         tag_node_tree_insert(&data->tn, root);
357 }
358
359 static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root,
360                                                            tag_t tag)
361 {
362         struct tag_node *node = tag_node_tree_search(root, tag);
363         if (!node)
364                 return NULL;
365         return rb_entry(&node->node, struct tag_counter_set, tn.node);
366
367 }
368
369 static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root)
370 {
371         tag_node_tree_insert(&data->tn, root);
372 }
373
374 static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag)
375 {
376         struct tag_node *node = tag_node_tree_search(root, tag);
377         if (!node)
378                 return NULL;
379         return rb_entry(&node->node, struct tag_ref, tn.node);
380 }
381
382 static struct sock_tag *sock_tag_tree_search(struct rb_root *root,
383                                              const struct sock *sk)
384 {
385         struct rb_node *node = root->rb_node;
386
387         while (node) {
388                 struct sock_tag *data = rb_entry(node, struct sock_tag,
389                                                  sock_node);
390                 if (sk < data->sk)
391                         node = node->rb_left;
392                 else if (sk > data->sk)
393                         node = node->rb_right;
394                 else
395                         return data;
396         }
397         return NULL;
398 }
399
400 static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root)
401 {
402         struct rb_node **new = &(root->rb_node), *parent = NULL;
403
404         /* Figure out where to put new node */
405         while (*new) {
406                 struct sock_tag *this = rb_entry(*new, struct sock_tag,
407                                                  sock_node);
408                 parent = *new;
409                 if (data->sk < this->sk)
410                         new = &((*new)->rb_left);
411                 else if (data->sk > this->sk)
412                         new = &((*new)->rb_right);
413                 else
414                         BUG();
415         }
416
417         /* Add new node and rebalance tree. */
418         rb_link_node(&data->sock_node, parent, new);
419         rb_insert_color(&data->sock_node, root);
420 }
421
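/*
 * Walks and empties the given sock_tag tree, dropping the struct socket
 * reference each entry holds (sockfd_put()) and freeing the entries.
 */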
422 static void sock_tag_tree_erase(struct rb_root *st_to_free_tree)
423 {
424         struct rb_node *node;
425         struct sock_tag *st_entry;
426
427         node = rb_first(st_to_free_tree);
428         while (node) {
429                 st_entry = rb_entry(node, struct sock_tag, sock_node);
430                 node = rb_next(node);
431                 CT_DEBUG("qtaguid: %s(): "
432                          "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__,
433                          st_entry->sk,
434                          st_entry->tag,
435                          get_uid_from_tag(st_entry->tag));
436                 rb_erase(&st_entry->sock_node, st_to_free_tree);
437                 sockfd_put(st_entry->socket);
438                 kfree(st_entry);
439         }
440 }
441
442 static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root,
443                                                        const pid_t pid)
444 {
445         struct rb_node *node = root->rb_node;
446
447         while (node) {
448                 struct proc_qtu_data *data = rb_entry(node,
449                                                       struct proc_qtu_data,
450                                                       node);
451                 if (pid < data->pid)
452                         node = node->rb_left;
453                 else if (pid > data->pid)
454                         node = node->rb_right;
455                 else
456                         return data;
457         }
458         return NULL;
459 }
460
461 static void proc_qtu_data_tree_insert(struct proc_qtu_data *data,
462                                       struct rb_root *root)
463 {
464         struct rb_node **new = &(root->rb_node), *parent = NULL;
465
466         /* Figure out where to put new node */
467         while (*new) {
468                 struct proc_qtu_data *this = rb_entry(*new,
469                                                       struct proc_qtu_data,
470                                                       node);
471                 parent = *new;
472                 if (data->pid < this->pid)
473                         new = &((*new)->rb_left);
474                 else if (data->pid > this->pid)
475                         new = &((*new)->rb_right);
476                 else
477                         BUG();
478         }
479
480         /* Add new node and rebalance tree. */
481         rb_link_node(&data->node, parent, new);
482         rb_insert_color(&data->node, root);
483 }
484
485 static void uid_tag_data_tree_insert(struct uid_tag_data *data,
486                                      struct rb_root *root)
487 {
488         struct rb_node **new = &(root->rb_node), *parent = NULL;
489
490         /* Figure out where to put new node */
491         while (*new) {
492                 struct uid_tag_data *this = rb_entry(*new,
493                                                      struct uid_tag_data,
494                                                      node);
495                 parent = *new;
496                 if (data->uid < this->uid)
497                         new = &((*new)->rb_left);
498                 else if (data->uid > this->uid)
499                         new = &((*new)->rb_right);
500                 else
501                         BUG();
502         }
503
504         /* Add new node and rebalance tree. */
505         rb_link_node(&data->node, parent, new);
506         rb_insert_color(&data->node, root);
507 }
508
509 static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root,
510                                                      uid_t uid)
511 {
512         struct rb_node *node = root->rb_node;
513
514         while (node) {
515                 struct uid_tag_data *data = rb_entry(node,
516                                                      struct uid_tag_data,
517                                                      node);
518                 if (uid < data->uid)
519                         node = node->rb_left;
520                 else if (uid > data->uid)
521                         node = node->rb_right;
522                 else
523                         return data;
524         }
525         return NULL;
526 }
527
528 /*
529  * Allocates a new uid_tag_data struct if needed.
530  * Returns a pointer to the found or allocated uid_tag_data.
531  * Returns a PTR_ERR on failures, and lock is not held.
532  * If found_res is not NULL:
533  *   sets *found_res to true if the entry already existed (not allocated),
534  *   sets *found_res to false if a new entry had to be allocated.
535  */
536 struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res)
537 {
538         struct uid_tag_data *utd_entry;
539
540         /* Look for top level uid_tag_data for the UID */
541         utd_entry = uid_tag_data_tree_search(&uid_tag_data_tree, uid);
542         DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd_entry);
543
544         if (found_res)
545                 *found_res = utd_entry;
546         if (utd_entry)
547                 return utd_entry;
548
549         utd_entry = kzalloc(sizeof(*utd_entry), GFP_ATOMIC);
550         if (!utd_entry) {
551                 pr_err("qtaguid: get_uid_data(%u): "
552                        "tag data alloc failed\n", uid);
553                 return ERR_PTR(-ENOMEM);
554         }
555
556         utd_entry->uid = uid;
557         utd_entry->tag_ref_tree = RB_ROOT;
558         uid_tag_data_tree_insert(utd_entry, &uid_tag_data_tree);
559         DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd_entry);
560         return utd_entry;
561 }
562
563 /* Never returns NULL. Either PTR_ERR or a valid ptr. */
564 static struct tag_ref *new_tag_ref(tag_t new_tag,
565                                    struct uid_tag_data *utd_entry)
566 {
567         struct tag_ref *tr_entry;
568         int res;
569
570         if (utd_entry->num_active_tags + 1 > max_sock_tags) {
571                 pr_info("qtaguid: new_tag_ref(0x%llx): "
572                         "tag ref alloc quota exceeded. max=%d\n",
573                         new_tag, max_sock_tags);
574                 res = -EMFILE;
575                 goto err_res;
576
577         }
578
579         tr_entry = kzalloc(sizeof(*tr_entry), GFP_ATOMIC);
580         if (!tr_entry) {
581                 pr_err("qtaguid: new_tag_ref(0x%llx): "
582                        "tag ref alloc failed\n",
583                        new_tag);
584                 res = -ENOMEM;
585                 goto err_res;
586         }
587         tr_entry->tn.tag = new_tag;
588         /* tr_entry->num_sock_tags  handled by caller */
589         utd_entry->num_active_tags++;
590         tag_ref_tree_insert(tr_entry, &utd_entry->tag_ref_tree);
591         DR_DEBUG("qtaguid: new_tag_ref(0x%llx): "
592                  " inserted new tag ref %p\n",
593                  new_tag, tr_entry);
594         return tr_entry;
595
596 err_res:
597         return ERR_PTR(res);
598 }
599
600 static struct tag_ref *lookup_tag_ref(tag_t full_tag,
601                                       struct uid_tag_data **utd_res)
602 {
603         struct uid_tag_data *utd_entry;
604         struct tag_ref *tr_entry;
605         bool found_utd;
606         uid_t uid = get_uid_from_tag(full_tag);
607
608         DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n",
609                  full_tag, uid);
610
611         utd_entry = get_uid_data(uid, &found_utd);
612         if (IS_ERR_OR_NULL(utd_entry)) {
613                 if (utd_res)
614                         *utd_res = utd_entry;
615                 return NULL;
616         }
617
618         tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag);
619         if (utd_res)
620                 *utd_res = utd_entry;
621         DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n",
622                  full_tag, utd_entry, tr_entry);
623         return tr_entry;
624 }
625
626 /* Never returns NULL. Either PTR_ERR or a valid ptr. */
627 static struct tag_ref *get_tag_ref(tag_t full_tag,
628                                    struct uid_tag_data **utd_res)
629 {
630         struct uid_tag_data *utd_entry;
631         struct tag_ref *tr_entry;
632
633         DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n",
634                  full_tag);
635         spin_lock_bh(&uid_tag_data_tree_lock);
636         tr_entry = lookup_tag_ref(full_tag, &utd_entry);
637         BUG_ON(IS_ERR_OR_NULL(utd_entry));
638         if (!tr_entry)
639                 tr_entry = new_tag_ref(full_tag, utd_entry);
640
641         spin_unlock_bh(&uid_tag_data_tree_lock);
642         if (utd_res)
643                 *utd_res = utd_entry;
644         DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n",
645                  full_tag, utd_entry, tr_entry);
646         return tr_entry;
647 }
648
649 /* Checks and maybe frees the UID Tag Data entry */
650 static void put_utd_entry(struct uid_tag_data *utd_entry)
651 {
652         /* Are we done with the UID tag data entry? */
653         if (RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) &&
654                 !utd_entry->num_pqd) {
655                 DR_DEBUG("qtaguid: %s(): "
656                          "erase utd_entry=%p uid=%u "
657                          "by pid=%u tgid=%u uid=%u\n", __func__,
658                          utd_entry, utd_entry->uid,
659                          current->pid, current->tgid, current_fsuid());
660                 BUG_ON(utd_entry->num_active_tags);
661                 rb_erase(&utd_entry->node, &uid_tag_data_tree);
662                 kfree(utd_entry);
663         } else {
664                 DR_DEBUG("qtaguid: %s(): "
665                          "utd_entry=%p still has %d tags %d proc_qtu_data\n",
666                          __func__, utd_entry, utd_entry->num_active_tags,
667                          utd_entry->num_pqd);
668                 BUG_ON(!(utd_entry->num_active_tags ||
669                          utd_entry->num_pqd));
670         }
671 }
672
673 /*
674  * If no sock_tags are using this tag_ref,
675  * decrements refcount of utd_entry, removes tr_entry
676  * from utd_entry->tag_ref_tree and frees.
677  */
678 static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry,
679                                         struct uid_tag_data *utd_entry)
680 {
681         DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__,
682                  tr_entry, tr_entry->tn.tag,
683                  get_uid_from_tag(tr_entry->tn.tag));
684         if (!tr_entry->num_sock_tags) {
685                 BUG_ON(!utd_entry->num_active_tags);
686                 utd_entry->num_active_tags--;
687                 rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree);
688                 DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry);
689                 kfree(tr_entry);
690         }
691 }
692
693 static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry)
694 {
695         struct rb_node *node;
696         struct tag_ref *tr_entry;
697         tag_t acct_tag;
698
699         DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__,
700                  full_tag, get_uid_from_tag(full_tag));
701         acct_tag = get_atag_from_tag(full_tag);
702         node = rb_first(&utd_entry->tag_ref_tree);
703         while (node) {
704                 tr_entry = rb_entry(node, struct tag_ref, tn.node);
705                 node = rb_next(node);
706                 if (!acct_tag || tr_entry->tn.tag == full_tag)
707                         free_tag_ref_from_utd_entry(tr_entry, utd_entry);
708         }
709 }
710
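/*
 * The two helpers below follow the legacy create_proc_read_entry() handler
 * convention: format into the supplied page buffer, set *eof once everything
 * fits in the requested count, and return the number of bytes available from
 * the requested offset.
 */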
711 static int read_proc_u64(char *page, char **start, off_t off,
712                         int count, int *eof, void *data)
713 {
714         int len;
715         uint64_t value;
716         char *p = page;
717         uint64_t *iface_entry = data;
718
719         if (!data)
720                 return 0;
721
722         value = *iface_entry;
723         p += sprintf(p, "%llu\n", value);
724         len = (p - page) - off;
725         *eof = (len <= count) ? 1 : 0;
726         *start = page + off;
727         return len;
728 }
729
730 static int read_proc_bool(char *page, char **start, off_t off,
731                         int count, int *eof, void *data)
732 {
733         int len;
734         bool value;
735         char *p = page;
736         bool *bool_entry = data;
737
738         if (!data)
739                 return 0;
740
741         value = *bool_entry;
742         p += sprintf(p, "%u\n", value);
743         len = (p - page) - off;
744         *eof = (len <= count) ? 1 : 0;
745         *start = page + off;
746         return len;
747 }
748
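/*
 * Returns the counter set currently active for the tag's UID, or 0 when none
 * has been configured. Only the UID part of the tag is considered; sets are
 * switched through the ctrl interface (ctrl_cmd_counter_set(), see the
 * locking notes above).
 */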
749 static int get_active_counter_set(tag_t tag)
750 {
751         int active_set = 0;
752         struct tag_counter_set *tcs;
753
754         MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)"
755                  " (uid=%u)\n",
756                  tag, get_uid_from_tag(tag));
757         /* For now we only handle UID tags for active sets */
758         tag = get_utag_from_tag(tag);
759         spin_lock_bh(&tag_counter_set_list_lock);
760         tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
761         if (tcs)
762                 active_set = tcs->active_set;
763         spin_unlock_bh(&tag_counter_set_list_lock);
764         return active_set;
765 }
766
767 /*
768  * Find the entry for tracking the specified interface.
769  * Caller must hold iface_stat_list_lock
770  */
771 static struct iface_stat *get_iface_entry(const char *ifname)
772 {
773         struct iface_stat *iface_entry;
774
775         /* Find the entry tracking the specified interface */
776         if (ifname == NULL) {
777                 pr_info("qtaguid: iface_stat: get() NULL device name\n");
778                 return NULL;
779         }
780
781         /* Iterate over interfaces */
782         list_for_each_entry(iface_entry, &iface_stat_list, list) {
783                 if (!strcmp(ifname, iface_entry->ifname))
784                         goto done;
785         }
786         iface_entry = NULL;
787 done:
788         return iface_entry;
789 }
790
791 static int iface_stat_fmt_proc_read(char *page, char **num_items_returned,
792                                     off_t items_to_skip, int char_count,
793                                     int *eof, void *data)
794 {
795         char *outp = page;
796         int item_index = 0;
797         int len;
798         int fmt = (int)data; /* The data is just 1 (old) or 2 (uses fmt) */
799         struct iface_stat *iface_entry;
800         struct rtnl_link_stats64 dev_stats, *stats;
801         struct rtnl_link_stats64 no_dev_stats = {0};
802
803         if (unlikely(module_passive)) {
804                 *eof = 1;
805                 return 0;
806         }
807
808         CT_DEBUG("qtaguid:proc iface_stat_fmt "
809                  "pid=%u tgid=%u uid=%u "
810                  "page=%p *num_items_returned=%p off=%ld "
811                  "char_count=%d *eof=%d\n",
812                  current->pid, current->tgid, current_fsuid(),
813                  page, *num_items_returned,
814                  items_to_skip, char_count, *eof);
815
816         if (*eof)
817                 return 0;
818
819         if (fmt == 2 && item_index++ >= items_to_skip) {
820                 len = snprintf(outp, char_count,
821                                "ifname "
822                                "total_skb_rx_bytes total_skb_rx_packets "
823                                "total_skb_tx_bytes total_skb_tx_packets\n"
824                         );
825                 if (len >= char_count) {
826                         *outp = '\0';
827                         return outp - page;
828                 }
829                 outp += len;
830                 char_count -= len;
831                 (*num_items_returned)++;
832         }
833
834         /*
835          * This lock will prevent iface_stat_update() from changing active,
836          * and in turn prevent an interface from unregistering itself.
837          */
838         spin_lock_bh(&iface_stat_list_lock);
839         list_for_each_entry(iface_entry, &iface_stat_list, list) {
840                 if (item_index++ < items_to_skip)
841                         continue;
842
843                 if (iface_entry->active) {
844                         stats = dev_get_stats(iface_entry->net_dev,
845                                               &dev_stats);
846                 } else {
847                         stats = &no_dev_stats;
848                 }
849                 /*
850                  * If the meaning of the data changes, then update the fmtX
851                  * string.
852                  */
853                 if (fmt == 1) {
854                         len = snprintf(
855                                 outp, char_count,
856                                 "%s %d "
857                                 "%llu %llu %llu %llu "
858                                 "%llu %llu %llu %llu\n",
859                                 iface_entry->ifname,
860                                 iface_entry->active,
861                                 iface_entry->totals_via_dev[IFS_RX].bytes,
862                                 iface_entry->totals_via_dev[IFS_RX].packets,
863                                 iface_entry->totals_via_dev[IFS_TX].bytes,
864                                 iface_entry->totals_via_dev[IFS_TX].packets,
865                                 stats->rx_bytes, stats->rx_packets,
866                                 stats->tx_bytes, stats->tx_packets
867                                 );
868                 } else {
869                         len = snprintf(
870                                 outp, char_count,
871                                 "%s "
872                                 "%llu %llu %llu %llu\n",
873                                 iface_entry->ifname,
874                                 iface_entry->totals_via_skb[IFS_RX].bytes,
875                                 iface_entry->totals_via_skb[IFS_RX].packets,
876                                 iface_entry->totals_via_skb[IFS_TX].bytes,
877                                 iface_entry->totals_via_skb[IFS_TX].packets
878                                 );
879                 }
880                 if (len >= char_count) {
881                         spin_unlock_bh(&iface_stat_list_lock);
882                         *outp = '\0';
883                         return outp - page;
884                 }
885                 outp += len;
886                 char_count -= len;
887                 (*num_items_returned)++;
888         }
889         spin_unlock_bh(&iface_stat_list_lock);
890
891         *eof = 1;
892         return outp - page;
893 }
894
895 static void iface_create_proc_worker(struct work_struct *work)
896 {
897         struct proc_dir_entry *proc_entry;
898         struct iface_stat_work *isw = container_of(work, struct iface_stat_work,
899                                                    iface_work);
900         struct iface_stat *new_iface  = isw->iface_entry;
901
902         /* iface_entries are not deleted, so safe to manipulate. */
903         proc_entry = proc_mkdir(new_iface->ifname, iface_stat_procdir);
904         if (IS_ERR_OR_NULL(proc_entry)) {
905                 pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n");
906                 kfree(isw);
907                 return;
908         }
909
910         new_iface->proc_ptr = proc_entry;
911
912         create_proc_read_entry("tx_bytes", proc_iface_perms, proc_entry,
913                                read_proc_u64,
914                                &new_iface->totals_via_dev[IFS_TX].bytes);
915         create_proc_read_entry("rx_bytes", proc_iface_perms, proc_entry,
916                                read_proc_u64,
917                                &new_iface->totals_via_dev[IFS_RX].bytes);
918         create_proc_read_entry("tx_packets", proc_iface_perms, proc_entry,
919                                read_proc_u64,
920                                &new_iface->totals_via_dev[IFS_TX].packets);
921         create_proc_read_entry("rx_packets", proc_iface_perms, proc_entry,
922                                read_proc_u64,
923                                &new_iface->totals_via_dev[IFS_RX].packets);
924         create_proc_read_entry("active", proc_iface_perms, proc_entry,
925                         read_proc_bool, &new_iface->active);
926
927         IF_DEBUG("qtaguid: iface_stat: create_proc(): done "
928                  "entry=%p dev=%s\n", new_iface, new_iface->ifname);
929         kfree(isw);
930 }
931
932 /*
933  * Will set the entry's active state, and
934  * update the net_dev pointer accordingly.
935  */
936 static void _iface_stat_set_active(struct iface_stat *entry,
937                                    struct net_device *net_dev,
938                                    bool activate)
939 {
940         if (activate) {
941                 entry->net_dev = net_dev;
942                 entry->active = true;
943                 IF_DEBUG("qtaguid: %s(%s): "
944                          "enable tracking. rfcnt=%d\n", __func__,
945                          entry->ifname,
946                          percpu_read(*net_dev->pcpu_refcnt));
947         } else {
948                 entry->active = false;
949                 entry->net_dev = NULL;
950                 IF_DEBUG("qtaguid: %s(%s): "
951                          "disable tracking. rfcnt=%d\n", __func__,
952                          entry->ifname,
953                          percpu_read(*net_dev->pcpu_refcnt));
954
955         }
956 }
957
958 /* Caller must hold iface_stat_list_lock */
959 static struct iface_stat *iface_alloc(struct net_device *net_dev)
960 {
961         struct iface_stat *new_iface;
962         struct iface_stat_work *isw;
963
964         new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC);
965         if (new_iface == NULL) {
966                 pr_err("qtaguid: iface_stat: create(%s): "
967                        "iface_stat alloc failed\n", net_dev->name);
968                 return NULL;
969         }
970         new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC);
971         if (new_iface->ifname == NULL) {
972                 pr_err("qtaguid: iface_stat: create(%s): "
973                        "ifname alloc failed\n", net_dev->name);
974                 kfree(new_iface);
975                 return NULL;
976         }
977         spin_lock_init(&new_iface->tag_stat_list_lock);
978         new_iface->tag_stat_tree = RB_ROOT;
979         _iface_stat_set_active(new_iface, net_dev, true);
980
981         /*
982          * ipv6 notifier chains are atomic :(
983          * No create_proc_read_entry() for you!
984          */
985         isw = kmalloc(sizeof(*isw), GFP_ATOMIC);
986         if (!isw) {
987                 pr_err("qtaguid: iface_stat: create(%s): "
988                        "work alloc failed\n", new_iface->ifname);
989                 _iface_stat_set_active(new_iface, net_dev, false);
990                 kfree(new_iface->ifname);
991                 kfree(new_iface);
992                 return NULL;
993         }
994         isw->iface_entry = new_iface;
995         INIT_WORK(&isw->iface_work, iface_create_proc_worker);
996         schedule_work(&isw->iface_work);
997         list_add(&new_iface->list, &iface_stat_list);
998         return new_iface;
999 }
1000
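/*
 * Detects a device whose byte counters went backwards (e.g. the driver reset
 * its stats) while a valid last_known snapshot is held, and folds that
 * snapshot into totals_via_dev so the reported totals keep growing
 * monotonically.
 */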
1001 static void iface_check_stats_reset_and_adjust(struct net_device *net_dev,
1002                                                struct iface_stat *iface)
1003 {
1004         struct rtnl_link_stats64 dev_stats, *stats;
1005         bool stats_rewound;
1006
1007         stats = dev_get_stats(net_dev, &dev_stats);
1008         /* No empty packets */
1009         stats_rewound =
1010                 (stats->rx_bytes < iface->last_known[IFS_RX].bytes)
1011                 || (stats->tx_bytes < iface->last_known[IFS_TX].bytes);
1012
1013         IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p "
1014                  "bytes rx/tx=%llu/%llu "
1015                  "active=%d last_known=%d "
1016                  "stats_rewound=%d\n", __func__,
1017                  net_dev ? net_dev->name : "?",
1018                  iface, net_dev,
1019                  stats->rx_bytes, stats->tx_bytes,
1020                  iface->active, iface->last_known_valid, stats_rewound);
1021
1022         if (iface->active && iface->last_known_valid && stats_rewound) {
1023                 pr_warn_once("qtaguid: iface_stat: %s(%s): "
1024                              "iface reset its stats unexpectedly\n", __func__,
1025                              net_dev->name);
1026
1027                 iface->totals_via_dev[IFS_TX].bytes +=
1028                         iface->last_known[IFS_TX].bytes;
1029                 iface->totals_via_dev[IFS_TX].packets +=
1030                         iface->last_known[IFS_TX].packets;
1031                 iface->totals_via_dev[IFS_RX].bytes +=
1032                         iface->last_known[IFS_RX].bytes;
1033                 iface->totals_via_dev[IFS_RX].packets +=
1034                         iface->last_known[IFS_RX].packets;
1035                 iface->last_known_valid = false;
1036                 IF_DEBUG("qtaguid: %s(%s): iface=%p "
1037                          "used last known bytes rx/tx=%llu/%llu\n", __func__,
1038                          iface->ifname, iface, iface->last_known[IFS_RX].bytes,
1039                          iface->last_known[IFS_TX].bytes);
1040         }
1041 }
1042
1043 /*
1044  * Create a new entry for tracking the specified interface.
1045  * Do nothing if the entry already exists.
1046  * Called when an interface is configured with a valid IP address.
1047  */
1048 static void iface_stat_create(struct net_device *net_dev,
1049                               struct in_ifaddr *ifa)
1050 {
1051         struct in_device *in_dev = NULL;
1052         const char *ifname;
1053         struct iface_stat *entry;
1054         __be32 ipaddr = 0;
1055         struct iface_stat *new_iface;
1056
1057         IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n",
1058                  net_dev ? net_dev->name : "?",
1059                  ifa, net_dev);
1060         if (!net_dev) {
1061                 pr_err("qtaguid: iface_stat: create(): no net dev\n");
1062                 return;
1063         }
1064
1065         ifname = net_dev->name;
1066         if (!ifa) {
1067                 in_dev = in_dev_get(net_dev);
1068                 if (!in_dev) {
1069                         pr_err("qtaguid: iface_stat: create(%s): no inet dev\n",
1070                                ifname);
1071                         return;
1072                 }
1073                 IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n",
1074                          ifname, in_dev);
1075                 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
1076                         IF_DEBUG("qtaguid: iface_stat: create(%s): "
1077                                  "ifa=%p ifa_label=%s\n",
1078                                  ifname, ifa,
1079                                  ifa->ifa_label ? ifa->ifa_label : "(null)");
1080                         if (ifa->ifa_label && !strcmp(ifname, ifa->ifa_label))
1081                                 break;
1082                 }
1083         }
1084
1085         if (!ifa) {
1086                 IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n",
1087                          ifname);
1088                 goto done_put;
1089         }
1090         ipaddr = ifa->ifa_local;
1091
1092         spin_lock_bh(&iface_stat_list_lock);
1093         entry = get_iface_entry(ifname);
1094         if (entry != NULL) {
1095                 bool activate = !ipv4_is_loopback(ipaddr);
1096                 IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n",
1097                          ifname, entry);
1098                 iface_check_stats_reset_and_adjust(net_dev, entry);
1099                 _iface_stat_set_active(entry, net_dev, activate);
1100                 IF_DEBUG("qtaguid: %s(%s): "
1101                          "tracking now %d on ip=%pI4\n", __func__,
1102                          entry->ifname, activate, &ipaddr);
1103                 goto done_unlock_put;
1104         } else if (ipv4_is_loopback(ipaddr)) {
1105                 IF_DEBUG("qtaguid: iface_stat: create(%s): "
1106                          "ignore loopback dev. ip=%pI4\n", ifname, &ipaddr);
1107                 goto done_unlock_put;
1108         }
1109
1110         new_iface = iface_alloc(net_dev);
1111         IF_DEBUG("qtaguid: iface_stat: create(%s): done "
1112                  "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr);
1113 done_unlock_put:
1114         spin_unlock_bh(&iface_stat_list_lock);
1115 done_put:
1116         if (in_dev)
1117                 in_dev_put(in_dev);
1118 }
1119
1120 static void iface_stat_create_ipv6(struct net_device *net_dev,
1121                                    struct inet6_ifaddr *ifa)
1122 {
1123         struct in_device *in_dev;
1124         const char *ifname;
1125         struct iface_stat *entry;
1126         struct iface_stat *new_iface;
1127         int addr_type;
1128
1129         IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n",
1130                  ifa, net_dev, net_dev ? net_dev->name : "");
1131         if (!net_dev) {
1132                 pr_err("qtaguid: iface_stat: create6(): no net dev!\n");
1133                 return;
1134         }
1135         ifname = net_dev->name;
1136
1137         in_dev = in_dev_get(net_dev);
1138         if (!in_dev) {
1139                 pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n",
1140                        ifname);
1141                 return;
1142         }
1143
1144         IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n",
1145                  ifname, in_dev);
1146
1147         if (!ifa) {
1148                 IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n",
1149                          ifname);
1150                 goto done_put;
1151         }
1152         addr_type = ipv6_addr_type(&ifa->addr);
1153
1154         spin_lock_bh(&iface_stat_list_lock);
1155         entry = get_iface_entry(ifname);
1156         if (entry != NULL) {
1157                 bool activate = !(addr_type & IPV6_ADDR_LOOPBACK);
1158                 IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
1159                          ifname, entry);
1160                 iface_check_stats_reset_and_adjust(net_dev, entry);
1161                 _iface_stat_set_active(entry, net_dev, activate);
1162                 IF_DEBUG("qtaguid: %s(%s): "
1163                          "tracking now %d on ip=%pI6c\n", __func__,
1164                          entry->ifname, activate, &ifa->addr);
1165                 goto done_unlock_put;
1166         } else if (addr_type & IPV6_ADDR_LOOPBACK) {
1167                 IF_DEBUG("qtaguid: %s(%s): "
1168                          "ignore loopback dev. ip=%pI6c\n", __func__,
1169                          ifname, &ifa->addr);
1170                 goto done_unlock_put;
1171         }
1172
1173         new_iface = iface_alloc(net_dev);
1174         IF_DEBUG("qtaguid: iface_stat: create6(%s): done "
1175                  "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr);
1176
1177 done_unlock_put:
1178         spin_unlock_bh(&iface_stat_list_lock);
1179 done_put:
1180         in_dev_put(in_dev);
1181 }
1182
1183 static struct sock_tag *get_sock_stat_nl(const struct sock *sk)
1184 {
1185         MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk);
1186         return sock_tag_tree_search(&sock_tag_tree, sk);
1187 }
1188
1189 static struct sock_tag *get_sock_stat(const struct sock *sk)
1190 {
1191         struct sock_tag *sock_tag_entry;
1192         MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk);
1193         if (!sk)
1194                 return NULL;
1195         spin_lock_bh(&sock_tag_list_lock);
1196         sock_tag_entry = get_sock_stat_nl(sk);
1197         spin_unlock_bh(&sock_tag_list_lock);
1198         return sock_tag_entry;
1199 }
1200
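/*
 * Best-effort transport protocol lookup for the skb: for IPv6 the extension
 * header chain is walked with ipv6_find_hdr() (which may fail and return a
 * negative value), for IPv4 the protocol field is read directly, and any
 * other family is reported as IPPROTO_RAW.
 */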
1201 static int ipx_proto(const struct sk_buff *skb,
1202                      struct xt_action_param *par)
1203 {
1204         int thoff, tproto;
1205
1206         switch (par->family) {
1207         case NFPROTO_IPV6:
1208                 tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
1209                 if (tproto < 0)
1210                         MT_DEBUG("%s(): transport header not found in ipv6"
1211                                  " skb=%p\n", __func__, skb);
1212                 break;
1213         case NFPROTO_IPV4:
1214                 tproto = ip_hdr(skb)->protocol;
1215                 break;
1216         default:
1217                 tproto = IPPROTO_RAW;
1218         }
1219         return tproto;
1220 }
1221
1222 static void
1223 data_counters_update(struct data_counters *dc, int set,
1224                      enum ifs_tx_rx direction, int proto, int bytes)
1225 {
1226         switch (proto) {
1227         case IPPROTO_TCP:
1228                 dc_add_byte_packets(dc, set, direction, IFS_TCP, bytes, 1);
1229                 break;
1230         case IPPROTO_UDP:
1231                 dc_add_byte_packets(dc, set, direction, IFS_UDP, bytes, 1);
1232                 break;
1233         case IPPROTO_IP:
1234         default:
1235                 dc_add_byte_packets(dc, set, direction, IFS_PROTO_OTHER, bytes,
1236                                     1);
1237                 break;
1238         }
1239 }
1240
1241 /*
1242  * Update stats for the specified interface. Do nothing if the entry
1243  * does not exist (when a device was never configured with an IP address).
1244  * Called when a device is being unregistered.
1245  */
1246 static void iface_stat_update(struct net_device *net_dev, bool stash_only)
1247 {
1248         struct rtnl_link_stats64 dev_stats, *stats;
1249         struct iface_stat *entry;
1250
1251         stats = dev_get_stats(net_dev, &dev_stats);
1252         spin_lock_bh(&iface_stat_list_lock);
1253         entry = get_iface_entry(net_dev->name);
1254         if (entry == NULL) {
1255                 IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n",
1256                          net_dev->name);
1257                 spin_unlock_bh(&iface_stat_list_lock);
1258                 return;
1259         }
1260
1261         IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
1262                  net_dev->name, entry);
1263         if (!entry->active) {
1264                 IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__,
1265                          net_dev->name);
1266                 spin_unlock_bh(&iface_stat_list_lock);
1267                 return;
1268         }
1269
1270         if (stash_only) {
1271                 entry->last_known[IFS_TX].bytes = stats->tx_bytes;
1272                 entry->last_known[IFS_TX].packets = stats->tx_packets;
1273                 entry->last_known[IFS_RX].bytes = stats->rx_bytes;
1274                 entry->last_known[IFS_RX].packets = stats->rx_packets;
1275                 entry->last_known_valid = true;
1276                 IF_DEBUG("qtaguid: %s(%s): "
1277                          "dev stats stashed rx/tx=%llu/%llu\n", __func__,
1278                          net_dev->name, stats->rx_bytes, stats->tx_bytes);
1279                 spin_unlock_bh(&iface_stat_list_lock);
1280                 return;
1281         }
1282         entry->totals_via_dev[IFS_TX].bytes += stats->tx_bytes;
1283         entry->totals_via_dev[IFS_TX].packets += stats->tx_packets;
1284         entry->totals_via_dev[IFS_RX].bytes += stats->rx_bytes;
1285         entry->totals_via_dev[IFS_RX].packets += stats->rx_packets;
1286         /* We don't need the last_known[] anymore */
1287         entry->last_known_valid = false;
1288         _iface_stat_set_active(entry, net_dev, false);
1289         IF_DEBUG("qtaguid: %s(%s): "
1290                  "disable tracking. rx/tx=%llu/%llu\n", __func__,
1291                  net_dev->name, stats->rx_bytes, stats->tx_bytes);
1292         spin_unlock_bh(&iface_stat_list_lock);
1293 }
1294
1295 /*
1296  * Update stats for the specified interface from the skb.
1297  * Do nothing if the entry
1298  * does not exist (when a device was never configured with an IP address).
1299  * Called on each sk.
1300  */
1301 static void iface_stat_update_from_skb(const struct sk_buff *skb,
1302                                        struct xt_action_param *par)
1303 {
1304         struct iface_stat *entry;
1305         const struct net_device *el_dev;
1306         enum ifs_tx_rx direction = par->in ? IFS_RX : IFS_TX;
1307         int bytes = skb->len;
1308
1309         if (!skb->dev) {
1310                 MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
1311                 el_dev = par->in ? : par->out;
1312         } else {
1313                 const struct net_device *other_dev;
1314                 el_dev = skb->dev;
1315                 other_dev = par->in ? : par->out;
1316                 if (el_dev != other_dev) {
1317                         MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
1318                                  "par->(in/out)=%p %s\n",
1319                                  par->hooknum, el_dev, el_dev->name, other_dev,
1320                                  other_dev->name);
1321                 }
1322         }
1323
1324         if (unlikely(!el_dev)) {
1325                 pr_err("qtaguid[%d]: %s(): no par->in/out?!!\n",
1326                        par->hooknum, __func__);
1327                 BUG();
1328         } else if (unlikely(!el_dev->name)) {
1329                 pr_err("qtaguid[%d]: %s(): no dev->name?!!\n",
1330                        par->hooknum, __func__);
1331                 BUG();
1332         } else {
1333                 int proto = ipx_proto(skb, par);
1334                 MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n",
1335                          par->hooknum, el_dev->name, el_dev->type,
1336                          par->family, proto);
1337         }
1338
1339         spin_lock_bh(&iface_stat_list_lock);
1340         entry = get_iface_entry(el_dev->name);
1341         if (entry == NULL) {
1342                 IF_DEBUG("qtaguid: iface_stat: %s(%s): not tracked\n",
1343                          __func__, el_dev->name);
1344                 spin_unlock_bh(&iface_stat_list_lock);
1345                 return;
1346         }
1347
1348         IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
1349                  el_dev->name, entry);
1350
1351         entry->totals_via_skb[direction].bytes += bytes;
1352         entry->totals_via_skb[direction].packets++;
1353         spin_unlock_bh(&iface_stat_list_lock);
1354 }
1355
1356 static void tag_stat_update(struct tag_stat *tag_entry,
1357                         enum ifs_tx_rx direction, int proto, int bytes)
1358 {
1359         int active_set;
1360         active_set = get_active_counter_set(tag_entry->tn.tag);
1361         MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d "
1362                  "dir=%d proto=%d bytes=%d)\n",
1363                  tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag),
1364                  active_set, direction, proto, bytes);
1365         data_counters_update(&tag_entry->counters, active_set, direction,
1366                              proto, bytes);
1367         if (tag_entry->parent_counters)
1368                 data_counters_update(tag_entry->parent_counters, active_set,
1369                                      direction, proto, bytes);
1370 }
1371
1372 /*
1373  * Create a new entry for tracking the specified {acct_tag,uid_tag} within
1374  * the interface.
1375  * iface_entry->tag_stat_list_lock should be held.
1376  */
1377 static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry,
1378                                            tag_t tag)
1379 {
1380         struct tag_stat *new_tag_stat_entry = NULL;
1381         IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx"
1382                  " (uid=%u)\n", __func__,
1383                  iface_entry, tag, get_uid_from_tag(tag));
1384         new_tag_stat_entry = kzalloc(sizeof(*new_tag_stat_entry), GFP_ATOMIC);
1385         if (!new_tag_stat_entry) {
1386                 pr_err("qtaguid: iface_stat: tag stat alloc failed\n");
1387                 goto done;
1388         }
1389         new_tag_stat_entry->tn.tag = tag;
1390         tag_stat_tree_insert(new_tag_stat_entry, &iface_entry->tag_stat_tree);
1391 done:
1392         return new_tag_stat_entry;
1393 }
1394
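/*
 * Accounts 'bytes' for the given uid/socket on interface 'ifname'. When the
 * socket carries a tag (see get_sock_stat()), the traffic is charged to the
 * {acct_tag, uid_tag} entry, whose parent_counters pointer makes the same
 * update land in the {0, uid_tag} entry as well; untagged traffic only hits
 * {0, uid_tag}. Missing tag_stat entries are created on demand under
 * iface_entry->tag_stat_list_lock.
 */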
1395 static void if_tag_stat_update(const char *ifname, uid_t uid,
1396                                const struct sock *sk, enum ifs_tx_rx direction,
1397                                int proto, int bytes)
1398 {
1399         struct tag_stat *tag_stat_entry;
1400         tag_t tag, acct_tag;
1401         tag_t uid_tag;
1402         struct data_counters *uid_tag_counters;
1403         struct sock_tag *sock_tag_entry;
1404         struct iface_stat *iface_entry;
1405         struct tag_stat *new_tag_stat = NULL;
1406         MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s "
1407                 "uid=%u sk=%p dir=%d proto=%d bytes=%d)\n",
1408                  ifname, uid, sk, direction, proto, bytes);
1409
1410
1411         iface_entry = get_iface_entry(ifname);
1412         if (!iface_entry) {
1413                 pr_err("qtaguid: iface_stat: stat_update() %s not found\n",
1414                        ifname);
1415                 return;
1416         }
1417         /* It is ok to process data when an iface_entry is inactive */
1418
1419         MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n",
1420                  ifname, iface_entry);
1421
1422         /*
1423          * Look for a tagged sock.
1424          * It will have a tag combining an acct_tag with the owner's uid.
1425          */
1426         sock_tag_entry = get_sock_stat(sk);
1427         if (sock_tag_entry) {
1428                 tag = sock_tag_entry->tag;
1429                 acct_tag = get_atag_from_tag(tag);
1430                 uid_tag = get_utag_from_tag(tag);
1431         } else {
1432                 acct_tag = make_atag_from_value(0);
1433                 tag = combine_atag_with_uid(acct_tag, uid);
1434                 uid_tag = make_tag_from_uid(uid);
1435         }
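        /*
         * Worked example of the tag layout (per xt_qtaguid_internal.h the
         * acct_tag occupies the upper 32 bits and the uid the lower 32;
         * values below are illustrative):
         *   uid 10005, acct_tag value 2
         *   acct_tag = make_atag_from_value(2)    = 0x0000000200000000
         *   uid_tag  = make_tag_from_uid(10005)   = 0x0000000000002715
         *   tag      = combine_atag_with_uid(...) = 0x0000000200002715
         *   get_uid_from_tag(tag)                 = 10005
         */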
1436         MT_DEBUG("qtaguid: iface_stat: stat_update(): "
1437                  " looking for tag=0x%llx (uid=%u) in ife=%p\n",
1438                  tag, get_uid_from_tag(tag), iface_entry);
1439         /* Loop over tag list under this interface for {acct_tag,uid_tag} */
1440         spin_lock_bh(&iface_entry->tag_stat_list_lock);
1441
1442         tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
1443                                               tag);
1444         if (tag_stat_entry) {
1445                 /*
1446                  * Updating the {acct_tag, uid_tag} entry handles both stats:
1447                  * {0, uid_tag} will also get updated.
1448                  */
1449                 tag_stat_update(tag_stat_entry, direction, proto, bytes);
1450                 spin_unlock_bh(&iface_entry->tag_stat_list_lock);
1451                 return;
1452         }
1453
1454         /* Loop over tag list under this interface for {0,uid_tag} */
1455         tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
1456                                               uid_tag);
1457         if (!tag_stat_entry) {
1458                 /* Here: the base uid_tag did not exist */
1459                 /*
1460                  * No parent counters yet, so there are
1461                  *  - no {0, uid_tag} stats and no {acct_tag, uid_tag} stats.
1462                  */
1463                 new_tag_stat = create_if_tag_stat(iface_entry, uid_tag);
1464                 uid_tag_counters = &new_tag_stat->counters;
1465         } else {
1466                 uid_tag_counters = &tag_stat_entry->counters;
1467         }
1468
1469         if (acct_tag) {
1470                 /* Create the child {acct_tag, uid_tag} and hook up parent. */
1471                 new_tag_stat = create_if_tag_stat(iface_entry, tag);
1472                 new_tag_stat->parent_counters = uid_tag_counters;
1473         } else {
1474                 /*
1475                  * For new_tag_stat to be still NULL here would require:
1476                  *  {0, uid_tag} exists
1477                  *  and {acct_tag, uid_tag} doesn't exist
1478                  *  AND acct_tag == 0.
1479                  * Impossible. This reassures us that new_tag_stat
1480                  * below will always be assigned.
1481                  */
1482                 BUG_ON(!new_tag_stat);
1483         }
1484         tag_stat_update(new_tag_stat, direction, proto, bytes);
1485         spin_unlock_bh(&iface_entry->tag_stat_list_lock);
1486 }
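/*
 * Illustrative end state of the above for one interface (e.g. "wlan0"):
 * traffic on a socket tagged 0x0000000200002715 by uid 10005 ends up with two
 * tag_stat entries in iface_entry->tag_stat_tree:
 *   {acct_tag=0, uid_tag=10005}  parent, holds the per-uid totals
 *   {acct_tag=2, uid_tag=10005}  child, its parent_counters points at the
 *                                parent's counters so both get updated
 * Untagged traffic from the same uid only touches the {0, 10005} entry.
 */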
1487
1488 static int iface_netdev_event_handler(struct notifier_block *nb,
1489                                       unsigned long event, void *ptr) {
1490         struct net_device *dev = ptr;
1491
1492         if (unlikely(module_passive))
1493                 return NOTIFY_DONE;
1494
1495         IF_DEBUG("qtaguid: iface_stat: netdev_event(): "
1496                  "ev=0x%lx/%s netdev=%p->name=%s\n",
1497                  event, netdev_evt_str(event), dev, dev ? dev->name : "");
1498
1499         switch (event) {
1500         case NETDEV_UP:
1501                 iface_stat_create(dev, NULL);
1502                 atomic64_inc(&qtu_events.iface_events);
1503                 break;
1504         case NETDEV_DOWN:
1505         case NETDEV_UNREGISTER:
1506                 iface_stat_update(dev, event == NETDEV_DOWN);
1507                 atomic64_inc(&qtu_events.iface_events);
1508                 break;
1509         }
1510         return NOTIFY_DONE;
1511 }
1512
1513 static int iface_inet6addr_event_handler(struct notifier_block *nb,
1514                                          unsigned long event, void *ptr)
1515 {
1516         struct inet6_ifaddr *ifa = ptr;
1517         struct net_device *dev;
1518
1519         if (unlikely(module_passive))
1520                 return NOTIFY_DONE;
1521
1522         IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): "
1523                  "ev=0x%lx/%s ifa=%p\n",
1524                  event, netdev_evt_str(event), ifa);
1525
1526         switch (event) {
1527         case NETDEV_UP:
1528                 BUG_ON(!ifa || !ifa->idev);
1529                 dev = (struct net_device *)ifa->idev->dev;
1530                 iface_stat_create_ipv6(dev, ifa);
1531                 atomic64_inc(&qtu_events.iface_events);
1532                 break;
1533         case NETDEV_DOWN:
1534         case NETDEV_UNREGISTER:
1535                 BUG_ON(!ifa || !ifa->idev);
1536                 dev = (struct net_device *)ifa->idev->dev;
1537                 iface_stat_update(dev, event == NETDEV_DOWN);
1538                 atomic64_inc(&qtu_events.iface_events);
1539                 break;
1540         }
1541         return NOTIFY_DONE;
1542 }
1543
1544 static int iface_inetaddr_event_handler(struct notifier_block *nb,
1545                                         unsigned long event, void *ptr)
1546 {
1547         struct in_ifaddr *ifa = ptr;
1548         struct net_device *dev;
1549
1550         if (unlikely(module_passive))
1551                 return NOTIFY_DONE;
1552
1553         IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): "
1554                  "ev=0x%lx/%s ifa=%p\n",
1555                  event, netdev_evt_str(event), ifa);
1556
1557         switch (event) {
1558         case NETDEV_UP:
1559                 BUG_ON(!ifa || !ifa->ifa_dev);
1560                 dev = ifa->ifa_dev->dev;
1561                 iface_stat_create(dev, ifa);
1562                 atomic64_inc(&qtu_events.iface_events);
1563                 break;
1564         case NETDEV_DOWN:
1565         case NETDEV_UNREGISTER:
1566                 BUG_ON(!ifa || !ifa->ifa_dev);
1567                 dev = ifa->ifa_dev->dev;
1568                 iface_stat_update(dev, event == NETDEV_DOWN);
1569                 atomic64_inc(&qtu_events.iface_events);
1570                 break;
1571         }
1572         return NOTIFY_DONE;
1573 }
1574
1575 static struct notifier_block iface_netdev_notifier_blk = {
1576         .notifier_call = iface_netdev_event_handler,
1577 };
1578
1579 static struct notifier_block iface_inetaddr_notifier_blk = {
1580         .notifier_call = iface_inetaddr_event_handler,
1581 };
1582
1583 static struct notifier_block iface_inet6addr_notifier_blk = {
1584         .notifier_call = iface_inet6addr_event_handler,
1585 };
1586
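/*
 * Sketch of the resulting procfs layout (parent_procdir is created by the
 * module's proc setup, typically /proc/net/xt_qtaguid):
 *   <parent_procdir>/iface_stat/      per-interface subdirs, populated
 *                                     elsewhere in this file
 *   <parent_procdir>/iface_stat_all   fmt1 output of iface_stat_fmt_proc_read()
 *   <parent_procdir>/iface_stat_fmt   fmt2 output, selected via ->data
 */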
1587 static int __init iface_stat_init(struct proc_dir_entry *parent_procdir)
1588 {
1589         int err;
1590
1591         iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir);
1592         if (!iface_stat_procdir) {
1593                 pr_err("qtaguid: iface_stat: init failed to create proc entry\n");
1594                 err = -1;
1595                 goto err;
1596         }
1597
1598         iface_stat_all_procfile = create_proc_entry(iface_stat_all_procfilename,
1599                                                     proc_iface_perms,
1600                                                     parent_procdir);
1601         if (!iface_stat_all_procfile) {
1602                 pr_err("qtaguid: iface_stat: init "
1603                        "failed to create iface_stat_all proc entry\n");
1604                 err = -1;
1605                 goto err_zap_entry;
1606         }
1607         iface_stat_all_procfile->read_proc = iface_stat_fmt_proc_read;
1608         iface_stat_all_procfile->data = (void *)1; /* fmt1 */
1609
1610         iface_stat_fmt_procfile = create_proc_entry(iface_stat_fmt_procfilename,
1611                                                     proc_iface_perms,
1612                                                     parent_procdir);
1613         if (!iface_stat_fmt_procfile) {
1614                 pr_err("qtaguid: iface_stat: init "
1615                        "failed to create iface_stat_fmt proc entry\n");
1616                 err = -1;
1617                 goto err_zap_all_stats_entry;
1618         }
1619         iface_stat_fmt_procfile->read_proc = iface_stat_fmt_proc_read;
1620         iface_stat_fmt_procfile->data = (void *)2; /* fmt2 */
1621
1622
1623         err = register_netdevice_notifier(&iface_netdev_notifier_blk);
1624         if (err) {
1625                 pr_err("qtaguid: iface_stat: init "
1626                        "failed to register dev event handler\n");
1627                 goto err_zap_all_stats_entries;
1628         }
1629         err = register_inetaddr_notifier(&iface_inetaddr_notifier_blk);
1630         if (err) {
1631                 pr_err("qtaguid: iface_stat: init "
1632                        "failed to register ipv4 dev event handler\n");
1633                 goto err_unreg_nd;
1634         }
1635
1636         err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk);
1637         if (err) {
1638                 pr_err("qtaguid: iface_stat: init "
1639                        "failed to register ipv6 dev event handler\n");
1640                 goto err_unreg_ip4_addr;
1641         }
1642         return 0;
1643
1644 err_unreg_ip4_addr:
1645         unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk);
1646 err_unreg_nd:
1647         unregister_netdevice_notifier(&iface_netdev_notifier_blk);
1648 err_zap_all_stats_entries:
1649         remove_proc_entry(iface_stat_fmt_procfilename, parent_procdir);
1650 err_zap_all_stats_entry:
1651         remove_proc_entry(iface_stat_all_procfilename, parent_procdir);
1652 err_zap_entry:
1653         remove_proc_entry(iface_stat_procdirname, parent_procdir);
1654 err:
1655         return err;
1656 }
1657
1658 static struct sock *qtaguid_find_sk(const struct sk_buff *skb,
1659                                     struct xt_action_param *par)
1660 {
1661         struct sock *sk;
1662         unsigned int hook_mask = (1 << par->hooknum);
1663
1664         MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb,
1665                  par->hooknum, par->family);
1666
1667         /*
1668          * Let's not abuse the xt_socket_get*_sk(), or else it will
1669          * return garbage SKs.
1670          */
1671         if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS))
1672                 return NULL;
1673
1674         switch (par->family) {
1675         case NFPROTO_IPV6:
1676                 sk = xt_socket_get6_sk(skb, par);
1677                 break;
1678         case NFPROTO_IPV4:
1679                 sk = xt_socket_get4_sk(skb, par);
1680                 break;
1681         default:
1682                 return NULL;
1683         }
1684
1685         /*
1686          * Seems to be issues on the file ptr for TCP_TIME_WAIT SKs.
1687          * http://kerneltrap.org/mailarchive/linux-netdev/2010/10/21/6287959
1688          * Not fixed in 3.0-r3 :(
1689          */
1690         if (sk) {
1691                 MT_DEBUG("qtaguid: %p->sk_proto=%u "
1692                          "->sk_state=%d\n", sk, sk->sk_protocol, sk->sk_state);
1693                 if (sk->sk_state  == TCP_TIME_WAIT) {
1694                         xt_socket_put_sk(sk);
1695                         sk = NULL;
1696                 }
1697         }
1698         return sk;
1699 }
1700
1701 static void account_for_uid(const struct sk_buff *skb,
1702                             const struct sock *alternate_sk, uid_t uid,
1703                             struct xt_action_param *par)
1704 {
1705         const struct net_device *el_dev;
1706
1707         if (!skb->dev) {
1708                 MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
1709                 el_dev = par->in ? : par->out;
1710         } else {
1711                 const struct net_device *other_dev;
1712                 el_dev = skb->dev;
1713                 other_dev = par->in ? : par->out;
1714                 if (el_dev != other_dev) {
1715                         MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
1716                                 "par->(in/out)=%p %s\n",
1717                                 par->hooknum, el_dev, el_dev->name, other_dev,
1718                                 other_dev->name);
1719                 }
1720         }
1721
1722         if (unlikely(!el_dev)) {
1723                 pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum);
1724         } else if (unlikely(!el_dev->name)) {
1725                 pr_info("qtaguid[%d]: no dev->name?!!\n", par->hooknum);
1726         } else {
1727                 int proto = ipx_proto(skb, par);
1728                 MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n",
1729                          par->hooknum, el_dev->name, el_dev->type,
1730                          par->family, proto);
1731
1732                 if_tag_stat_update(el_dev->name, uid,
1733                                 skb->sk ? skb->sk : alternate_sk,
1734                                 par->in ? IFS_RX : IFS_TX,
1735                                 proto, skb->len);
1736         }
1737 }
1738
1739 static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
1740 {
1741         const struct xt_qtaguid_match_info *info = par->matchinfo;
1742         const struct file *filp;
1743         bool got_sock = false;
1744         struct sock *sk;
1745         uid_t sock_uid;
1746         bool res;
1747
1748         if (unlikely(module_passive))
1749                 return (info->match ^ info->invert) == 0;
1750
1751         MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n",
1752                  par->hooknum, skb, par->in, par->out, par->family);
1753
1754         atomic64_inc(&qtu_events.match_calls);
1755         if (skb == NULL) {
1756                 res = (info->match ^ info->invert) == 0;
1757                 goto ret_res;
1758         }
1759
1760         switch (par->hooknum) {
1761         case NF_INET_PRE_ROUTING:
1762         case NF_INET_POST_ROUTING:
1763                 atomic64_inc(&qtu_events.match_calls_prepost);
1764                 iface_stat_update_from_skb(skb, par);
1765                 /*
1766                  * We are done in pre/post. The skb will get processed
1767                  * further, later on.
1768                  */
1769                 res = (info->match ^ info->invert);
1770                 goto ret_res;
1771                 break;
1772         /* default: Fall through and do UID related work */
1773         }
1774
1775         sk = skb->sk;
1776         if (sk == NULL) {
1777                 /*
1778                  * A missing skb->sk happens when packets are in-flight
1779                  * and the matching socket has already been closed and gone.
1780                  */
1781                 sk = qtaguid_find_sk(skb, par);
1782                 /*
1783                  * If we got the socket from the find_sk(), we will need to put
1784                  * it back, as nf_tproxy_get_sock_v4() got it.
1785                  */
1786                 got_sock = sk;
1787                 if (sk)
1788                         atomic64_inc(&qtu_events.match_found_sk_in_ct);
1789                 else
1790                         atomic64_inc(&qtu_events.match_found_no_sk_in_ct);
1791         } else {
1792                 atomic64_inc(&qtu_events.match_found_sk);
1793         }
1794         MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n",
1795                  par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par));
1796         if (sk != NULL) {
1797                 MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n",
1798                         par->hooknum, sk, sk->sk_socket,
1799                         sk->sk_socket ? sk->sk_socket->file : (void *)-1LL);
1800                 filp = sk->sk_socket ? sk->sk_socket->file : NULL;
1801                 MT_DEBUG("qtaguid[%d]: filp...uid=%u\n",
1802                         par->hooknum, filp ? filp->f_cred->fsuid : -1);
1803         }
1804
1805         if (sk == NULL || sk->sk_socket == NULL) {
1806                 /*
1807                  * Here, the qtaguid_find_sk() using connection tracking
1808                  * couldn't find the owner, so for now we just count them
1809                  * against the system.
1810                  */
1811                 /*
1812                  * TODO: unhack how to force just accounting.
1813                  * For now we only do iface stats when the uid-owner is not
1814                  * requested.
1815                  */
1816                 if (!(info->match & XT_QTAGUID_UID))
1817                         account_for_uid(skb, sk, 0, par);
1818                 MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n",
1819                         par->hooknum,
1820                         sk ? sk->sk_socket : NULL);
1821                 res = (info->match ^ info->invert) == 0;
1822                 atomic64_inc(&qtu_events.match_no_sk);
1823                 goto put_sock_ret_res;
1824         } else if (info->match & info->invert & XT_QTAGUID_SOCKET) {
1825                 res = false;
1826                 goto put_sock_ret_res;
1827         }
1828         filp = sk->sk_socket->file;
1829         if (filp == NULL) {
1830                 MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum);
1831                 account_for_uid(skb, sk, 0, par);
1832                 res = ((info->match ^ info->invert) &
1833                         (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0;
1834                 atomic64_inc(&qtu_events.match_no_sk_file);
1835                 goto put_sock_ret_res;
1836         }
1837         sock_uid = filp->f_cred->fsuid;
1838         /*
1839          * TODO: unhack how to force just accounting.
1840          * For now we only do iface stats when the uid-owner is not requested
1841          */
1842         if (!(info->match & XT_QTAGUID_UID))
1843                 account_for_uid(skb, sk, sock_uid, par);
1844
1845         /*
1846          * The following two tests fail the match when:
1847          *    id not in range AND no inverted condition requested
1848          * or id     in range AND    inverted condition requested
1849          * Thus (!a && b) || (a && !b) == a ^ b
1850          */
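        /*
         * Concretely, for the uid test below (the gid test is analogous):
         *   uid in [uid_min, uid_max], invert clear -> keep going, can match
         *   uid in [uid_min, uid_max], invert set   -> res = false
         *   uid out of range,          invert clear -> res = false
         *   uid out of range,          invert set   -> keep going, can match
         */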
1851         if (info->match & XT_QTAGUID_UID)
1852                 if ((filp->f_cred->fsuid >= info->uid_min &&
1853                      filp->f_cred->fsuid <= info->uid_max) ^
1854                     !(info->invert & XT_QTAGUID_UID)) {
1855                         MT_DEBUG("qtaguid[%d]: leaving uid not matching\n",
1856                                  par->hooknum);
1857                         res = false;
1858                         goto put_sock_ret_res;
1859                 }
1860         if (info->match & XT_QTAGUID_GID)
1861                 if ((filp->f_cred->fsgid >= info->gid_min &&
1862                                 filp->f_cred->fsgid <= info->gid_max) ^
1863                         !(info->invert & XT_QTAGUID_GID)) {
1864                         MT_DEBUG("qtaguid[%d]: leaving gid not matching\n",
1865                                 par->hooknum);
1866                         res = false;
1867                         goto put_sock_ret_res;
1868                 }
1869
1870         MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum);
1871         res = true;
1872
1873 put_sock_ret_res:
1874         if (got_sock)
1875                 xt_socket_put_sk(sk);
1876 ret_res:
1877         MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res);
1878         return res;
1879 }
1880
1881 #ifdef DDEBUG
1882 /* This function is not in xt_qtaguid_print.c because of locks visibility */
1883 static void prdebug_full_state(int indent_level, const char *fmt, ...)
1884 {
1885         va_list args;
1886         char *fmt_buff;
1887         char *buff;
1888
1889         if (likely(!(qtaguid_debug_mask & DDEBUG_MASK)))
1890                 return;
1891
1892         fmt_buff = kasprintf(GFP_ATOMIC,
1893                              "qtaguid: %s(): %s {\n", __func__, fmt);
1894         BUG_ON(!fmt_buff);
1895         va_start(args, fmt);
1896         buff = kvasprintf(GFP_ATOMIC,
1897                           fmt_buff, args);
1898         BUG_ON(!buff);
1899         pr_debug("%s", buff);
1900         kfree(fmt_buff);
1901         kfree(buff);
1902         va_end(args);
1903
1904         spin_lock_bh(&sock_tag_list_lock);
1905         prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
1906         spin_unlock_bh(&sock_tag_list_lock);
1907
1908         spin_lock_bh(&sock_tag_list_lock);
1909         spin_lock_bh(&uid_tag_data_tree_lock);
1910         prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
1911         prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
1912         spin_unlock_bh(&uid_tag_data_tree_lock);
1913         spin_unlock_bh(&sock_tag_list_lock);
1914
1915         spin_lock_bh(&iface_stat_list_lock);
1916         prdebug_iface_stat_list(indent_level, &iface_stat_list);
1917         spin_unlock_bh(&iface_stat_list_lock);
1918
1919         pr_debug("qtaguid: %s(): }\n", __func__);
1920 }
1921 #else
1922 static void prdebug_full_state(int indent_level, const char *fmt, ...) {}
1923 #endif
1924
1925 /*
1926  * Procfs reader to get all active socket tags using style "1)" as described in
1927  * fs/proc/generic.c
1928  */
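/*
 * Example of what a read of this file produces (pointer and counter values
 * are illustrative):
 *   sock=ffff88003a245800 tag=0x200002715 (uid=10005) pid=1234 f_count=3
 *   events: sockets_tagged=5 sockets_untagged=4 counter_set_changes=2 ...
 */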
1929 static int qtaguid_ctrl_proc_read(char *page, char **num_items_returned,
1930                                   off_t items_to_skip, int char_count, int *eof,
1931                                   void *data)
1932 {
1933         char *outp = page;
1934         int len;
1935         uid_t uid;
1936         struct rb_node *node;
1937         struct sock_tag *sock_tag_entry;
1938         int item_index = 0;
1939         int indent_level = 0;
1940         long f_count;
1941
1942         if (unlikely(module_passive)) {
1943                 *eof = 1;
1944                 return 0;
1945         }
1946
1947         if (*eof)
1948                 return 0;
1949
1950         CT_DEBUG("qtaguid: proc ctrl pid=%u tgid=%u uid=%u "
1951                  "page=%p off=%ld char_count=%d *eof=%d\n",
1952                  current->pid, current->tgid, current_fsuid(),
1953                  page, items_to_skip, char_count, *eof);
1954
1955         spin_lock_bh(&sock_tag_list_lock);
1956         for (node = rb_first(&sock_tag_tree);
1957              node;
1958              node = rb_next(node)) {
1959                 if (item_index++ < items_to_skip)
1960                         continue;
1961                 sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
1962                 uid = get_uid_from_tag(sock_tag_entry->tag);
1963                 CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) "
1964                          "pid=%u\n",
1965                          sock_tag_entry->sk,
1966                          sock_tag_entry->tag,
1967                          uid,
1968                          sock_tag_entry->pid
1969                         );
1970                 f_count = atomic_long_read(
1971                         &sock_tag_entry->socket->file->f_count);
1972                 len = snprintf(outp, char_count,
1973                                "sock=%p tag=0x%llx (uid=%u) pid=%u "
1974                                "f_count=%lu\n",
1975                                sock_tag_entry->sk,
1976                                sock_tag_entry->tag, uid,
1977                                sock_tag_entry->pid, f_count);
1978                 if (len >= char_count) {
1979                         spin_unlock_bh(&sock_tag_list_lock);
1980                         *outp = '\0';
1981                         return outp - page;
1982                 }
1983                 outp += len;
1984                 char_count -= len;
1985                 (*num_items_returned)++;
1986         }
1987         spin_unlock_bh(&sock_tag_list_lock);
1988
1989         if (item_index++ >= items_to_skip) {
1990                 len = snprintf(outp, char_count,
1991                                "events: sockets_tagged=%llu "
1992                                "sockets_untagged=%llu "
1993                                "counter_set_changes=%llu "
1994                                "delete_cmds=%llu "
1995                                "iface_events=%llu "
1996                                "match_calls=%llu "
1997                                "match_calls_prepost=%llu "
1998                                "match_found_sk=%llu "
1999                                "match_found_sk_in_ct=%llu "
2000                                "match_found_no_sk_in_ct=%llu "
2001                                "match_no_sk=%llu "
2002                                "match_no_sk_file=%llu\n",
2003                                atomic64_read(&qtu_events.sockets_tagged),
2004                                atomic64_read(&qtu_events.sockets_untagged),
2005                                atomic64_read(&qtu_events.counter_set_changes),
2006                                atomic64_read(&qtu_events.delete_cmds),
2007                                atomic64_read(&qtu_events.iface_events),
2008                                atomic64_read(&qtu_events.match_calls),
2009                                atomic64_read(&qtu_events.match_calls_prepost),
2010                                atomic64_read(&qtu_events.match_found_sk),
2011                                atomic64_read(&qtu_events.match_found_sk_in_ct),
2012                                atomic64_read(
2013                                        &qtu_events.match_found_no_sk_in_ct),
2014                                atomic64_read(&qtu_events.match_no_sk),
2015                                atomic64_read(&qtu_events.match_no_sk_file));
2016                 if (len >= char_count) {
2017                         *outp = '\0';
2018                         return outp - page;
2019                 }
2020                 outp += len;
2021                 char_count -= len;
2022                 (*num_items_returned)++;
2023         }
2024
2025         /* Count the following as part of the last item_index */
2026         if (item_index > items_to_skip) {
2027                 prdebug_full_state(indent_level, "proc ctrl");
2028         }
2029
2030         *eof = 1;
2031         return outp - page;
2032 }
2033
2034 /*
2035  * Delete socket tags, and stat tags associated with a given
2036  * accounting tag and uid.
2037  */
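/*
 * Ctrl command format (see the sscanf below): "d <acct_tag> [<uid>]".
 * Illustrative examples, with the acct_tag given in its shifted "atag" form
 * (value 2 << 32 == 8589934592):
 *   "d 8589934592 10005"   delete the {acct_tag=2, uid=10005} entries
 *   "d 0 10005"            delete everything belonging to uid 10005
 * The uid defaults to current_fsuid() when omitted.
 */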
2038 static int ctrl_cmd_delete(const char *input)
2039 {
2040         char cmd;
2041         uid_t uid;
2042         uid_t entry_uid;
2043         tag_t acct_tag;
2044         tag_t tag;
2045         int res, argc;
2046         struct iface_stat *iface_entry;
2047         struct rb_node *node;
2048         struct sock_tag *st_entry;
2049         struct rb_root st_to_free_tree = RB_ROOT;
2050         struct tag_stat *ts_entry;
2051         struct tag_counter_set *tcs_entry;
2052         struct tag_ref *tr_entry;
2053         struct uid_tag_data *utd_entry;
2054
2055         argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid);
2056         CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c "
2057                  "user_tag=0x%llx uid=%u\n", input, argc, cmd,
2058                  acct_tag, uid);
2059         if (argc < 2) {
2060                 res = -EINVAL;
2061                 goto err;
2062         }
2063         if (!valid_atag(acct_tag)) {
2064                 pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input);
2065                 res = -EINVAL;
2066                 goto err;
2067         }
2068         if (argc < 3) {
2069                 uid = current_fsuid();
2070         } else if (!can_impersonate_uid(uid)) {
2071                 pr_info("qtaguid: ctrl_delete(%s): "
2072                         "insufficient priv from pid=%u tgid=%u uid=%u\n",
2073                         input, current->pid, current->tgid, current_fsuid());
2074                 res = -EPERM;
2075                 goto err;
2076         }
2077
2078         tag = combine_atag_with_uid(acct_tag, uid);
2079         CT_DEBUG("qtaguid: ctrl_delete(%s): "
2080                  "looking for tag=0x%llx (uid=%u)\n",
2081                  input, tag, uid);
2082
2083         /* Delete socket tags */
2084         spin_lock_bh(&sock_tag_list_lock);
2085         node = rb_first(&sock_tag_tree);
2086         while (node) {
2087                 st_entry = rb_entry(node, struct sock_tag, sock_node);
2088                 entry_uid = get_uid_from_tag(st_entry->tag);
2089                 node = rb_next(node);
2090                 if (entry_uid != uid)
2091                         continue;
2092
2093                 CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n",
2094                          input, st_entry->tag, entry_uid);
2095
2096                 if (!acct_tag || st_entry->tag == tag) {
2097                         rb_erase(&st_entry->sock_node, &sock_tag_tree);
2098                         /* Can't sockfd_put() within spinlock, do it later. */
2099                         sock_tag_tree_insert(st_entry, &st_to_free_tree);
2100                         tr_entry = lookup_tag_ref(st_entry->tag, NULL);
2101                         BUG_ON(tr_entry->num_sock_tags <= 0);
2102                         tr_entry->num_sock_tags--;
2103                         /*
2104                          * TODO: remove if, and start failing.
2105                          * This is a hack to work around the fact that in some
2106                          * places we have "if (IS_ERR_OR_NULL(pqd_entry))"
2107                          * and are trying to work around apps
2108                          * that didn't open the /dev/xt_qtaguid.
2109                          */
2110                         if (st_entry->list.next && st_entry->list.prev)
2111                                 list_del(&st_entry->list);
2112                 }
2113         }
2114         spin_unlock_bh(&sock_tag_list_lock);
2115
2116         sock_tag_tree_erase(&st_to_free_tree);
2117
2118         /* Delete tag counter-sets */
2119         spin_lock_bh(&tag_counter_set_list_lock);
2120         /* Counter sets are only on the uid tag, not full tag */
2121         tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
2122         if (tcs_entry) {
2123                 CT_DEBUG("qtaguid: ctrl_delete(%s): "
2124                          "erase tcs: tag=0x%llx (uid=%u) set=%d\n",
2125                          input,
2126                          tcs_entry->tn.tag,
2127                          get_uid_from_tag(tcs_entry->tn.tag),
2128                          tcs_entry->active_set);
2129                 rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree);
2130                 kfree(tcs_entry);
2131         }
2132         spin_unlock_bh(&tag_counter_set_list_lock);
2133
2134         /*
2135          * If acct_tag is 0, then all entries belonging to uid are
2136          * erased.
2137          */
2138         spin_lock_bh(&iface_stat_list_lock);
2139         list_for_each_entry(iface_entry, &iface_stat_list, list) {
2140                 spin_lock_bh(&iface_entry->tag_stat_list_lock);
2141                 node = rb_first(&iface_entry->tag_stat_tree);
2142                 while (node) {
2143                         ts_entry = rb_entry(node, struct tag_stat, tn.node);
2144                         entry_uid = get_uid_from_tag(ts_entry->tn.tag);
2145                         node = rb_next(node);
2146
2147                         CT_DEBUG("qtaguid: ctrl_delete(%s): "
2148                                  "ts tag=0x%llx (uid=%u)\n",
2149                                  input, ts_entry->tn.tag, entry_uid);
2150
2151                         if (entry_uid != uid)
2152                                 continue;
2153                         if (!acct_tag || ts_entry->tn.tag == tag) {
2154                                 CT_DEBUG("qtaguid: ctrl_delete(%s): "
2155                                          "erase ts: %s 0x%llx %u\n",
2156                                          input, iface_entry->ifname,
2157                                          get_atag_from_tag(ts_entry->tn.tag),
2158                                          entry_uid);
2159                                 rb_erase(&ts_entry->tn.node,
2160                                          &iface_entry->tag_stat_tree);
2161                                 kfree(ts_entry);
2162                         }
2163                 }
2164                 spin_unlock_bh(&iface_entry->tag_stat_list_lock);
2165         }
2166         spin_unlock_bh(&iface_stat_list_lock);
2167
2168         /* Cleanup the uid_tag_data */
2169         spin_lock_bh(&uid_tag_data_tree_lock);
2170         node = rb_first(&uid_tag_data_tree);
2171         while (node) {
2172                 utd_entry = rb_entry(node, struct uid_tag_data, node);
2173                 entry_uid = utd_entry->uid;
2174                 node = rb_next(node);
2175
2176                 CT_DEBUG("qtaguid: ctrl_delete(%s): "
2177                          "utd uid=%u\n",
2178                          input, entry_uid);
2179
2180                 if (entry_uid != uid)
2181                         continue;
2182                 /*
2183                  * Go over the tag_refs, and those that don't have
2184                  * sock_tags using them are freed.
2185                  */
2186                 put_tag_ref_tree(tag, utd_entry);
2187                 put_utd_entry(utd_entry);
2188         }
2189         spin_unlock_bh(&uid_tag_data_tree_lock);
2190
2191         atomic64_inc(&qtu_events.delete_cmds);
2192         res = 0;
2193
2194 err:
2195         return res;
2196 }
2197
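/*
 * Ctrl command format: "s <counter_set> <uid>".
 * e.g. "s 1 10005" (illustrative) switches uid 10005's active counter set to 1.
 * The set must be in [0, IFS_MAX_COUNTER_SETS) and the caller needs
 * can_manipulate_uids().
 */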
2198 static int ctrl_cmd_counter_set(const char *input)
2199 {
2200         char cmd;
2201         uid_t uid = 0;
2202         tag_t tag;
2203         int res, argc;
2204         struct tag_counter_set *tcs;
2205         int counter_set;
2206
2207         argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid);
2208         CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c "
2209                  "set=%d uid=%u\n", input, argc, cmd,
2210                  counter_set, uid);
2211         if (argc != 3) {
2212                 res = -EINVAL;
2213                 goto err;
2214         }
2215         if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) {
2216                 pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n",
2217                         input);
2218                 res = -EINVAL;
2219                 goto err;
2220         }
2221         if (!can_manipulate_uids()) {
2222                 pr_info("qtaguid: ctrl_counterset(%s): "
2223                         "insufficient priv from pid=%u tgid=%u uid=%u\n",
2224                         input, current->pid, current->tgid, current_fsuid());
2225                 res = -EPERM;
2226                 goto err;
2227         }
2228
2229         tag = make_tag_from_uid(uid);
2230         spin_lock_bh(&tag_counter_set_list_lock);
2231         tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
2232         if (!tcs) {
2233                 tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC);
2234                 if (!tcs) {
2235                         spin_unlock_bh(&tag_counter_set_list_lock);
2236                         pr_err("qtaguid: ctrl_counterset(%s): "
2237                                "failed to alloc counter set\n",
2238                                input);
2239                         res = -ENOMEM;
2240                         goto err;
2241                 }
2242                 tcs->tn.tag = tag;
2243                 tag_counter_set_tree_insert(tcs, &tag_counter_set_tree);
2244                 CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx "
2245                          "(uid=%u) set=%d\n",
2246                          input, tag, get_uid_from_tag(tag), counter_set);
2247         }
2248         tcs->active_set = counter_set;
2249         spin_unlock_bh(&tag_counter_set_list_lock);
2250         atomic64_inc(&qtu_events.counter_set_changes);
2251         res = 0;
2252
2253 err:
2254         return res;
2255 }
2256
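/*
 * Ctrl command format: "t <sock_fd> [<acct_tag> [<uid>]]".
 * Illustrative examples (fd and values made up; the acct_tag is again the
 * shifted atag form):
 *   "t 12 8589934592 10005"   tag the socket behind fd 12 for uid 10005
 *   "t 12"                    tag fd 12 with acct_tag 0 for current_fsuid()
 * <sock_fd> is a socket fd in the caller's fd table, resolved below via
 * sockfd_lookup().
 */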
2257 static int ctrl_cmd_tag(const char *input)
2258 {
2259         char cmd;
2260         int sock_fd = 0;
2261         uid_t uid = 0;
2262         tag_t acct_tag = make_atag_from_value(0);
2263         tag_t full_tag;
2264         struct socket *el_socket;
2265         int res, argc;
2266         struct sock_tag *sock_tag_entry;
2267         struct tag_ref *tag_ref_entry;
2268         struct uid_tag_data *uid_tag_data_entry;
2269         struct proc_qtu_data *pqd_entry;
2270
2271         /* Unassigned args will get defaulted later. */
2272         argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid);
2273         CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d "
2274                  "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd,
2275                  acct_tag, uid);
2276         if (argc < 2) {
2277                 res = -EINVAL;
2278                 goto err;
2279         }
2280         el_socket = sockfd_lookup(sock_fd, &res);  /* This locks the file */
2281         if (!el_socket) {
2282                 pr_info("qtaguid: ctrl_tag(%s): failed to lookup"
2283                         " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
2284                         input, sock_fd, res, current->pid, current->tgid,
2285                         current_fsuid());
2286                 goto err;
2287         }
2288         CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->f_count=%ld ->sk=%p\n",
2289                  input, atomic_long_read(&el_socket->file->f_count),
2290                  el_socket->sk);
2291         if (argc < 3) {
2292                 acct_tag = make_atag_from_value(0);
2293         } else if (!valid_atag(acct_tag)) {
2294                 pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input);
2295                 res = -EINVAL;
2296                 goto err_put;
2297         }
2298         CT_DEBUG("qtaguid: ctrl_tag(%s): "
2299                  "pid=%u tgid=%u uid=%u euid=%u fsuid=%u "
2300                  "in_group=%d in_egroup=%d\n",
2301                  input, current->pid, current->tgid, current_uid(),
2302                  current_euid(), current_fsuid(),
2303                  in_group_p(proc_ctrl_write_gid),
2304                  in_egroup_p(proc_ctrl_write_gid));
2305         if (argc < 4) {
2306                 uid = current_fsuid();
2307         } else if (!can_impersonate_uid(uid)) {
2308                 pr_info("qtaguid: ctrl_tag(%s): "
2309                         "insufficient priv from pid=%u tgid=%u uid=%u\n",
2310                         input, current->pid, current->tgid, current_fsuid());
2311                 res = -EPERM;
2312                 goto err_put;
2313         }
2314         full_tag = combine_atag_with_uid(acct_tag, uid);
2315
2316         spin_lock_bh(&sock_tag_list_lock);
2317         sock_tag_entry = get_sock_stat_nl(el_socket->sk);
2318         tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry);
2319         if (IS_ERR(tag_ref_entry)) {
2320                 res = PTR_ERR(tag_ref_entry);
2321                 spin_unlock_bh(&sock_tag_list_lock);
2322                 goto err_put;
2323         }
2324         tag_ref_entry->num_sock_tags++;
2325         if (sock_tag_entry) {
2326                 struct tag_ref *prev_tag_ref_entry;
2327
2328                 CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p "
2329                          "st@%p ...->f_count=%ld\n",
2330                          input, el_socket->sk, sock_tag_entry,
2331                          atomic_long_read(&el_socket->file->f_count));
2332                 /*
2333                  * This is a re-tagging, so release the sock_fd that was
2334                  * locked at the time of the 1st tagging.
2335                  * There is still the ref from this call's sockfd_lookup() so
2336                  * it can be done within the spinlock.
2337                  */
2338                 sockfd_put(sock_tag_entry->socket);
2339                 prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag,
2340                                                     &uid_tag_data_entry);
2341                 BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry));
2342                 BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0);
2343                 prev_tag_ref_entry->num_sock_tags--;
2344                 sock_tag_entry->tag = full_tag;
2345         } else {
2346                 CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n",
2347                          input, el_socket->sk);
2348                 sock_tag_entry = kzalloc(sizeof(*sock_tag_entry),
2349                                          GFP_ATOMIC);
2350                 if (!sock_tag_entry) {
2351                         pr_err("qtaguid: ctrl_tag(%s): "
2352                                "socket tag alloc failed\n",
2353                                input);
2354                         spin_unlock_bh(&sock_tag_list_lock);
2355                         res = -ENOMEM;
2356                         goto err_tag_unref_put;
2357                 }
2358                 sock_tag_entry->sk = el_socket->sk;
2359                 sock_tag_entry->socket = el_socket;
2360                 sock_tag_entry->pid = current->tgid;
2361                 sock_tag_entry->tag = combine_atag_with_uid(acct_tag,
2362                                                             uid);
2363                 spin_lock_bh(&uid_tag_data_tree_lock);
2364                 pqd_entry = proc_qtu_data_tree_search(
2365                         &proc_qtu_data_tree, current->tgid);
2366                 /*
2367                  * TODO: remove if, and start failing.
2368                  * At first, we want to catch user-space code that is not
2369                  * opening the /dev/xt_qtaguid.
2370                  */
2371                 if (IS_ERR_OR_NULL(pqd_entry))
2372                         pr_warn_once(
2373                                 "qtaguid: %s(): "
2374                                 "User space forgot to open /dev/xt_qtaguid? "
2375                                 "pid=%u tgid=%u uid=%u\n", __func__,
2376                                 current->pid, current->tgid,
2377                                 current_fsuid());
2378                 else
2379                         list_add(&sock_tag_entry->list,
2380                                  &pqd_entry->sock_tag_list);
2381                 spin_unlock_bh(&uid_tag_data_tree_lock);
2382
2383                 sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree);
2384                 atomic64_inc(&qtu_events.sockets_tagged);
2385         }
2386         spin_unlock_bh(&sock_tag_list_lock);
2387         /* We keep the ref to the socket (file) until it is untagged */
2388         CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->f_count=%ld\n",
2389                  input, sock_tag_entry,
2390                  atomic_long_read(&el_socket->file->f_count));
2391         return 0;
2392
2393 err_tag_unref_put:
2394         BUG_ON(tag_ref_entry->num_sock_tags <= 0);
2395         tag_ref_entry->num_sock_tags--;
2396         free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry);
2397 err_put:
2398         CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->f_count=%ld\n",
2399                  input, atomic_long_read(&el_socket->file->f_count) - 1);
2400         /* Release the sock_fd that was grabbed by sockfd_lookup(). */
2401         sockfd_put(el_socket);
2402         return res;
2403
2404 err:
2405         CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input);
2406         return res;
2407 }
2408
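/*
 * Ctrl command format: "u <sock_fd>", e.g. "u 12" (fd illustrative).
 * Removes the tag from the socket behind fd 12 and drops the file ref that
 * was taken when it was tagged.
 */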
2409 static int ctrl_cmd_untag(const char *input)
2410 {
2411         char cmd;
2412         int sock_fd = 0;
2413         struct socket *el_socket;
2414         int res, argc;
2415         struct sock_tag *sock_tag_entry;
2416         struct tag_ref *tag_ref_entry;
2417         struct uid_tag_data *utd_entry;
2418         struct proc_qtu_data *pqd_entry;
2419
2420         argc = sscanf(input, "%c %d", &cmd, &sock_fd);
2421         CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n",
2422                  input, argc, cmd, sock_fd);
2423         if (argc < 2) {
2424                 res = -EINVAL;
2425                 goto err;
2426         }
2427         el_socket = sockfd_lookup(sock_fd, &res);  /* This locks the file */
2428         if (!el_socket) {
2429                 pr_info("qtaguid: ctrl_untag(%s): failed to lookup"
2430                         " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
2431                         input, sock_fd, res, current->pid, current->tgid,
2432                         current_fsuid());
2433                 goto err;
2434         }
2435         CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n",
2436                  input, atomic_long_read(&el_socket->file->f_count),
2437                  el_socket->sk);
2438         spin_lock_bh(&sock_tag_list_lock);
2439         sock_tag_entry = get_sock_stat_nl(el_socket->sk);
2440         if (!sock_tag_entry) {
2441                 spin_unlock_bh(&sock_tag_list_lock);
2442                 res = -EINVAL;
2443                 goto err_put;
2444         }
2445         /*
2446          * The socket already belongs to the current process
2447          * so it can do whatever it wants to it.
2448          */
2449         rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree);
2450
2451         tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry);
2452         BUG_ON(!tag_ref_entry);
2453         BUG_ON(tag_ref_entry->num_sock_tags <= 0);
2454         spin_lock_bh(&uid_tag_data_tree_lock);
2455         pqd_entry = proc_qtu_data_tree_search(
2456                 &proc_qtu_data_tree, current->tgid);
2457         /*
2458          * TODO: remove if, and start failing.
2459          * At first, we want to catch user-space code that is not
2460          * opening the /dev/xt_qtaguid.
2461          */
2462         if (IS_ERR_OR_NULL(pqd_entry))
2463                 pr_warn_once("qtaguid: %s(): "
2464                              "User space forgot to open /dev/xt_qtaguid? "
2465                              "pid=%u tgid=%u uid=%u\n", __func__,
2466                              current->pid, current->tgid, current_fsuid());
2467         else
2468                 list_del(&sock_tag_entry->list);
2469         spin_unlock_bh(&uid_tag_data_tree_lock);
2470         /*
2471          * We don't free tag_ref from the utd_entry here,
2472          * only during a cmd_delete().
2473          */
2474         tag_ref_entry->num_sock_tags--;
2475         spin_unlock_bh(&sock_tag_list_lock);
2476         /*
2477          * Release the sock_fd that was grabbed at tag time,
2478          * and once more for the sockfd_lookup() here.
2479          */
2480         sockfd_put(sock_tag_entry->socket);
2481         CT_DEBUG("qtaguid: ctrl_untag(%s): done. st@%p ...->f_count=%ld\n",
2482                  input, sock_tag_entry,
2483                  atomic_long_read(&el_socket->file->f_count) - 1);
2484         sockfd_put(el_socket);
2485
2486         kfree(sock_tag_entry);
2487         atomic64_inc(&qtu_events.sockets_untagged);
2488
2489         return 0;
2490
2491 err_put:
2492         CT_DEBUG("qtaguid: ctrl_untag(%s): done. socket->...->f_count=%ld\n",
2493                  input, atomic_long_read(&el_socket->file->f_count) - 1);
2494         /* Release the sock_fd that was grabbed by sockfd_lookup(). */
2495         sockfd_put(el_socket);
2496         return res;
2497
2498 err:
2499         CT_DEBUG("qtaguid: ctrl_untag(%s): done.\n", input);
2500         return res;
2501 }
2502
2503 static int qtaguid_ctrl_parse(const char *input, int count)
2504 {
2505         char cmd;
2506         int res;
2507
2508         CT_DEBUG("qtaguid: ctrl(%s): pid=%u tgid=%u uid=%u\n",
2509                  input, current->pid, current->tgid, current_fsuid());
2510
2511         cmd = input[0];
2512         /* Collect params for commands */
2513         switch (cmd) {
2514         case 'd':
2515                 res = ctrl_cmd_delete(input);
2516                 break;
2517
2518         case 's':
2519                 res = ctrl_cmd_counter_set(input);
2520                 break;
2521
2522         case 't':
2523                 res = ctrl_cmd_tag(input);
2524                 break;
2525
2526         case 'u':
2527                 res = ctrl_cmd_untag(input);
2528                 break;
2529
2530         default:
2531                 res = -EINVAL;
2532                 goto err;
2533         }
2534         if (!res)
2535                 res = count;
2536 err:
2537         CT_DEBUG("qtaguid: ctrl(%s): res=%d\n", input, res);
2538         return res;
2539 }
2540
2541 #define MAX_QTAGUID_CTRL_INPUT_LEN 255
2542 static int qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer,
2543                         unsigned long count, void *data)
2544 {
2545         char input_buf[MAX_QTAGUID_CTRL_INPUT_LEN];
2546
2547         if (unlikely(module_passive))
2548                 return count;
2549
2550         if (count >= MAX_QTAGUID_CTRL_INPUT_LEN)
2551                 return -EINVAL;
2552
2553         if (copy_from_user(input_buf, buffer, count))
2554                 return -EFAULT;
2555
2556         input_buf[count] = '\0';
2557         return qtaguid_ctrl_parse(input_buf, count);
2558 }
2559
2560 struct proc_print_info {
2561         char *outp;
2562         char **num_items_returned;
2563         struct iface_stat *iface_entry;
2564         struct tag_stat *ts_entry;
2565         int item_index;
2566         int items_to_skip;
2567         int char_count;
2568 };
2569
2570 static int pp_stats_line(struct proc_print_info *ppi, int cnt_set)
2571 {
2572         int len;
2573         struct data_counters *cnts;
2574
2575         if (!ppi->item_index) {
2576                 if (ppi->item_index++ < ppi->items_to_skip)
2577                         return 0;
2578                 len = snprintf(ppi->outp, ppi->char_count,
2579                                "idx iface acct_tag_hex uid_tag_int cnt_set "
2580                                "rx_bytes rx_packets "
2581                                "tx_bytes tx_packets "
2582                                "rx_tcp_bytes rx_tcp_packets "
2583                                "rx_udp_bytes rx_udp_packets "
2584                                "rx_other_bytes rx_other_packets "
2585                                "tx_tcp_bytes tx_tcp_packets "
2586                                "tx_udp_bytes tx_udp_packets "
2587                                "tx_other_bytes tx_other_packets\n");
2588         } else {
2589                 tag_t tag = ppi->ts_entry->tn.tag;
2590                 uid_t stat_uid = get_uid_from_tag(tag);
2591                 /* Detailed tags are not available to everybody */
2592                 if (get_atag_from_tag(tag)
2593                     && !can_read_other_uid_stats(stat_uid)) {
2594                         CT_DEBUG("qtaguid: stats line: "
2595                                  "%s 0x%llx %u: insufficient priv "
2596                                  "from pid=%u tgid=%u uid=%u\n",
2597                                  ppi->iface_entry->ifname,
2598                                  get_atag_from_tag(tag), stat_uid,
2599                                  current->pid, current->tgid, current_fsuid());
2600                         return 0;
2601                 }
2602                 if (ppi->item_index++ < ppi->items_to_skip)
2603                         return 0;
2604                 cnts = &ppi->ts_entry->counters;
2605                 len = snprintf(
2606                         ppi->outp, ppi->char_count,
2607                         "%d %s 0x%llx %u %u "
2608                         "%llu %llu "
2609                         "%llu %llu "
2610                         "%llu %llu "
2611                         "%llu %llu "
2612                         "%llu %llu "
2613                         "%llu %llu "
2614                         "%llu %llu "
2615                         "%llu %llu\n",
2616                         ppi->item_index,
2617                         ppi->iface_entry->ifname,
2618                         get_atag_from_tag(tag),
2619                         stat_uid,
2620                         cnt_set,
2621                         dc_sum_bytes(cnts, cnt_set, IFS_RX),
2622                         dc_sum_packets(cnts, cnt_set, IFS_RX),
2623                         dc_sum_bytes(cnts, cnt_set, IFS_TX),
2624                         dc_sum_packets(cnts, cnt_set, IFS_TX),
2625                         cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
2626                         cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
2627                         cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
2628                         cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
2629                         cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
2630                         cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
2631                         cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
2632                         cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
2633                         cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
2634                         cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
2635                         cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
2636                         cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
2637         }
2638         return len;
2639 }
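/*
 * Example of the header plus one data line as emitted above (counter values
 * illustrative):
 *   idx iface acct_tag_hex uid_tag_int cnt_set rx_bytes rx_packets ...
 *   2 wlan0 0x200000000 10005 0 10240 17 2048 21 10240 17 0 0 0 0 2048 21 0 0 0 0
 * The acct_tag is printed in its shifted (upper 32 bit) form, hence
 * 0x200000000 for acct_tag value 2.
 */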
2640
2641 static bool pp_sets(struct proc_print_info *ppi)
2642 {
2643         int len;
2644         int counter_set;
2645         for (counter_set = 0; counter_set < IFS_MAX_COUNTER_SETS;
2646              counter_set++) {
2647                 len = pp_stats_line(ppi, counter_set);
2648                 if (len >= ppi->char_count) {
2649                         *ppi->outp = '\0';
2650                         return false;
2651                 }
2652                 if (len) {
2653                         ppi->outp += len;
2654                         ppi->char_count -= len;
2655                         (*ppi->num_items_returned)++;
2656                 }
2657         }
2658         return true;
2659 }
2660
2661 /*
2662  * Procfs reader to get all tag stats using style "1)" as described in
2663  * fs/proc/generic.c
2664  * Groups all protocols tx/rx bytes.
2665  */
2666 static int qtaguid_stats_proc_read(char *page, char **num_items_returned,
2667                                 off_t items_to_skip, int char_count, int *eof,
2668                                 void *data)
2669 {
2670         struct proc_print_info ppi;
2671         int len;
2672
2673         ppi.outp = page;
2674         ppi.item_index = 0;
2675         ppi.char_count = char_count;
2676         ppi.num_items_returned = num_items_returned;
2677         ppi.items_to_skip = items_to_skip;
2678
2679         if (unlikely(module_passive)) {
2680                 len = pp_stats_line(&ppi, 0);
2681                 /* The header should always be shorter than the buffer. */
2682                 BUG_ON(len >= ppi.char_count);
2683                 (*num_items_returned)++;
2684                 *eof = 1;
2685                 return len;
2686         }
2687
2688         CT_DEBUG("qtaguid:proc stats pid=%u tgid=%u uid=%u "
2689                  "page=%p *num_items_returned=%p off=%ld "
2690                  "char_count=%d *eof=%d\n",
2691                  current->pid, current->tgid, current_fsuid(),
2692                  page, *num_items_returned,
2693                  items_to_skip, char_count, *eof);
2694
2695         if (*eof)
2696                 return 0;
2697
2698         /* The idx is there to help debug when things go belly up. */
2699         len = pp_stats_line(&ppi, 0);
2700         /* Don't advance the outp unless the whole line was printed */
2701         if (len >= ppi.char_count) {
2702                 *ppi.outp = '\0';
2703                 return ppi.outp - page;
2704         }
2705         if (len) {
2706                 ppi.outp += len;
2707                 ppi.char_count -= len;
2708                 (*num_items_returned)++;
2709         }
2710
2711         spin_lock_bh(&iface_stat_list_lock);
2712         list_for_each_entry(ppi.iface_entry, &iface_stat_list, list) {
2713                 struct rb_node *node;
2714                 spin_lock_bh(&ppi.iface_entry->tag_stat_list_lock);
2715                 for (node = rb_first(&ppi.iface_entry->tag_stat_tree);
2716                      node;
2717                      node = rb_next(node)) {
2718                         ppi.ts_entry = rb_entry(node, struct tag_stat, tn.node);
2719                         if (!pp_sets(&ppi)) {
2720                                 spin_unlock_bh(
2721                                         &ppi.iface_entry->tag_stat_list_lock);
2722                                 spin_unlock_bh(&iface_stat_list_lock);
2723                                 return ppi.outp - page;
2724                         }
2725                 }
2726                 spin_unlock_bh(&ppi.iface_entry->tag_stat_list_lock);
2727         }
2728         spin_unlock_bh(&iface_stat_list_lock);
2729
2730         *eof = 1;
2731         return ppi.outp - page;
2732 }
2733
2734 /*------------------------------------------*/
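     /*
      * qtudev_open(): runs when a process opens the QTU misc device.  Looks
      * up (or allocates) the uid_tag_data for the calling fsuid and creates
      * a proc_qtu_data entry keyed by the caller's tgid so that
      * qtudev_release() can later clean up any socket tags the process
      * leaves behind.  A second open by the same tgid fails with -EBUSY.
      */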
2735 static int qtudev_open(struct inode *inode, struct file *file)
2736 {
2737         struct uid_tag_data *utd_entry;
2738         struct proc_qtu_data  *pqd_entry;
2739         struct proc_qtu_data  *new_pqd_entry;
2740         int res;
2741         bool utd_entry_found;
2742
2743         if (unlikely(qtu_proc_handling_passive))
2744                 return 0;
2745
2746         DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n",
2747                  current->pid, current->tgid, current_fsuid());
2748
2749         spin_lock_bh(&uid_tag_data_tree_lock);
2750
2751         /* Look for existing uid data, or alloc one. */
2752         utd_entry = get_uid_data(current_fsuid(), &utd_entry_found);
2753         if (IS_ERR_OR_NULL(utd_entry)) {
2754                 res = PTR_ERR(utd_entry);
2755                 goto err;
2756         }
2757
2758         /* Look for existing PID based proc_data */
2759         pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree,
2760                                               current->tgid);
2761         if (pqd_entry) {
2762                 pr_err("qtaguid: qtudev_open(): %u/%u %u "
2763                        "%s already opened\n",
2764                        current->pid, current->tgid, current_fsuid(),
2765                        QTU_DEV_NAME);
2766                 res = -EBUSY;
2767                 goto err_unlock_free_utd;
2768         }
2769
2770         new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC);
2771         if (!new_pqd_entry) {
2772                 pr_err("qtaguid: qtudev_open(): %u/%u %u: "
2773                        "proc data alloc failed\n",
2774                        current->pid, current->tgid, current_fsuid());
2775                 res = -ENOMEM;
2776                 goto err_unlock_free_utd;
2777         }
2778         new_pqd_entry->pid = current->tgid;
2779         INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list);
2780         new_pqd_entry->parent_tag_data = utd_entry;
2781         utd_entry->num_pqd++;
2782
2783         proc_qtu_data_tree_insert(new_pqd_entry,
2784                                   &proc_qtu_data_tree);
2785
2786         spin_unlock_bh(&uid_tag_data_tree_lock);
2787         DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n",
2788                  current_fsuid(), new_pqd_entry);
2789         file->private_data = new_pqd_entry;
2790         return 0;
2791
2792 err_unlock_free_utd:
2793         if (!utd_entry_found) {
2794                 rb_erase(&utd_entry->node, &uid_tag_data_tree);
2795                 kfree(utd_entry);
2796         }
2797         spin_unlock_bh(&uid_tag_data_tree_lock);
2798 err:
2799         return res;
2800 }
2801
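     /*
      * qtudev_release(): undoes qtudev_open() for the owning process.  Each
      * sock_tag still on the process' sock_tag_list is unlinked from the
      * global trees and its tag_ref count dropped; the entries are parked in
      * a local tree and only freed via sock_tag_tree_erase() after the
      * spinlocks are released.  Finally the proc_qtu_data is removed and the
      * parent uid_tag_data released with put_utd_entry().
      */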
2802 static int qtudev_release(struct inode *inode, struct file *file)
2803 {
2804         struct proc_qtu_data  *pqd_entry = file->private_data;
2805         struct uid_tag_data  *utd_entry = pqd_entry->parent_tag_data;
2806         struct sock_tag *st_entry;
2807         struct rb_root st_to_free_tree = RB_ROOT;
2808         struct list_head *entry, *next;
2809         struct tag_ref *tr;
2810
2811         if (unlikely(qtu_proc_handling_passive))
2812                 return 0;
2813
2814         /*
2815          * Do not trust current->pid; it might just be a kworker cleaning
2816          * up after a dead proc.
2817          */
2818         DR_DEBUG("qtaguid: qtudev_release(): "
2819                  "pid=%u tgid=%u uid=%u "
2820                  "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n",
2821                  current->pid, current->tgid, pqd_entry->parent_tag_data->uid,
2822                  pqd_entry, pqd_entry->pid, utd_entry,
2823                  utd_entry->num_active_tags);
2824
2825         spin_lock_bh(&sock_tag_list_lock);
2826         spin_lock_bh(&uid_tag_data_tree_lock);
2827
2828         list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) {
2829                 st_entry = list_entry(entry, struct sock_tag, list);
2830                 DR_DEBUG("qtaguid: %s(): "
2831                          "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n",
2832                          __func__,
2833                          st_entry, st_entry->sk,
2834                          current->pid, current->tgid,
2835                          pqd_entry->parent_tag_data->uid);
2836
2837                 utd_entry = uid_tag_data_tree_search(
2838                         &uid_tag_data_tree,
2839                         get_uid_from_tag(st_entry->tag));
2840                 BUG_ON(IS_ERR_OR_NULL(utd_entry));
2841                 DR_DEBUG("qtaguid: %s(): "
2842                          "looking for tag=0x%llx in utd_entry=%p\n", __func__,
2843                          st_entry->tag, utd_entry);
2844                 tr = tag_ref_tree_search(&utd_entry->tag_ref_tree,
2845                                          st_entry->tag);
2846                 BUG_ON(!tr);
2847                 BUG_ON(tr->num_sock_tags <= 0);
2848                 tr->num_sock_tags--;
2849                 free_tag_ref_from_utd_entry(tr, utd_entry);
2850
2851                 rb_erase(&st_entry->sock_node, &sock_tag_tree);
2852                 list_del(&st_entry->list);
2853                 /* Can't sockfd_put() within spinlock, do it later. */
2854                 sock_tag_tree_insert(st_entry, &st_to_free_tree);
2855
2856                 /*
2857                  * Try to free the utd_entry if no other proc_qtu_data is
2858                  * using it (num_pqd is 0) and it doesn't have active tags
2859                  * (num_active_tags is 0).
2860                  */
2861                 put_utd_entry(utd_entry);
2862         }
2863
2864         rb_erase(&pqd_entry->node, &proc_qtu_data_tree);
2865         BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1);
2866         pqd_entry->parent_tag_data->num_pqd--;
2867         put_utd_entry(pqd_entry->parent_tag_data);
2868         kfree(pqd_entry);
2869         file->private_data = NULL;
2870
2871         spin_unlock_bh(&uid_tag_data_tree_lock);
2872         spin_unlock_bh(&sock_tag_list_lock);
2873
2874
2875         sock_tag_tree_erase(&st_to_free_tree);
2876
2877         prdebug_full_state(0, "%s(): pid=%u tgid=%u", __func__,
2878                            current->pid, current->tgid);
2879         return 0;
2880 }
2881
2882 /*------------------------------------------*/
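     /*
      * Char device interface: a process opens the misc device (name taken
      * from QTU_DEV_NAME) to let the module track it and garbage-collect its
      * socket tags when the file is released.
      */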
2883 static const struct file_operations qtudev_fops = {
2884         .owner = THIS_MODULE,
2885         .open = qtudev_open,
2886         .release = qtudev_release,
2887 };
2888
2889 static struct miscdevice qtu_device = {
2890         .minor = MISC_DYNAMIC_MINOR,
2891         .name = QTU_DEV_NAME,
2892         .fops = &qtudev_fops,
2893         /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */
2894 };
2895
2896 /*------------------------------------------*/
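     /*
      * qtaguid_proc_register(): creates the module's directory under
      * /proc/net plus the "ctrl" and "stats" entries, and wires up their
      * read/write handlers.  On failure, everything created so far is
      * removed again.
      */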
2897 static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir)
2898 {
2899         int ret;
2900         *res_procdir = proc_mkdir(module_procdirname, init_net.proc_net);
2901         if (!*res_procdir) {
2902                 pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n");
2903                 ret = -ENOMEM;
2904                 goto no_dir;
2905         }
2906
2907         xt_qtaguid_ctrl_file = create_proc_entry("ctrl", proc_ctrl_perms,
2908                                                 *res_procdir);
2909         if (!xt_qtaguid_ctrl_file) {
2910                 pr_err("qtaguid: failed to create xt_qtaguid/ctrl "
2911                         "file\n");
2912                 ret = -ENOMEM;
2913                 goto no_ctrl_entry;
2914         }
2915         xt_qtaguid_ctrl_file->read_proc = qtaguid_ctrl_proc_read;
2916         xt_qtaguid_ctrl_file->write_proc = qtaguid_ctrl_proc_write;
2917
2918         xt_qtaguid_stats_file = create_proc_entry("stats", proc_stats_perms,
2919                                                 *res_procdir);
2920         if (!xt_qtaguid_stats_file) {
2921                 pr_err("qtaguid: failed to create xt_qtaguid/stats "
2922                         "file\n");
2923                 ret = -ENOMEM;
2924                 goto no_stats_entry;
2925         }
2926         xt_qtaguid_stats_file->read_proc = qtaguid_stats_proc_read;
2927         /*
2928          * TODO: add support for counter hacking
2929          * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write;
2930          */
2931         return 0;
2932
2933 no_stats_entry:
2934         remove_proc_entry("ctrl", *res_procdir);
2935 no_ctrl_entry:
2936         remove_proc_entry(module_procdirname, init_net.proc_net);
2937 no_dir:
2938         return ret;
2939 }
2940
2941 static struct xt_match qtaguid_mt_reg __read_mostly = {
2942         /*
2943          * This module masquerades as the "owner" module so that iptables
2944          * tools can deal with it.
2945          */
2946         .name       = "owner",
2947         .revision   = 1,
2948         .family     = NFPROTO_UNSPEC,
2949         .match      = qtaguid_mt,
2950         .matchsize  = sizeof(struct xt_qtaguid_match_info),
2951         .me         = THIS_MODULE,
2952 };
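     /*
      * Illustrative rule (not from this file): because the match registers
      * under the name "owner" (revision 1), a standard owner-style rule such
      * as
      *
      *   iptables -A OUTPUT -m owner --uid-owner 10001 -j RETURN
      *
      * can be serviced by this module once it is loaded.
      */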
2953
2954 static int __init qtaguid_mt_init(void)
2955 {
2956         if (qtaguid_proc_register(&xt_qtaguid_procdir)
2957             || iface_stat_init(xt_qtaguid_procdir)
2958             || xt_register_match(&qtaguid_mt_reg)
2959             || misc_register(&qtu_device))
2960                 return -1;
2961         return 0;
2962 }
2963
2964 /*
2965  * TODO: allow unloading of the module.
2966  * For now stats are permanent.
2967  * Kconfig forces 'y' or 'n' and never an 'm'.
2968  */
2969
2970 module_init(qtaguid_mt_init);
2971 MODULE_AUTHOR("jpa <jpa@google.com>");
2972 MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats");
2973 MODULE_LICENSE("GPL");
2974 MODULE_ALIAS("ipt_owner");
2975 MODULE_ALIAS("ip6t_owner");
2976 MODULE_ALIAS("ipt_qtaguid");
2977 MODULE_ALIAS("ip6t_qtaguid");