Merge tag 'linux-can-next-for-3.15-20140317' of git://gitorious.org/linux-can/linux...
authorDavid S. Miller <davem@davemloft.net>
Tue, 18 Mar 2014 19:20:00 +0000 (15:20 -0400)
committerDavid S. Miller <davem@davemloft.net>
Tue, 18 Mar 2014 19:20:00 +0000 (15:20 -0400)
linux-can-next-for-3.15-20140317

Marc Kleine-Budde says:

====================
this is a pull request of three patches for net-next/master.

It consists of a patch by Oliver Hartkopp, which unifies the MTU
settings for CAN interfaces. A patch by Christopher R. Baker populates
netdev::dev_id for udev discrimination for multi interface CAN devices.
Alexander Shiyan contributes a patch for the mcp251x driver which fixes
the regulators operation if CONFIG_REGULATOR is not enabled.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
97 files changed:
Documentation/devicetree/bindings/net/altera_tse.txt [new file with mode: 0644]
Documentation/networking/altera_tse.txt [new file with mode: 0644]
Documentation/networking/igb.txt
Documentation/networking/timestamping.txt
MAINTAINERS
drivers/net/Kconfig
drivers/net/bonding/bond_3ad.c
drivers/net/ethernet/Kconfig
drivers/net/ethernet/Makefile
drivers/net/ethernet/altera/Kconfig [new file with mode: 0644]
drivers/net/ethernet/altera/Makefile [new file with mode: 0644]
drivers/net/ethernet/altera/altera_msgdma.c [new file with mode: 0644]
drivers/net/ethernet/altera/altera_msgdma.h [new file with mode: 0644]
drivers/net/ethernet/altera/altera_msgdmahw.h [new file with mode: 0644]
drivers/net/ethernet/altera/altera_sgdma.c [new file with mode: 0644]
drivers/net/ethernet/altera/altera_sgdma.h [new file with mode: 0644]
drivers/net/ethernet/altera/altera_sgdmahw.h [new file with mode: 0644]
drivers/net/ethernet/altera/altera_tse.h [new file with mode: 0644]
drivers/net/ethernet/altera/altera_tse_ethtool.c [new file with mode: 0644]
drivers/net/ethernet/altera/altera_tse_main.c [new file with mode: 0644]
drivers/net/ethernet/altera/altera_utils.c [new file with mode: 0644]
drivers/net/ethernet/altera/altera_utils.h [new file with mode: 0644]
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40evf/i40e_common.c
drivers/net/ethernet/intel/i40evf/i40e_prototype.h
drivers/net/ethernet/intel/igb/igb_main.c
drivers/net/ethernet/renesas/sh_eth.c
drivers/net/ethernet/via/via-rhine.c
drivers/net/ieee802154/at86rf230.c
include/linux/netdevice.h
include/linux/netfilter/ipset/ip_set.h
include/linux/netfilter/nfnetlink.h
include/linux/netpoll.h
include/net/netfilter/nf_conntrack.h
include/net/netfilter/nf_conntrack_core.h
include/net/netfilter/nf_conntrack_labels.h
include/net/netfilter/nf_tables.h
include/net/netns/conntrack.h
include/net/xfrm.h
include/uapi/linux/netfilter/ipset/ip_set.h
include/uapi/linux/netfilter/nf_tables.h
include/uapi/linux/xfrm.h
net/core/dev.c
net/core/netpoll.c
net/ipv4/netfilter.c
net/ipv4/xfrm4_policy.c
net/ipv4/xfrm4_protocol.c
net/ipv6/Makefile
net/ipv6/ah6.c
net/ipv6/esp6.c
net/ipv6/ip6_vti.c
net/ipv6/ipcomp6.c
net/ipv6/xfrm6_mode_tunnel.c
net/ipv6/xfrm6_policy.c
net/ipv6/xfrm6_protocol.c [new file with mode: 0644]
net/key/af_key.c
net/netfilter/ipset/Kconfig
net/netfilter/ipset/Makefile
net/netfilter/ipset/ip_set_core.c
net/netfilter/ipset/ip_set_hash_gen.h
net/netfilter/ipset/ip_set_hash_ip.c
net/netfilter/ipset/ip_set_hash_ipmark.c [new file with mode: 0644]
net/netfilter/ipset/ip_set_hash_ipport.c
net/netfilter/ipset/ip_set_hash_ipportip.c
net/netfilter/ipset/ip_set_hash_ipportnet.c
net/netfilter/ipset/ip_set_hash_net.c
net/netfilter/ipset/ip_set_hash_netiface.c
net/netfilter/ipset/ip_set_hash_netnet.c
net/netfilter/ipset/ip_set_hash_netport.c
net/netfilter/ipset/ip_set_hash_netportnet.c
net/netfilter/ipset/pfxlen.c
net/netfilter/ipvs/ip_vs_ctl.c
net/netfilter/ipvs/ip_vs_lblc.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_expect.c
net/netfilter/nf_conntrack_h323_main.c
net/netfilter/nf_conntrack_helper.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_sip.c
net/netfilter/nf_tables_api.c
net/netfilter/nfnetlink.c
net/netfilter/nfnetlink_log.c
net/netfilter/nft_compat.c
net/netfilter/nft_ct.c
net/netfilter/nft_hash.c
net/netfilter/nft_immediate.c
net/netfilter/nft_log.c
net/netfilter/nft_lookup.c
net/netfilter/nft_nat.c
net/netfilter/xt_AUDIT.c
net/netfilter/xt_connlimit.c
net/netfilter/xt_ipcomp.c
net/sched/cls_fw.c
net/xfrm/xfrm_input.c
net/xfrm/xfrm_state.c
net/xfrm/xfrm_user.c

diff --git a/Documentation/devicetree/bindings/net/altera_tse.txt b/Documentation/devicetree/bindings/net/altera_tse.txt
new file mode 100644 (file)
index 0000000..a706297
--- /dev/null
@@ -0,0 +1,114 @@
+* Altera Triple-Speed Ethernet MAC driver (TSE)
+
+Required properties:
+- compatible: Should be "altr,tse-1.0" for legacy SGDMA based TSE, and should
+               be "altr,tse-msgdma-1.0" for the preferred MSGDMA based TSE.
+               ALTR is supported for legacy device trees, but is deprecated.
+               altr should be used for all new designs.
+- reg: Address and length of the register set for the device. It contains
+  the information of registers in the same order as described by reg-names
+- reg-names: Should contain the reg names
+  "control_port": MAC configuration space region
+  "tx_csr":       xDMA Tx dispatcher control and status space region
+  "tx_desc":      MSGDMA Tx dispatcher descriptor space region
+  "rx_csr" :      xDMA Rx dispatcher control and status space region
+  "rx_desc":      MSGDMA Rx dispatcher descriptor space region
+  "rx_resp":      MSGDMA Rx dispatcher response space region
+  "s1":                  SGDMA descriptor memory
+- interrupts: Should contain the TSE interrupts and its mode.
+- interrupt-names: Should contain the interrupt names
+  "rx_irq":       xDMA Rx dispatcher interrupt
+  "tx_irq":       xDMA Tx dispatcher interrupt
+- rx-fifo-depth: MAC receive FIFO buffer depth in bytes
+- tx-fifo-depth: MAC transmit FIFO buffer depth in bytes
+- phy-mode: See ethernet.txt in the same directory.
+- phy-handle: See ethernet.txt in the same directory.
+- phy-addr: See ethernet.txt in the same directory. A configuration should
+               include phy-handle or phy-addr.
+- altr,has-supplementary-unicast:
+               If present, TSE supports additional unicast addresses.
+               Otherwise additional unicast addresses are not supported.
+- altr,has-hash-multicast-filter:
+               If present, TSE supports a hash based multicast filter.
+               Otherwise, hash-based multicast filtering is not supported.
+
+- mdio device tree subnode: When the TSE has a phy connected to its local
+               mdio, there must be device tree subnode with the following
+               required properties:
+
+       - compatible: Must be "altr,tse-mdio".
+       - #address-cells: Must be <1>.
+       - #size-cells: Must be <0>.
+
+       For each phy on the mdio bus, there must be a node with the following
+       fields:
+
+       - reg: phy id used to communicate to phy.
+       - device_type: Must be "ethernet-phy".
+
+Optional properties:
+- local-mac-address: See ethernet.txt in the same directory.
+- max-frame-size: See ethernet.txt in the same directory.
+
+Example:
+
+       tse_sub_0_eth_tse_0: ethernet@0x1,00000000 {
+               compatible = "altr,tse-msgdma-1.0";
+               reg =   <0x00000001 0x00000000 0x00000400>,
+                       <0x00000001 0x00000460 0x00000020>,
+                       <0x00000001 0x00000480 0x00000020>,
+                       <0x00000001 0x000004A0 0x00000008>,
+                       <0x00000001 0x00000400 0x00000020>,
+                       <0x00000001 0x00000420 0x00000020>;
+               reg-names = "control_port", "rx_csr", "rx_desc", "rx_resp", "tx_csr", "tx_desc";
+               interrupt-parent = <&hps_0_arm_gic_0>;
+               interrupts = <0 41 4>, <0 40 4>;
+               interrupt-names = "rx_irq", "tx_irq";
+               rx-fifo-depth = <2048>;
+               tx-fifo-depth = <2048>;
+               address-bits = <48>;
+               max-frame-size = <1500>;
+               local-mac-address = [ 00 00 00 00 00 00 ];
+               phy-mode = "gmii";
+               altr,has-supplementary-unicast;
+               altr,has-hash-multicast-filter;
+               phy-handle = <&phy0>;
+               mdio {
+                       compatible = "altr,tse-mdio";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+                       phy0: ethernet-phy@0 {
+                               reg = <0x0>;
+                               device_type = "ethernet-phy";
+                       };
+
+                       phy1: ethernet-phy@1 {
+                               reg = <0x1>;
+                               device_type = "ethernet-phy";
+                       };
+
+               };
+       };
+
+       tse_sub_1_eth_tse_0: ethernet@0x1,00001000 {
+               compatible = "altr,tse-msgdma-1.0";
+               reg =   <0x00000001 0x00001000 0x00000400>,
+                       <0x00000001 0x00001460 0x00000020>,
+                       <0x00000001 0x00001480 0x00000020>,
+                       <0x00000001 0x000014A0 0x00000008>,
+                       <0x00000001 0x00001400 0x00000020>,
+                       <0x00000001 0x00001420 0x00000020>;
+               reg-names = "control_port", "rx_csr", "rx_desc", "rx_resp", "tx_csr", "tx_desc";
+               interrupt-parent = <&hps_0_arm_gic_0>;
+               interrupts = <0 43 4>, <0 42 4>;
+               interrupt-names = "rx_irq", "tx_irq";
+               rx-fifo-depth = <2048>;
+               tx-fifo-depth = <2048>;
+               address-bits = <48>;
+               max-frame-size = <1500>;
+               local-mac-address = [ 00 00 00 00 00 00 ];
+               phy-mode = "gmii";
+               altr,has-supplementary-unicast;
+               altr,has-hash-multicast-filter;
+               phy-handle = <&phy1>;
+       };
diff --git a/Documentation/networking/altera_tse.txt b/Documentation/networking/altera_tse.txt
new file mode 100644 (file)
index 0000000..3f24df8
--- /dev/null
@@ -0,0 +1,263 @@
+       Altera Triple-Speed Ethernet MAC driver
+
+Copyright (C) 2008-2014 Altera Corporation
+
+This is the driver for the Altera Triple-Speed Ethernet (TSE) controllers
+using the SGDMA and MSGDMA soft DMA IP components. The driver uses the
+platform bus to obtain component resources. The designs used to test this
+driver were built for a Cyclone(R) V SOC FPGA board, a Cyclone(R) V FPGA board,
+and tested with ARM and NIOS processor hosts separately. The anticipated use
+cases are simple communications between an embedded system and an external peer
+for status and simple configuration of the embedded system.
+
+For more information visit www.altera.com and www.rocketboards.org. Support
+forums for the driver may be found on www.rocketboards.org, and a design used
+to test this driver may be found there as well. Support is also available from
+the maintainer of this driver, found in MAINTAINERS.
+
+The Triple-Speed Ethernet, SGDMA, and MSGDMA components are all soft IP
+components that can be assembled and built into an FPGA using the Altera
+Quartus toolchain. Quartus 13.1 and 14.0 were used to build the design that
+this driver was tested against. The sopc2dts tool is used to create the
+device tree for the driver, and may be found at rocketboards.org.
+
+The driver probe function examines the device tree and determines if the
+Triple-Speed Ethernet instance is using an SGDMA or MSGDMA component. The
+probe function then installs the appropriate set of DMA routines to
+initialize, setup transmits, receives, and interrupt handling primitives for
+the respective configurations.
+
+The SGDMA component is to be deprecated in the near future (over the next 1-2
+years as of this writing in early 2014) in favor of the MSGDMA component.
+SGDMA support is included for existing designs and reference in case a
+developer wishes to support their own soft DMA logic and driver support. Any
+new designs should not use the SGDMA.
+
+The SGDMA supports only a single transmit or receive operation at a time, and
+therefore will not perform as well compared to the MSGDMA soft IP. Please
+visit www.altera.com for known, documented SGDMA errata.
+
+Scatter-gather DMA is not supported by the SGDMA or MSGDMA at this time.
+Scatter-gather DMA will be added to a future maintenance update to this
+driver.
+
+Jumbo frames are not supported at this time.
+
+The driver limits PHY operations to 10/100Mbps, and has not yet been fully
+tested for 1Gbps. This support will be added in a future maintenance update.
+
+1) Kernel Configuration
+The kernel configuration option is ALTERA_TSE:
+ Device Drivers ---> Network device support ---> Ethernet driver support --->
+ Altera Triple-Speed Ethernet MAC support (ALTERA_TSE)
+
+2) Driver parameters list:
+       debug: message level (0: no output, 16: all);
+       dma_rx_num: Number of descriptors in the RX list (default is 64);
+       dma_tx_num: Number of descriptors in the TX list (default is 64).
+
+3) Command line options
+Driver parameters can be also passed in command line by using:
+       altera_tse=dma_rx_num:128,dma_tx_num:512
+
+4) Driver information and notes
+
+4.1) Transmit process
+When the driver's transmit routine is called by the kernel, it sets up a
+transmit descriptor by calling the underlying DMA transmit routine (SGDMA or
+MSGDMA), and initiates a transmit operation. Once the transmit is complete, an
+interrupt is driven by the transmit DMA logic. The driver handles the transmit
+completion in the context of the interrupt handling chain by recycling
+resource required to send and track the requested transmit operation.
+
+4.2) Receive process
+The driver will post receive buffers to the receive DMA logic during driver
+initialization. Receive buffers may or may not be queued depending upon the
+underlying DMA logic (MSGDMA is able to queue receive buffers, SGDMA is not able
+to queue receive buffers to the SGDMA receive logic). When a packet is
+received, the DMA logic generates an interrupt. The driver handles a receive
+interrupt by obtaining the DMA receive logic status, reaping receive
+completions until no more receive completions are available.
+
+4.3) Interrupt Mitigation
+The driver is able to mitigate the number of its DMA interrupts
+using NAPI for receive operations. Interrupt mitigation is not yet supported
+for transmit operations, but will be added in a future maintenance release.
+
+4.4) Ethtool support
+Ethtool is supported. Driver statistics and internal errors can be taken using:
+ethtool -S ethX command. It is possible to dump registers etc.
+
+4.5) PHY Support
+The driver is compatible with PAL to work with PHY and GPHY devices.
+
+4.6) List of source files:
+ o Kconfig
+ o Makefile
+ o altera_tse_main.c: main network device driver
+ o altera_tse_ethtool.c: ethtool support
+ o altera_tse.h: private driver structure and common definitions
+ o altera_msgdma.h: MSGDMA implementation function definitions
+ o altera_sgdma.h: SGDMA implementation function definitions
+ o altera_msgdma.c: MSGDMA implementation
+ o altera_sgdma.c: SGDMA implementation
+ o altera_sgdmahw.h: SGDMA register and descriptor definitions
+ o altera_msgdmahw.h: MSGDMA register and descriptor definitions
+ o altera_utils.c: Driver utility functions
+ o altera_utils.h: Driver utility function definitions
+
+5) Debug Information
+
+The driver exports debug information such as internal statistics,
+debug information, MAC and DMA registers etc.
+
+A user may use the ethtool support to get statistics:
+e.g. using: ethtool -S ethX (that shows the statistics counters)
+or sees the MAC registers: e.g. using: ethtool -d ethX
+
+The developer can also use the "debug" module parameter to get
+further debug information.
+
+6) Statistics Support
+
+The controller and driver support a mix of IEEE standard defined statistics,
+RFC defined statistics, and driver or Altera defined statistics. The four
+specifications containing the standard definitions for these statistics are
+as follows:
+
+ o IEEE 802.3-2012 - IEEE Standard for Ethernet.
+ o RFC 2863 found at http://www.rfc-editor.org/rfc/rfc2863.txt.
+ o RFC 2819 found at http://www.rfc-editor.org/rfc/rfc2819.txt.
+ o Altera Triple Speed Ethernet User Guide, found at http://www.altera.com
+
+The statistics supported by the TSE and the device driver are as follows:
+
+"tx_packets" is equivalent to aFramesTransmittedOK defined in IEEE 802.3-2012,
+Section 5.2.2.1.2. This statistic is the count of frames that are successfully
+transmitted.
+
+"rx_packets" is equivalent to aFramesReceivedOK defined in IEEE 802.3-2012,
+Section 5.2.2.1.5. This statistic is the count of frames that are successfully
+received. This count does not include any error packets such as CRC errors,
+length errors, or alignment errors.
+
+"rx_crc_errors" is equivalent to aFrameCheckSequenceErrors defined in IEEE
+802.3-2012, Section 5.2.2.1.6. This statistic is the count of frames that are
+an integral number of bytes in length and do not pass the CRC test as the frame
+is received.
+
+"rx_align_errors" is equivalent to aAlignmentErrors defined in IEEE 802.3-2012,
+Section 5.2.2.1.7. This statistic is the count of frames that are not an
+integral number of bytes in length and do not pass the CRC test as the frame is
+received.
+
+"tx_bytes" is equivalent to aOctetsTransmittedOK defined in IEEE 802.3-2012,
+Section 5.2.2.1.8. This statistic is the count of data and pad bytes
+successfully transmitted from the interface.
+
+"rx_bytes" is equivalent to aOctetsReceivedOK defined in IEEE 802.3-2012,
+Section 5.2.2.1.14. This statistic is the count of data and pad bytes
+successfully received by the controller.
+
+"tx_pause" is equivalent to aPAUSEMACCtrlFramesTransmitted defined in IEEE
+802.3-2012, Section 30.3.4.2. This statistic is a count of PAUSE frames
+transmitted from the network controller.
+
+"rx_pause" is equivalent to aPAUSEMACCtrlFramesReceived defined in IEEE
+802.3-2012, Section 30.3.4.3. This statistic is a count of PAUSE frames
+received by the network controller.
+
+"rx_errors" is equivalent to ifInErrors defined in RFC 2863. This statistic is
+a count of the number of packets received containing errors that prevented the
+packet from being delivered to a higher level protocol.
+
+"tx_errors" is equivalent to ifOutErrors defined in RFC 2863. This statistic
+is a count of the number of packets that could not be transmitted due to errors.
+
+"rx_unicast" is equivalent to ifInUcastPkts defined in RFC 2863. This
+statistic is a count of the number of packets received that were not addressed
+to the broadcast address or a multicast group.
+
+"rx_multicast" is equivalent to ifInMulticastPkts defined in RFC 2863. This
+statistic is a count of the number of packets received that were addressed to
+a multicast address group.
+
+"rx_broadcast" is equivalent to ifInBroadcastPkts defined in RFC 2863. This
+statistic is a count of the number of packets received that were addressed to
+the broadcast address.
+
+"tx_discards" is equivalent to ifOutDiscards defined in RFC 2863. This
+statistic is the number of outbound packets not transmitted even though an
+error was not detected. An example of a reason this might occur is to free up
+internal buffer space.
+
+"tx_unicast" is equivalent to ifOutUcastPkts defined in RFC 2863. This
+statistic counts the number of packets transmitted that were not addressed to
+a multicast group or broadcast address.
+
+"tx_multicast" is equivalent to ifOutMulticastPkts defined in RFC 2863. This
+statistic counts the number of packets transmitted that were addressed to a
+multicast group.
+
+"tx_broadcast" is equivalent to ifOutBroadcastPkts defined in RFC 2863. This
+statistic counts the number of packets transmitted that were addressed to a
+broadcast address.
+
+"ether_drops" is equivalent to etherStatsDropEvents defined in RFC 2819.
+This statistic counts the number of packets dropped due to lack of internal
+controller resources.
+
+"rx_total_bytes" is equivalent to etherStatsOctets defined in RFC 2819.
+This statistic counts the total number of bytes received by the controller,
+including error and discarded packets.
+
+"rx_total_packets" is equivalent to etherStatsPkts defined in RFC 2819.
+This statistic counts the total number of packets received by the controller,
+including error, discarded, unicast, multicast, and broadcast packets.
+
+"rx_undersize" is equivalent to etherStatsUndersizePkts defined in RFC 2819.
+This statistic counts the number of correctly formed packets received less
+than 64 bytes long.
+
+"rx_oversize" is equivalent to etherStatsOversizePkts defined in RFC 2819.
+This statistic counts the number of correctly formed packets greater than 1518
+bytes long.
+
+"rx_64_bytes" is equivalent to etherStatsPkts64Octets defined in RFC 2819.
+This statistic counts the total number of packets received that were 64 octets
+in length.
+
+"rx_65_127_bytes" is equivalent to etherStatsPkts65to127Octets defined in RFC
+2819. This statistic counts the total number of packets received that were
+between 65 and 127 octets in length inclusive.
+
+"rx_128_255_bytes" is equivalent to etherStatsPkts128to255Octets defined in
+RFC 2819. This statistic is the total number of packets received that were
+between 128 and 255 octets in length inclusive.
+
+"rx_256_511_bytes" is equivalent to etherStatsPkts256to511Octets defined in
+RFC 2819. This statistic is the total number of packets received that were
+between 256 and 511 octets in length inclusive.
+
+"rx_512_1023_bytes" is equivalent to etherStatsPkts512to1023Octets defined in
+RFC 2819. This statistic is the total number of packets received that were
+between 512 and 1023 octets in length inclusive.
+
+"rx_1024_1518_bytes" is equivalent to etherStatsPkts1024to1518Octets defined
+in RFC 2819. This statistic is the total number of packets received that were
+between 1024 and 1518 octets in length inclusive.
+
+"rx_gte_1519_bytes" is a statistic defined specific to the behavior of the
+Altera TSE. This statistic counts the number of received good and errored
+frames between the length of 1519 and the maximum frame length configured
+in the frm_length register. See the Altera TSE User Guide for More details.
+
+"rx_jabbers" is equivalent to etherStatsJabbers defined in RFC 2819. This
+statistic is the total number of packets received that were longer than 1518
+octets, and had either a bad CRC with an integral number of octets (CRC Error)
+or a bad CRC with a non-integral number of octets (Alignment Error).
+
+"rx_runts" is equivalent to etherStatsFragments defined in RFC 2819. This
+statistic is the total number of packets received that were less than 64 octets
+in length and had either a bad CRC with an integral number of octets (CRC
+error) or a bad CRC with a non-integral number of octets (Alignment Error).
index 4ebbd659256fbfe3cdf450bfed814066ae2df575..43d3549366a09d6cfbb7e4b284bb01d04fd314a0 100644 (file)
@@ -36,54 +36,6 @@ Default Value: 0
 This parameter adds support for SR-IOV.  It causes the driver to spawn up to
 max_vfs worth of virtual function.
 
-QueuePairs
-----------
-Valid Range:  0-1
-Default Value:  1 (TX and RX will be paired onto one interrupt vector)
-
-If set to 0, when MSI-X is enabled, the TX and RX will attempt to occupy
-separate vectors.
-
-This option can be overridden to 1 if there are not sufficient interrupts
-available.  This can occur if any combination of RSS, VMDQ, and max_vfs
-results in more than 4 queues being used.
-
-Node
-----
-Valid Range:   0-n
-Default Value: -1 (off)
-
-  0 - n: where n is the number of the NUMA node that should be used to
-         allocate memory for this adapter port.
-  -1: uses the driver default of allocating memory on whichever processor is
-      running insmod/modprobe.
-
-  The Node parameter will allow you to pick which NUMA node you want to have
-  the adapter allocate memory from.  All driver structures, in-memory queues,
-  and receive buffers will be allocated on the node specified.  This parameter
-  is only useful when interrupt affinity is specified, otherwise some portion
-  of the time the interrupt could run on a different core than the memory is
-  allocated on, causing slower memory access and impacting throughput, CPU, or
-  both.
-
-EEE
----
-Valid Range:  0-1
-Default Value: 1 (enabled)
-
-  A link between two EEE-compliant devices will result in periodic bursts of
-  data followed by long periods where in the link is in an idle state. This Low
-  Power Idle (LPI) state is supported in both 1Gbps and 100Mbps link speeds.
-  NOTE: EEE support requires autonegotiation.
-
-DMAC
-----
-Valid Range: 0-1
-Default Value: 1 (enabled)
-  Enables or disables DMA Coalescing feature.
-
-
-
 Additional Configurations
 =========================
 
index 048c92b487f6a50b552cf12d47abe4212ba66f36..bc35541249032c4d64c8888741eddc64f87e4f2e 100644 (file)
@@ -202,6 +202,9 @@ Time stamps for outgoing packets are to be generated as follows:
   and not free the skb. A driver not supporting hardware time stamping doesn't
   do that. A driver must never touch sk_buff::tstamp! It is used to store
   software generated time stamps by the network subsystem.
+- Driver should call skb_tx_timestamp() as close to passing sk_buff to hardware
+  as possible. skb_tx_timestamp() provides a software time stamp if requested
+  and hardware timestamping is not possible (SKBTX_IN_PROGRESS not set).
 - As soon as the driver has sent the packet and/or obtained a
   hardware time stamp for it, it passes the time stamp back by
   calling skb_hwtstamp_tx() with the original skb, the raw
@@ -212,6 +215,3 @@ Time stamps for outgoing packets are to be generated as follows:
   this would occur at a later time in the processing pipeline than other
   software time stamping and therefore could lead to unexpected deltas
   between time stamps.
-- If the driver did not set the SKBTX_IN_PROGRESS flag (see above), then
-  dev_hard_start_xmit() checks whether software time stamping
-  is wanted as fallback and potentially generates the time stamp.
index 0dc5e0c54adf5a5f091c5984f74c1217a56a0b18..9109eab722f5d5d5fa92bee20bed666d631c35ad 100644 (file)
@@ -536,6 +536,13 @@ S: Odd Fixes
 L:     linux-alpha@vger.kernel.org
 F:     arch/alpha/
 
+ALTERA TRIPLE SPEED ETHERNET DRIVER
+M:     Vince Bridgers <vbridgers2013@gmail.com>
+L:     netdev@vger.kernel.org
+L:     nios2-dev@lists.rocketboards.org (moderated for non-subscribers)
+S:     Maintained
+F:     drivers/net/ethernet/altera/
+
 ALTERA UART/JTAG UART SERIAL DRIVERS
 M:     Tobias Klauser <tklauser@distanz.ch>
 L:     linux-serial@vger.kernel.org
index 494b888a65681bde29911f13bea5ab02ed1c32d2..89402c3b64f8406ebc0e81a7d79fb341fa57df3d 100644 (file)
@@ -177,11 +177,6 @@ config NETCONSOLE_DYNAMIC
 config NETPOLL
        def_bool NETCONSOLE
 
-config NETPOLL_TRAP
-       bool "Netpoll traffic trapping"
-       default n
-       depends on NETPOLL
-
 config NET_POLL_CONTROLLER
        def_bool NETPOLL
 
index dee2a84a2929e1380ac679e3785323781824faf2..b667a51ed21517a3ee6cf2be6ab4c7e306a713a2 100644 (file)
@@ -1284,11 +1284,11 @@ static void ad_port_selection_logic(struct port *port)
                        /* meaning: the port was related to an aggregator
                         * but was not on the aggregator port list
                         */
-                       pr_warn("%s: Warning: Port %d (on %s) was related to aggregator %d but was not on its port list\n",
-                               port->slave->bond->dev->name,
-                               port->actor_port_number,
-                               port->slave->dev->name,
-                               port->aggregator->aggregator_identifier);
+                       pr_warn_ratelimited("%s: Warning: Port %d (on %s) was related to aggregator %d but was not on its port list\n",
+                                           port->slave->bond->dev->name,
+                                           port->actor_port_number,
+                                           port->slave->dev->name,
+                                           port->aggregator->aggregator_identifier);
                }
        }
        /* search on all aggregators for a suitable aggregator for this port */
@@ -1445,9 +1445,9 @@ static struct aggregator *ad_agg_selection_test(struct aggregator *best,
                break;
 
        default:
-               pr_warn("%s: Impossible agg select mode %d\n",
-                       curr->slave->bond->dev->name,
-                       __get_agg_selection_mode(curr->lag_ports));
+               pr_warn_ratelimited("%s: Impossible agg select mode %d\n",
+                                   curr->slave->bond->dev->name,
+                                   __get_agg_selection_mode(curr->lag_ports));
                break;
        }
 
@@ -1560,9 +1560,9 @@ static void ad_agg_selection_logic(struct aggregator *agg)
 
                /* check if any partner replys */
                if (best->is_individual) {
-                       pr_warn("%s: Warning: No 802.3ad response from the link partner for any adapters in the bond\n",
-                               best->slave ?
-                               best->slave->bond->dev->name : "NULL");
+                       pr_warn_ratelimited("%s: Warning: No 802.3ad response from the link partner for any adapters in the bond\n",
+                                           best->slave ?
+                                           best->slave->bond->dev->name : "NULL");
                }
 
                best->is_active = 1;
@@ -2081,8 +2081,8 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
                /* select the active aggregator for the bond */
                if (port) {
                        if (!port->slave) {
-                               pr_warn("%s: Warning: bond's first port is uninitialized\n",
-                                       bond->dev->name);
+                               pr_warn_ratelimited("%s: Warning: bond's first port is uninitialized\n",
+                                                   bond->dev->name);
                                goto re_arm;
                        }
 
@@ -2096,8 +2096,8 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
        bond_for_each_slave_rcu(bond, slave, iter) {
                port = &(SLAVE_AD_INFO(slave).port);
                if (!port->slave) {
-                       pr_warn("%s: Warning: Found an uninitialized port\n",
-                               bond->dev->name);
+                       pr_warn_ratelimited("%s: Warning: Found an uninitialized port\n",
+                                           bond->dev->name);
                        goto re_arm;
                }
 
@@ -2158,8 +2158,8 @@ static int bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave,
                port = &(SLAVE_AD_INFO(slave).port);
 
                if (!port->slave) {
-                       pr_warn("%s: Warning: port of slave %s is uninitialized\n",
-                               slave->dev->name, slave->bond->dev->name);
+                       pr_warn_ratelimited("%s: Warning: port of slave %s is uninitialized\n",
+                                           slave->dev->name, slave->bond->dev->name);
                        return ret;
                }
 
index 506b0248c4001b48382f3ef6be2ccbc47724c8fa..39484b534f5e7cf20118998d888b4f168b3765ac 100644 (file)
@@ -22,6 +22,7 @@ source "drivers/net/ethernet/adaptec/Kconfig"
 source "drivers/net/ethernet/aeroflex/Kconfig"
 source "drivers/net/ethernet/allwinner/Kconfig"
 source "drivers/net/ethernet/alteon/Kconfig"
+source "drivers/net/ethernet/altera/Kconfig"
 source "drivers/net/ethernet/amd/Kconfig"
 source "drivers/net/ethernet/apple/Kconfig"
 source "drivers/net/ethernet/arc/Kconfig"
index c0b8789952e711fb77e44fc214d06ee8cac5e8f3..adf61af507f7487ef4221f5fa121885a52f19f69 100644 (file)
@@ -8,6 +8,7 @@ obj-$(CONFIG_NET_VENDOR_ADAPTEC) += adaptec/
 obj-$(CONFIG_GRETH) += aeroflex/
 obj-$(CONFIG_NET_VENDOR_ALLWINNER) += allwinner/
 obj-$(CONFIG_NET_VENDOR_ALTEON) += alteon/
+obj-$(CONFIG_ALTERA_TSE) += altera/
 obj-$(CONFIG_NET_VENDOR_AMD) += amd/
 obj-$(CONFIG_NET_VENDOR_APPLE) += apple/
 obj-$(CONFIG_NET_VENDOR_ARC) += arc/
diff --git a/drivers/net/ethernet/altera/Kconfig b/drivers/net/ethernet/altera/Kconfig
new file mode 100644 (file)
index 0000000..80c1ab7
--- /dev/null
@@ -0,0 +1,8 @@
+config ALTERA_TSE
+       tristate "Altera Triple-Speed Ethernet MAC support"
+       select PHYLIB
+       ---help---
+         This driver supports the Altera Triple-Speed (TSE) Ethernet MAC.
+
+         To compile this driver as a module, choose M here. The module
+         will be called altera_tse.
diff --git a/drivers/net/ethernet/altera/Makefile b/drivers/net/ethernet/altera/Makefile
new file mode 100644 (file)
index 0000000..d4a187e
--- /dev/null
@@ -0,0 +1,7 @@
+#
+# Makefile for the Altera device drivers.
+#
+
+obj-$(CONFIG_ALTERA_TSE) += altera_tse.o
+altera_tse-objs := altera_tse_main.o altera_tse_ethtool.o \
+altera_msgdma.o altera_sgdma.o altera_utils.o
diff --git a/drivers/net/ethernet/altera/altera_msgdma.c b/drivers/net/ethernet/altera/altera_msgdma.c
new file mode 100644 (file)
index 0000000..3df1866
--- /dev/null
@@ -0,0 +1,202 @@
+/* Altera TSE SGDMA and MSGDMA Linux driver
+ * Copyright (C) 2014 Altera Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/netdevice.h>
+#include "altera_utils.h"
+#include "altera_tse.h"
+#include "altera_msgdmahw.h"
+
+/* No initialization work to do for MSGDMA */
+int msgdma_initialize(struct altera_tse_private *priv)
+{
+       return 0;
+}
+
+void msgdma_uninitialize(struct altera_tse_private *priv)
+{
+}
+
+void msgdma_reset(struct altera_tse_private *priv)
+{
+       int counter;
+       struct msgdma_csr *txcsr =
+               (struct msgdma_csr *)priv->tx_dma_csr;
+       struct msgdma_csr *rxcsr =
+               (struct msgdma_csr *)priv->rx_dma_csr;
+
+       /* Reset Rx mSGDMA */
+       iowrite32(MSGDMA_CSR_STAT_MASK, &rxcsr->status);
+       iowrite32(MSGDMA_CSR_CTL_RESET, &rxcsr->control);
+
+       counter = 0;
+       while (counter++ < ALTERA_TSE_SW_RESET_WATCHDOG_CNTR) {
+               if (tse_bit_is_clear(&rxcsr->status,
+                                    MSGDMA_CSR_STAT_RESETTING))
+                       break;
+               udelay(1);
+       }
+
+       if (counter >= ALTERA_TSE_SW_RESET_WATCHDOG_CNTR)
+               netif_warn(priv, drv, priv->dev,
+                          "TSE Rx mSGDMA resetting bit never cleared!\n");
+
+       /* clear all status bits */
+       iowrite32(MSGDMA_CSR_STAT_MASK, &rxcsr->status);
+
+       /* Reset Tx mSGDMA */
+       iowrite32(MSGDMA_CSR_STAT_MASK, &txcsr->status);
+       iowrite32(MSGDMA_CSR_CTL_RESET, &txcsr->control);
+
+       counter = 0;
+       while (counter++ < ALTERA_TSE_SW_RESET_WATCHDOG_CNTR) {
+               if (tse_bit_is_clear(&txcsr->status,
+                                    MSGDMA_CSR_STAT_RESETTING))
+                       break;
+               udelay(1);
+       }
+
+       if (counter >= ALTERA_TSE_SW_RESET_WATCHDOG_CNTR)
+               netif_warn(priv, drv, priv->dev,
+                          "TSE Tx mSGDMA resetting bit never cleared!\n");
+
+       /* clear all status bits */
+       iowrite32(MSGDMA_CSR_STAT_MASK, &txcsr->status);
+}
+
+void msgdma_disable_rxirq(struct altera_tse_private *priv)
+{
+       struct msgdma_csr *csr = priv->rx_dma_csr;
+       tse_clear_bit(&csr->control, MSGDMA_CSR_CTL_GLOBAL_INTR);
+}
+
+void msgdma_enable_rxirq(struct altera_tse_private *priv)
+{
+       struct msgdma_csr *csr = priv->rx_dma_csr;
+       tse_set_bit(&csr->control, MSGDMA_CSR_CTL_GLOBAL_INTR);
+}
+
+void msgdma_disable_txirq(struct altera_tse_private *priv)
+{
+       struct msgdma_csr *csr = priv->tx_dma_csr;
+       tse_clear_bit(&csr->control, MSGDMA_CSR_CTL_GLOBAL_INTR);
+}
+
+void msgdma_enable_txirq(struct altera_tse_private *priv)
+{
+       struct msgdma_csr *csr = priv->tx_dma_csr;
+       tse_set_bit(&csr->control, MSGDMA_CSR_CTL_GLOBAL_INTR);
+}
+
+void msgdma_clear_rxirq(struct altera_tse_private *priv)
+{
+       struct msgdma_csr *csr = priv->rx_dma_csr;
+       iowrite32(MSGDMA_CSR_STAT_IRQ, &csr->status);
+}
+
+void msgdma_clear_txirq(struct altera_tse_private *priv)
+{
+       struct msgdma_csr *csr = priv->tx_dma_csr;
+       iowrite32(MSGDMA_CSR_STAT_IRQ, &csr->status);
+}
+
+/* return 0 to indicate transmit is pending */
+int msgdma_tx_buffer(struct altera_tse_private *priv, struct tse_buffer *buffer)
+{
+       struct msgdma_extended_desc *desc = priv->tx_dma_desc;
+
+       iowrite32(lower_32_bits(buffer->dma_addr), &desc->read_addr_lo);
+       iowrite32(upper_32_bits(buffer->dma_addr), &desc->read_addr_hi);
+       iowrite32(0, &desc->write_addr_lo);
+       iowrite32(0, &desc->write_addr_hi);
+       iowrite32(buffer->len, &desc->len);
+       iowrite32(0, &desc->burst_seq_num);
+       iowrite32(MSGDMA_DESC_TX_STRIDE, &desc->stride);
+       iowrite32(MSGDMA_DESC_CTL_TX_SINGLE, &desc->control);
+       return 0;
+}
+
+u32 msgdma_tx_completions(struct altera_tse_private *priv)
+{
+       u32 ready = 0;
+       u32 inuse;
+       u32 status;
+       struct msgdma_csr *txcsr =
+               (struct msgdma_csr *)priv->tx_dma_csr;
+
+       /* Get number of sent descriptors */
+       inuse = ioread32(&txcsr->rw_fill_level) & 0xffff;
+
+       if (inuse) { /* Tx FIFO is not empty */
+               ready = priv->tx_prod - priv->tx_cons - inuse - 1;
+       } else {
+               /* Check for buffered last packet */
+               status = ioread32(&txcsr->status);
+               if (status & MSGDMA_CSR_STAT_BUSY)
+                       ready = priv->tx_prod - priv->tx_cons - 1;
+               else
+                       ready = priv->tx_prod - priv->tx_cons;
+       }
+       return ready;
+}
+
+/* Put buffer to the mSGDMA RX FIFO
+ */
+int msgdma_add_rx_desc(struct altera_tse_private *priv,
+                       struct tse_buffer *rxbuffer)
+{
+       struct msgdma_extended_desc *desc = priv->rx_dma_desc;
+       u32 len = priv->rx_dma_buf_sz;
+       dma_addr_t dma_addr = rxbuffer->dma_addr;
+       u32 control = (MSGDMA_DESC_CTL_END_ON_EOP
+                       | MSGDMA_DESC_CTL_END_ON_LEN
+                       | MSGDMA_DESC_CTL_TR_COMP_IRQ
+                       | MSGDMA_DESC_CTL_EARLY_IRQ
+                       | MSGDMA_DESC_CTL_TR_ERR_IRQ
+                       | MSGDMA_DESC_CTL_GO);
+
+       iowrite32(0, &desc->read_addr_lo);
+       iowrite32(0, &desc->read_addr_hi);
+       iowrite32(lower_32_bits(dma_addr), &desc->write_addr_lo);
+       iowrite32(upper_32_bits(dma_addr), &desc->write_addr_hi);
+       iowrite32(len, &desc->len);
+       iowrite32(0, &desc->burst_seq_num);
+       iowrite32(MSGDMA_DESC_RX_STRIDE, &desc->stride);
+       iowrite32(control, &desc->control);
+       return 1;
+}
+
+/* status is returned on upper 16 bits,
+ * length is returned in lower 16 bits
+ */
+u32 msgdma_rx_status(struct altera_tse_private *priv)
+{
+       u32 rxstatus = 0;
+       u32 pktlength;
+       u32 pktstatus;
+       struct msgdma_csr *rxcsr =
+               (struct msgdma_csr *)priv->rx_dma_csr;
+       struct msgdma_response *rxresp =
+               (struct msgdma_response *)priv->rx_dma_resp;
+
+       if (ioread32(&rxcsr->resp_fill_level) & 0xffff) {
+               pktlength = ioread32(&rxresp->bytes_transferred);
+               pktstatus = ioread32(&rxresp->status);
+               rxstatus = pktstatus;
+               rxstatus = rxstatus << 16;
+               rxstatus |= (pktlength & 0xffff);
+       }
+       return rxstatus;
+}
diff --git a/drivers/net/ethernet/altera/altera_msgdma.h b/drivers/net/ethernet/altera/altera_msgdma.h
new file mode 100644 (file)
index 0000000..7f0f5bf
--- /dev/null
@@ -0,0 +1,34 @@
+/* Altera TSE SGDMA and MSGDMA Linux driver
+ * Copyright (C) 2014 Altera Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ALTERA_MSGDMA_H__
+#define __ALTERA_MSGDMA_H__
+
+void msgdma_reset(struct altera_tse_private *);
+void msgdma_enable_txirq(struct altera_tse_private *);
+void msgdma_enable_rxirq(struct altera_tse_private *);
+void msgdma_disable_rxirq(struct altera_tse_private *);
+void msgdma_disable_txirq(struct altera_tse_private *);
+void msgdma_clear_rxirq(struct altera_tse_private *);
+void msgdma_clear_txirq(struct altera_tse_private *);
+u32 msgdma_tx_completions(struct altera_tse_private *);
+int msgdma_add_rx_desc(struct altera_tse_private *, struct tse_buffer *);
+int msgdma_tx_buffer(struct altera_tse_private *, struct tse_buffer *);
+u32 msgdma_rx_status(struct altera_tse_private *);
+int msgdma_initialize(struct altera_tse_private *);
+void msgdma_uninitialize(struct altera_tse_private *);
+
+#endif /*  __ALTERA_MSGDMA_H__ */
diff --git a/drivers/net/ethernet/altera/altera_msgdmahw.h b/drivers/net/ethernet/altera/altera_msgdmahw.h
new file mode 100644 (file)
index 0000000..d7b59ba
--- /dev/null
@@ -0,0 +1,167 @@
+/* Altera TSE SGDMA and MSGDMA Linux driver
+ * Copyright (C) 2014 Altera Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ALTERA_MSGDMAHW_H__
+#define __ALTERA_MSGDMAHW_H__
+
+/* mSGDMA standard descriptor format
+ */
+struct msgdma_desc {
+       u32 read_addr;  /* data buffer source address */
+       u32 write_addr; /* data buffer destination address */
+       u32 len;        /* the number of bytes to transfer per descriptor */
+       u32 control;    /* characteristics of the transfer */
+};
+
+/* mSGDMA extended descriptor format
+ */
+struct msgdma_extended_desc {
+       u32 read_addr_lo;       /* data buffer source address low bits */
+       u32 write_addr_lo;      /* data buffer destination address low bits */
+       u32 len;                /* the number of bytes to transfer
+                                * per descriptor
+                                */
+       u32 burst_seq_num;      /* bit 31:24 write burst
+                                * bit 23:16 read burst
+                                * bit 15:0  sequence number
+                                */
+       u32 stride;             /* bit 31:16 write stride
+                                * bit 15:0  read stride
+                                */
+       u32 read_addr_hi;       /* data buffer source address high bits */
+       u32 write_addr_hi;      /* data buffer destination address high bits */
+       u32 control;            /* characteristics of the transfer */
+};
+
+/* mSGDMA descriptor control field bit definitions
+ */
+#define MSGDMA_DESC_CTL_SET_CH(x)      ((x) & 0xff)
+#define MSGDMA_DESC_CTL_GEN_SOP                BIT(8)
+#define MSGDMA_DESC_CTL_GEN_EOP                BIT(9)
+#define MSGDMA_DESC_CTL_PARK_READS     BIT(10)
+#define MSGDMA_DESC_CTL_PARK_WRITES    BIT(11)
+#define MSGDMA_DESC_CTL_END_ON_EOP     BIT(12)
+#define MSGDMA_DESC_CTL_END_ON_LEN     BIT(13)
+#define MSGDMA_DESC_CTL_TR_COMP_IRQ    BIT(14)
+#define MSGDMA_DESC_CTL_EARLY_IRQ      BIT(15)
+#define MSGDMA_DESC_CTL_TR_ERR_IRQ     (0xff << 16)
+#define MSGDMA_DESC_CTL_EARLY_DONE     BIT(24)
+/* Writing '1' to the 'go' bit commits the entire descriptor into the
+ * descriptor FIFO(s)
+ */
+#define MSGDMA_DESC_CTL_GO             BIT(31)
+
+/* Tx buffer control flags
+ */
+#define MSGDMA_DESC_CTL_TX_FIRST       (MSGDMA_DESC_CTL_GEN_SOP |      \
+                                        MSGDMA_DESC_CTL_TR_ERR_IRQ |   \
+                                        MSGDMA_DESC_CTL_GO)
+
+#define MSGDMA_DESC_CTL_TX_MIDDLE      (MSGDMA_DESC_CTL_TR_ERR_IRQ |   \
+                                        MSGDMA_DESC_CTL_GO)
+
+#define MSGDMA_DESC_CTL_TX_LAST                (MSGDMA_DESC_CTL_GEN_EOP |      \
+                                        MSGDMA_DESC_CTL_TR_COMP_IRQ |  \
+                                        MSGDMA_DESC_CTL_TR_ERR_IRQ |   \
+                                        MSGDMA_DESC_CTL_GO)
+
+#define MSGDMA_DESC_CTL_TX_SINGLE      (MSGDMA_DESC_CTL_GEN_SOP |      \
+                                        MSGDMA_DESC_CTL_GEN_EOP |      \
+                                        MSGDMA_DESC_CTL_TR_COMP_IRQ |  \
+                                        MSGDMA_DESC_CTL_TR_ERR_IRQ |   \
+                                        MSGDMA_DESC_CTL_GO)
+
+#define MSGDMA_DESC_CTL_RX_SINGLE      (MSGDMA_DESC_CTL_END_ON_EOP |   \
+                                        MSGDMA_DESC_CTL_END_ON_LEN |   \
+                                        MSGDMA_DESC_CTL_TR_COMP_IRQ |  \
+                                        MSGDMA_DESC_CTL_EARLY_IRQ |    \
+                                        MSGDMA_DESC_CTL_TR_ERR_IRQ |   \
+                                        MSGDMA_DESC_CTL_GO)
+
+/* mSGDMA extended descriptor stride definitions
+ */
+#define MSGDMA_DESC_TX_STRIDE          (0x00010001)
+#define MSGDMA_DESC_RX_STRIDE          (0x00010001)
+
+/* mSGDMA dispatcher control and status register map
+ */
+struct msgdma_csr {
+       u32 status;             /* Read/Clear */
+       u32 control;            /* Read/Write */
+       u32 rw_fill_level;      /* bit 31:16 - write fill level
+                                * bit 15:0  - read fill level
+                                */
+       u32 resp_fill_level;    /* bit 15:0 */
+       u32 rw_seq_num;         /* bit 31:16 - write sequence number
+                                * bit 15:0  - read sequence number
+                                */
+       u32 pad[3];             /* reserved */
+};
+
+/* mSGDMA CSR status register bit definitions
+ */
+#define MSGDMA_CSR_STAT_BUSY                   BIT(0)
+#define MSGDMA_CSR_STAT_DESC_BUF_EMPTY         BIT(1)
+#define MSGDMA_CSR_STAT_DESC_BUF_FULL          BIT(2)
+#define MSGDMA_CSR_STAT_RESP_BUF_EMPTY         BIT(3)
+#define MSGDMA_CSR_STAT_RESP_BUF_FULL          BIT(4)
+#define MSGDMA_CSR_STAT_STOPPED                        BIT(5)
+#define MSGDMA_CSR_STAT_RESETTING              BIT(6)
+#define MSGDMA_CSR_STAT_STOPPED_ON_ERR         BIT(7)
+#define MSGDMA_CSR_STAT_STOPPED_ON_EARLY       BIT(8)
+#define MSGDMA_CSR_STAT_IRQ                    BIT(9)
+#define MSGDMA_CSR_STAT_MASK                   0x3FF
+#define MSGDMA_CSR_STAT_MASK_WITHOUT_IRQ       0x1FF
+
+#define MSGDMA_CSR_STAT_BUSY_GET(v)                    GET_BIT_VALUE(v, 0)
+#define MSGDMA_CSR_STAT_DESC_BUF_EMPTY_GET(v)          GET_BIT_VALUE(v, 1)
+#define MSGDMA_CSR_STAT_DESC_BUF_FULL_GET(v)           GET_BIT_VALUE(v, 2)
+#define MSGDMA_CSR_STAT_RESP_BUF_EMPTY_GET(v)          GET_BIT_VALUE(v, 3)
+#define MSGDMA_CSR_STAT_RESP_BUF_FULL_GET(v)           GET_BIT_VALUE(v, 4)
+#define MSGDMA_CSR_STAT_STOPPED_GET(v)                 GET_BIT_VALUE(v, 5)
+#define MSGDMA_CSR_STAT_RESETTING_GET(v)               GET_BIT_VALUE(v, 6)
+#define MSGDMA_CSR_STAT_STOPPED_ON_ERR_GET(v)          GET_BIT_VALUE(v, 7)
+#define MSGDMA_CSR_STAT_STOPPED_ON_EARLY_GET(v)                GET_BIT_VALUE(v, 8)
+#define MSGDMA_CSR_STAT_IRQ_GET(v)                     GET_BIT_VALUE(v, 9)
+
+/* mSGDMA CSR control register bit definitions
+ */
+#define MSGDMA_CSR_CTL_STOP                    BIT(0)
+#define MSGDMA_CSR_CTL_RESET                   BIT(1)
+#define MSGDMA_CSR_CTL_STOP_ON_ERR             BIT(2)
+#define MSGDMA_CSR_CTL_STOP_ON_EARLY           BIT(3)
+#define MSGDMA_CSR_CTL_GLOBAL_INTR             BIT(4)
+#define MSGDMA_CSR_CTL_STOP_DESCS              BIT(5)
+
+/* mSGDMA CSR fill level bits
+ */
+#define MSGDMA_CSR_WR_FILL_LEVEL_GET(v)                (((v) & 0xffff0000) >> 16)
+#define MSGDMA_CSR_RD_FILL_LEVEL_GET(v)                ((v) & 0x0000ffff)
+#define MSGDMA_CSR_RESP_FILL_LEVEL_GET(v)      ((v) & 0x0000ffff)
+
+/* mSGDMA response register map
+ */
+struct msgdma_response {
+       u32 bytes_transferred;
+       u32 status;
+};
+
+/* mSGDMA response register bit definitions
+ */
+#define MSGDMA_RESP_EARLY_TERM BIT(8)
+#define MSGDMA_RESP_ERR_MASK   0xFF
+
+#endif /* __ALTERA_MSGDMAHW_H__ */
diff --git a/drivers/net/ethernet/altera/altera_sgdma.c b/drivers/net/ethernet/altera/altera_sgdma.c
new file mode 100644 (file)
index 0000000..cbeee07
--- /dev/null
@@ -0,0 +1,511 @@
+/* Altera TSE SGDMA and MSGDMA Linux driver
+ * Copyright (C) 2014 Altera Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/list.h>
+#include "altera_utils.h"
+#include "altera_tse.h"
+#include "altera_sgdmahw.h"
+#include "altera_sgdma.h"
+
+static void sgdma_descrip(struct sgdma_descrip *desc,
+                         struct sgdma_descrip *ndesc,
+                         dma_addr_t ndesc_phys,
+                         dma_addr_t raddr,
+                         dma_addr_t waddr,
+                         u16 length,
+                         int generate_eop,
+                         int rfixed,
+                         int wfixed);
+
+static int sgdma_async_write(struct altera_tse_private *priv,
+                             struct sgdma_descrip *desc);
+
+static int sgdma_async_read(struct altera_tse_private *priv);
+
+static dma_addr_t
+sgdma_txphysaddr(struct altera_tse_private *priv,
+                struct sgdma_descrip *desc);
+
+static dma_addr_t
+sgdma_rxphysaddr(struct altera_tse_private *priv,
+                struct sgdma_descrip *desc);
+
+static int sgdma_txbusy(struct altera_tse_private *priv);
+
+static int sgdma_rxbusy(struct altera_tse_private *priv);
+
+static void
+queue_tx(struct altera_tse_private *priv, struct tse_buffer *buffer);
+
+static void
+queue_rx(struct altera_tse_private *priv, struct tse_buffer *buffer);
+
+static struct tse_buffer *
+dequeue_tx(struct altera_tse_private *priv);
+
+static struct tse_buffer *
+dequeue_rx(struct altera_tse_private *priv);
+
+static struct tse_buffer *
+queue_rx_peekhead(struct altera_tse_private *priv);
+
+int sgdma_initialize(struct altera_tse_private *priv)
+{
+       priv->txctrlreg = SGDMA_CTRLREG_ILASTD;
+
+       priv->rxctrlreg = SGDMA_CTRLREG_IDESCRIP |
+                     SGDMA_CTRLREG_ILASTD;
+
+       INIT_LIST_HEAD(&priv->txlisthd);
+       INIT_LIST_HEAD(&priv->rxlisthd);
+
+       priv->rxdescphys = (dma_addr_t) 0;
+       priv->txdescphys = (dma_addr_t) 0;
+
+       priv->rxdescphys = dma_map_single(priv->device, priv->rx_dma_desc,
+                                         priv->rxdescmem, DMA_BIDIRECTIONAL);
+
+       if (dma_mapping_error(priv->device, priv->rxdescphys)) {
+               sgdma_uninitialize(priv);
+               netdev_err(priv->dev, "error mapping rx descriptor memory\n");
+               return -EINVAL;
+       }
+       /* map the tx descriptor region (was a copy/paste of the rx map) */
+       priv->txdescphys = dma_map_single(priv->device, priv->tx_dma_desc,
+                                         priv->txdescmem, DMA_TO_DEVICE);
+
+       if (dma_mapping_error(priv->device, priv->txdescphys)) {
+               sgdma_uninitialize(priv);
+               netdev_err(priv->dev, "error mapping tx descriptor memory\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+void sgdma_uninitialize(struct altera_tse_private *priv)
+{
+       if (priv->rxdescphys)
+               dma_unmap_single(priv->device, priv->rxdescphys,
+                                priv->rxdescmem, DMA_BIDIRECTIONAL);
+
+       if (priv->txdescphys)
+               dma_unmap_single(priv->device, priv->txdescphys,
+                                priv->txdescmem, DMA_TO_DEVICE);
+}
+
+/* This function resets the SGDMA controller and clears the
+ * descriptor memory used for transmits and receives.
+ */
+void sgdma_reset(struct altera_tse_private *priv)
+{
+       u32 *ptxdescripmem = (u32 *)priv->tx_dma_desc;
+       u32 txdescriplen   = priv->txdescmem;
+       u32 *prxdescripmem = (u32 *)priv->rx_dma_desc;
+       u32 rxdescriplen   = priv->rxdescmem;
+       struct sgdma_csr *ptxsgdma = (struct sgdma_csr *)priv->tx_dma_csr;
+       struct sgdma_csr *prxsgdma = (struct sgdma_csr *)priv->rx_dma_csr;
+
+       /* Initialize descriptor memory to 0 */
+       memset(ptxdescripmem, 0, txdescriplen);
+       memset(prxdescripmem, 0, rxdescriplen);
+
+       iowrite32(SGDMA_CTRLREG_RESET, &ptxsgdma->control);
+       iowrite32(0, &ptxsgdma->control);
+
+       iowrite32(SGDMA_CTRLREG_RESET, &prxsgdma->control);
+       iowrite32(0, &prxsgdma->control);
+}
+
+void sgdma_enable_rxirq(struct altera_tse_private *priv)
+{
+       struct sgdma_csr *csr = (struct sgdma_csr *)priv->rx_dma_csr;
+       priv->rxctrlreg |= SGDMA_CTRLREG_INTEN;
+       tse_set_bit(&csr->control, SGDMA_CTRLREG_INTEN);
+}
+
+void sgdma_enable_txirq(struct altera_tse_private *priv)
+{
+       struct sgdma_csr *csr = (struct sgdma_csr *)priv->tx_dma_csr;
+       priv->txctrlreg |= SGDMA_CTRLREG_INTEN;
+       tse_set_bit(&csr->control, SGDMA_CTRLREG_INTEN);
+}
+
+/* for SGDMA, RX interrupts remain enabled after enabling */
+void sgdma_disable_rxirq(struct altera_tse_private *priv)
+{
+}
+
+/* for SGDMA, TX interrupts remain enabled after enabling */
+void sgdma_disable_txirq(struct altera_tse_private *priv)
+{
+}
+
+void sgdma_clear_rxirq(struct altera_tse_private *priv)
+{
+       struct sgdma_csr *csr = (struct sgdma_csr *)priv->rx_dma_csr;
+       tse_set_bit(&csr->control, SGDMA_CTRLREG_CLRINT);
+}
+
+void sgdma_clear_txirq(struct altera_tse_private *priv)
+{
+       struct sgdma_csr *csr = (struct sgdma_csr *)priv->tx_dma_csr;
+       tse_set_bit(&csr->control, SGDMA_CTRLREG_CLRINT);
+}
+
+/* transmits buffer through SGDMA. Returns number of buffers
+ * transmitted, 0 if not possible.
+ *
+ * tx_lock is held by the caller
+ */
+int sgdma_tx_buffer(struct altera_tse_private *priv, struct tse_buffer *buffer)
+{
+       int pktstx = 0;
+       struct sgdma_descrip *descbase =
+               (struct sgdma_descrip *)priv->tx_dma_desc;
+
+       struct sgdma_descrip *cdesc = &descbase[0];
+       struct sgdma_descrip *ndesc = &descbase[1];
+
+       /* wait 'til the tx sgdma is ready for the next transmit request */
+       if (sgdma_txbusy(priv))
+               return 0;
+
+       sgdma_descrip(cdesc,                    /* current descriptor */
+                     ndesc,                    /* next descriptor */
+                     sgdma_txphysaddr(priv, ndesc),
+                     buffer->dma_addr,         /* address of packet to xmit */
+                     0,                        /* write addr 0 for tx dma */
+                     buffer->len,              /* length of packet */
+                     SGDMA_CONTROL_EOP,        /* Generate EOP */
+                     0,                        /* read fixed */
+                     SGDMA_CONTROL_WR_FIXED);  /* Generate SOP */
+
+       pktstx = sgdma_async_write(priv, cdesc);
+
+       /* enqueue the request to the pending transmit queue */
+       queue_tx(priv, buffer);
+
+       return 1;
+}
+
+
+/* tx_lock held to protect access to queued tx list
+ */
+u32 sgdma_tx_completions(struct altera_tse_private *priv)
+{
+       u32 ready = 0;
+       struct sgdma_descrip *desc = (struct sgdma_descrip *)priv->tx_dma_desc;
+
+       if (!sgdma_txbusy(priv) &&
+           ((desc->control & SGDMA_CONTROL_HW_OWNED) == 0) &&
+           (dequeue_tx(priv))) {
+               ready = 1;
+       }
+
+       return ready;
+}
+
+int sgdma_add_rx_desc(struct altera_tse_private *priv,
+                     struct tse_buffer *rxbuffer)
+{
+       queue_rx(priv, rxbuffer);
+       return sgdma_async_read(priv);
+}
+
+/* status is returned on upper 16 bits,
+ * length is returned in lower 16 bits
+ */
+u32 sgdma_rx_status(struct altera_tse_private *priv)
+{
+       struct sgdma_csr *csr = (struct sgdma_csr *)priv->rx_dma_csr;
+       struct sgdma_descrip *base = (struct sgdma_descrip *)priv->rx_dma_desc;
+       struct sgdma_descrip *desc = NULL;
+       int pktsrx;
+       unsigned int rxstatus = 0;
+       unsigned int pktlength = 0;
+       unsigned int pktstatus = 0;
+       struct tse_buffer *rxbuffer = NULL;
+
+       dma_sync_single_for_cpu(priv->device,
+                               priv->rxdescphys,
+                               priv->rxdescmem,
+                               DMA_BIDIRECTIONAL);
+
+       desc = &base[0];
+       if ((ioread32(&csr->status) & SGDMA_STSREG_EOP) ||
+           (desc->status & SGDMA_STATUS_EOP)) {
+               pktlength = desc->bytes_xferred;
+               pktstatus = desc->status & 0x3f;
+               rxstatus = pktstatus;
+               rxstatus = rxstatus << 16;
+               rxstatus |= (pktlength & 0xffff);
+
+               desc->status = 0;
+
+               rxbuffer = dequeue_rx(priv);
+               if (rxbuffer == NULL)
+                       netdev_err(priv->dev,
+                                  "sgdma rx and rx queue empty!\n");
+
+               /* kick the rx sgdma after reaping this descriptor */
+               pktsrx = sgdma_async_read(priv);
+       }
+
+       return rxstatus;
+}
+
+
+/* Private functions */
+static void sgdma_descrip(struct sgdma_descrip *desc,
+                         struct sgdma_descrip *ndesc,
+                         dma_addr_t ndesc_phys,
+                         dma_addr_t raddr,
+                         dma_addr_t waddr,
+                         u16 length,
+                         int generate_eop,
+                         int rfixed,
+                         int wfixed)
+{
+       /* Clear the next descriptor as not owned by hardware */
+       u32 ctrl = ndesc->control;
+       ctrl &= ~SGDMA_CONTROL_HW_OWNED;
+       ndesc->control = ctrl;
+
+       ctrl = 0;
+       ctrl = SGDMA_CONTROL_HW_OWNED;
+       ctrl |= generate_eop;
+       ctrl |= rfixed;
+       ctrl |= wfixed;
+
+       /* Channel is implicitly zero, initialized to 0 by default */
+
+       desc->raddr = raddr;
+       desc->waddr = waddr;
+       desc->next = lower_32_bits(ndesc_phys);
+       desc->control = ctrl;
+       desc->status = 0;
+       desc->rburst = 0;
+       desc->wburst = 0;
+       desc->bytes = length;
+       desc->bytes_xferred = 0;
+}
+
+/* If hardware is busy, don't restart async read.
+ * if status register is 0 - meaning initial state, restart async read,
+ * probably for the first time when populating a receive buffer.
+ * If read status indicate not busy and a status, restart the async
+ * DMA read.
+ */
+static int sgdma_async_read(struct altera_tse_private *priv)
+{
+       struct sgdma_csr *csr = (struct sgdma_csr *)priv->rx_dma_csr;
+       struct sgdma_descrip *descbase =
+               (struct sgdma_descrip *)priv->rx_dma_desc;
+
+       struct sgdma_descrip *cdesc = &descbase[0];
+       struct sgdma_descrip *ndesc = &descbase[1];
+
+       unsigned int sts = ioread32(&csr->status);
+       struct tse_buffer *rxbuffer = NULL;
+
+       if (!sgdma_rxbusy(priv)) {
+               rxbuffer = queue_rx_peekhead(priv);
+               if (rxbuffer == NULL)
+                       return 0;
+
+               sgdma_descrip(cdesc,            /* current descriptor */
+                             ndesc,            /* next descriptor */
+                             sgdma_rxphysaddr(priv, ndesc),
+                             0,                /* read addr 0 for rx dma */
+                             rxbuffer->dma_addr, /* write addr for rx dma */
+                             0,                /* read 'til EOP */
+                             0,                /* EOP: NA for rx dma */
+                             0,                /* read fixed: NA for rx dma */
+                             0);               /* SOP: NA for rx DMA */
+
+               /* clear control and status */
+               iowrite32(0, &csr->control);
+
+               /* If status available, clear those bits */
+               if (sts & 0xf)
+                       iowrite32(0xf, &csr->status);
+
+               dma_sync_single_for_device(priv->device,
+                                          priv->rxdescphys,
+                                          priv->rxdescmem,
+                                          DMA_BIDIRECTIONAL);
+
+               iowrite32(lower_32_bits(sgdma_rxphysaddr(priv, cdesc)),
+                         &csr->next_descrip);
+
+               iowrite32((priv->rxctrlreg | SGDMA_CTRLREG_START),
+                         &csr->control);
+
+               return 1;
+       }
+
+       return 0;
+}
+
+static int sgdma_async_write(struct altera_tse_private *priv,
+                            struct sgdma_descrip *desc)
+{
+       struct sgdma_csr *csr = (struct sgdma_csr *)priv->tx_dma_csr;
+
+       if (sgdma_txbusy(priv))
+               return 0;
+
+       /* clear control and status */
+       iowrite32(0, &csr->control);
+       iowrite32(0x1f, &csr->status);
+
+       dma_sync_single_for_device(priv->device, priv->txdescphys,
+                                  priv->txdescmem, DMA_TO_DEVICE);
+
+       iowrite32(lower_32_bits(sgdma_txphysaddr(priv, desc)),
+                 &csr->next_descrip);
+
+       iowrite32((priv->txctrlreg | SGDMA_CTRLREG_START),
+                 &csr->control);
+
+       return 1;
+}
+
+static dma_addr_t
+sgdma_txphysaddr(struct altera_tse_private *priv,
+                struct sgdma_descrip *desc)
+{
+       dma_addr_t paddr = priv->txdescmem_busaddr;
+       dma_addr_t offs = (dma_addr_t)((dma_addr_t)desc -
+                               (dma_addr_t)priv->tx_dma_desc);
+       return paddr + offs;
+}
+
+/* Translate a CPU pointer into the rx descriptor memory region to
+ * the corresponding bus address seen by the sgdma hardware.
+ */
+static dma_addr_t
+sgdma_rxphysaddr(struct altera_tse_private *priv,
+                struct sgdma_descrip *desc)
+{
+       dma_addr_t paddr = priv->rxdescmem_busaddr;
+       dma_addr_t offs = (dma_addr_t)((dma_addr_t)desc -
+                               (dma_addr_t)priv->rx_dma_desc);
+       return paddr + offs;
+}
+
+/* removes the first entry of @list and stores it in @entry
+ * (resolved to @type via @member), or sets @entry to NULL if
+ * the list is empty.
+ */
+#define list_remove_head(list, entry, type, member)                    \
+       do {                                                            \
+               entry = NULL;                                           \
+               if (!list_empty(list)) {                                \
+                       entry = list_entry((list)->next, type, member); \
+                       list_del_init(&entry->member);                  \
+               }                                                       \
+       } while (0)
+
+/* stores the first entry of @list in @entry (resolved to @type via
+ * @member) without removing it, or sets @entry to NULL if the list
+ * is empty.
+ */
+#define list_peek_head(list, entry, type, member)                      \
+       do {                                                            \
+               entry = NULL;                                           \
+               if (!list_empty(list)) {                                \
+                       entry = list_entry((list)->next, type, member); \
+               }                                                       \
+       } while (0)
+
+/* adds a tse_buffer to the tail of the tx buffer list.
+ * assumes the caller is managing and holding a mutual exclusion
+ * primitive to avoid simultaneous pushes/pops to the list.
+ */
+static void
+queue_tx(struct altera_tse_private *priv, struct tse_buffer *buffer)
+{
+       list_add_tail(&buffer->lh, &priv->txlisthd);
+}
+
+
+/* adds a tse_buffer to the tail of the rx buffer list.
+ * assumes the caller is managing and holding a mutual exclusion
+ * primitive to avoid simultaneous pushes/pops to the list.
+ */
+static void
+queue_rx(struct altera_tse_private *priv, struct tse_buffer *buffer)
+{
+       list_add_tail(&buffer->lh, &priv->rxlisthd);
+}
+
+/* removes and returns the tse_buffer at the head of the transmit
+ * buffer list, or returns NULL if the list is empty.
+ * assumes the caller is managing and holding a mutual exclusion
+ * primitive to avoid simultaneous pushes/pops to the list.
+ */
+static struct tse_buffer *
+dequeue_tx(struct altera_tse_private *priv)
+{
+       struct tse_buffer *buffer = NULL;
+       list_remove_head(&priv->txlisthd, buffer, struct tse_buffer, lh);
+       return buffer;
+}
+
+/* removes and returns the tse_buffer at the head of the receive
+ * buffer list, or returns NULL if the list is empty.
+ * assumes the caller is managing and holding a mutual exclusion
+ * primitive to avoid simultaneous pushes/pops to the list.
+ */
+static struct tse_buffer *
+dequeue_rx(struct altera_tse_private *priv)
+{
+       struct tse_buffer *buffer = NULL;
+       list_remove_head(&priv->rxlisthd, buffer, struct tse_buffer, lh);
+       return buffer;
+}
+
+/* returns the tse_buffer at the head of the receive buffer list
+ * without removing it, or NULL if the list is empty.
+ * assumes the caller is managing and holding a mutual exclusion
+ * primitive to avoid simultaneous pushes/pops to the list while the
+ * head is being examined.
+ */
+static struct tse_buffer *
+queue_rx_peekhead(struct altera_tse_private *priv)
+{
+       struct tse_buffer *buffer = NULL;
+       list_peek_head(&priv->rxlisthd, buffer, struct tse_buffer, lh);
+       return buffer;
+}
+
+/* check and return rx sgdma status without polling; nonzero means
+ * the controller is busy.
+ */
+static int sgdma_rxbusy(struct altera_tse_private *priv)
+{
+       struct sgdma_csr *csr = (struct sgdma_csr *)priv->rx_dma_csr;
+       return ioread32(&csr->status) & SGDMA_STSREG_BUSY;
+}
+
+/* waits for the tx sgdma to finish its current operation, returns 0
+ * when it transitions to nonbusy, returns 1 if the operation times out
+ */
+static int sgdma_txbusy(struct altera_tse_private *priv)
+{
+       int delay = 0;
+       struct sgdma_csr *csr = (struct sgdma_csr *)priv->tx_dma_csr;
+
+       /* if DMA is busy, wait up to ~100us for the current transaction
+        * to finish
+        */
+       while ((ioread32(&csr->status) & SGDMA_STSREG_BUSY) && (delay++ < 100))
+               udelay(1);
+
+       if (ioread32(&csr->status) & SGDMA_STSREG_BUSY) {
+               netdev_err(priv->dev, "timeout waiting for tx dma\n");
+               return 1;
+       }
+       return 0;
+}
diff --git a/drivers/net/ethernet/altera/altera_sgdma.h b/drivers/net/ethernet/altera/altera_sgdma.h
new file mode 100644 (file)
index 0000000..07d4717
--- /dev/null
@@ -0,0 +1,35 @@
+/* Altera TSE SGDMA and MSGDMA Linux driver
+ * Copyright (C) 2014 Altera Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ALTERA_SGDMA_H__
+#define __ALTERA_SGDMA_H__
+
+void sgdma_reset(struct altera_tse_private *);
+void sgdma_enable_txirq(struct altera_tse_private *);
+void sgdma_enable_rxirq(struct altera_tse_private *);
+void sgdma_disable_rxirq(struct altera_tse_private *);
+void sgdma_disable_txirq(struct altera_tse_private *);
+void sgdma_clear_rxirq(struct altera_tse_private *);
+void sgdma_clear_txirq(struct altera_tse_private *);
+int sgdma_tx_buffer(struct altera_tse_private *priv, struct tse_buffer *);
+u32 sgdma_tx_completions(struct altera_tse_private *);
+int sgdma_add_rx_desc(struct altera_tse_private *priv, struct tse_buffer *);
+void sgdma_status(struct altera_tse_private *);
+u32 sgdma_rx_status(struct altera_tse_private *);
+int sgdma_initialize(struct altera_tse_private *);
+void sgdma_uninitialize(struct altera_tse_private *);
+
+#endif /*  __ALTERA_SGDMA_H__ */
diff --git a/drivers/net/ethernet/altera/altera_sgdmahw.h b/drivers/net/ethernet/altera/altera_sgdmahw.h
new file mode 100644 (file)
index 0000000..ba3334f
--- /dev/null
@@ -0,0 +1,124 @@
+/* Altera TSE SGDMA and MSGDMA Linux driver
+ * Copyright (C) 2014 Altera Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ALTERA_SGDMAHW_H__
+#define __ALTERA_SGDMAHW_H__
+
+/* SGDMA descriptor structure */
+struct sgdma_descrip {
+       unsigned int    raddr; /* address of data to be read */
+       unsigned int    pad1;
+       unsigned int    waddr;
+       unsigned int    pad2;
+       unsigned int    next;
+       unsigned int    pad3;
+       unsigned short  bytes;
+       unsigned char   rburst;
+       unsigned char   wburst;
+       unsigned short  bytes_xferred;  /* 16 bits, bytes xferred */
+
+       /* bit 0: error
+        * bit 1: length error
+        * bit 2: crc error
+        * bit 3: truncated error
+        * bit 4: phy error
+        * bit 5: collision error
+        * bit 6: reserved
+        * bit 7: status eop for recv case
+        */
+       unsigned char   status;
+
+       /* bit 0: eop
+        * bit 1: read_fixed
+        * bit 2: write fixed
+        * bits 3,4,5,6: Channel (always 0)
+        * bit 7: hardware owned
+        */
+       unsigned char   control;
+} __packed;
+
+
+#define SGDMA_STATUS_ERR               BIT(0)
+#define SGDMA_STATUS_LENGTH_ERR                BIT(1)
+#define SGDMA_STATUS_CRC_ERR           BIT(2)
+#define SGDMA_STATUS_TRUNC_ERR         BIT(3)
+#define SGDMA_STATUS_PHY_ERR           BIT(4)
+#define SGDMA_STATUS_COLL_ERR          BIT(5)
+#define SGDMA_STATUS_EOP               BIT(7)
+
+#define SGDMA_CONTROL_EOP              BIT(0)
+#define SGDMA_CONTROL_RD_FIXED         BIT(1)
+#define SGDMA_CONTROL_WR_FIXED         BIT(2)
+
+/* Channel is always 0, so just zero initialize it */
+
+#define SGDMA_CONTROL_HW_OWNED         BIT(7)
+
+/* SGDMA register space */
+struct sgdma_csr {
+       /* bit 0: error
+        * bit 1: eop
+        * bit 2: descriptor completed
+        * bit 3: chain completed
+        * bit 4: busy
+        * remainder reserved
+        */
+       u32     status;
+       u32     pad1[3];
+
+       /* bit 0: interrupt on error
+        * bit 1: interrupt on eop
+        * bit 2: interrupt after every descriptor
+        * bit 3: interrupt after last descrip in a chain
+        * bit 4: global interrupt enable
+        * bit 5: starts descriptor processing
+        * bit 6: stop core on dma error
+        * bit 7: interrupt on max descriptors
+        * bits 8-15: max descriptors to generate interrupt
+        * bit 16: Software reset
+        * bit 17: clears owned by hardware if 0, does not clear otherwise
+        * bit 18: enables descriptor polling mode
+        * bit 19-26: clocks before polling again
+        * bit 27-30: reserved
+        * bit 31: clear interrupt
+        */
+       u32     control;
+       u32     pad2[3];
+       u32     next_descrip;
+       u32     pad3[3];
+};
+
+
+#define SGDMA_STSREG_ERR       BIT(0) /* Error */
+#define SGDMA_STSREG_EOP       BIT(1) /* EOP */
+#define SGDMA_STSREG_DESCRIP   BIT(2) /* Descriptor completed */
+#define SGDMA_STSREG_CHAIN     BIT(3) /* Chain completed */
+#define SGDMA_STSREG_BUSY      BIT(4) /* Controller busy */
+
+#define SGDMA_CTRLREG_IOE      BIT(0) /* Interrupt on error */
+#define SGDMA_CTRLREG_IOEOP    BIT(1) /* Interrupt on EOP */
+#define SGDMA_CTRLREG_IDESCRIP BIT(2) /* Interrupt after every descriptor */
+#define SGDMA_CTRLREG_ILASTD   BIT(3) /* Interrupt after last descriptor */
+#define SGDMA_CTRLREG_INTEN    BIT(4) /* Global Interrupt enable */
+#define SGDMA_CTRLREG_START    BIT(5) /* starts descriptor processing */
+#define SGDMA_CTRLREG_STOPERR  BIT(6) /* stop on dma error */
+#define SGDMA_CTRLREG_INTMAX   BIT(7) /* Interrupt on max descriptors */
+#define SGDMA_CTRLREG_RESET    BIT(16)/* Software reset */
+#define SGDMA_CTRLREG_COBHW    BIT(17)/* Clears owned by hardware */
+#define SGDMA_CTRLREG_POLL     BIT(18)/* enables descriptor polling mode */
+#define SGDMA_CTRLREG_CLRINT   BIT(31)/* Clears interrupt */
+
+#endif /* __ALTERA_SGDMAHW_H__ */
diff --git a/drivers/net/ethernet/altera/altera_tse.h b/drivers/net/ethernet/altera/altera_tse.h
new file mode 100644 (file)
index 0000000..8feeed0
--- /dev/null
@@ -0,0 +1,486 @@
+/* Altera Triple-Speed Ethernet MAC driver
+ * Copyright (C) 2008-2014 Altera Corporation. All rights reserved
+ *
+ * Contributors:
+ *   Dalon Westergreen
+ *   Thomas Chou
+ *   Ian Abbott
+ *   Yuriy Kozlov
+ *   Tobias Klauser
+ *   Andriy Smolskyy
+ *   Roman Bulgakov
+ *   Dmytro Mytarchuk
+ *   Matthew Gerlach
+ *
+ * Original driver contributed by SLS.
+ * Major updates contributed by GlobalLogic
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ALTERA_TSE_H__
+#define __ALTERA_TSE_H__
+
+#define ALTERA_TSE_RESOURCE_NAME       "altera_tse"
+
+#include <linux/bitops.h>
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+
+#define ALTERA_TSE_SW_RESET_WATCHDOG_CNTR      10000
+#define ALTERA_TSE_MAC_FIFO_WIDTH              4       /* TX/RX FIFO width in
+                                                        * bytes
+                                                        */
+/* Rx FIFO default settings */
+#define ALTERA_TSE_RX_SECTION_EMPTY    16
+#define ALTERA_TSE_RX_SECTION_FULL     0
+#define ALTERA_TSE_RX_ALMOST_EMPTY     8
+#define ALTERA_TSE_RX_ALMOST_FULL      8
+
+/* Tx FIFO default settings */
+#define ALTERA_TSE_TX_SECTION_EMPTY    16
+#define ALTERA_TSE_TX_SECTION_FULL     0
+#define ALTERA_TSE_TX_ALMOST_EMPTY     8
+#define ALTERA_TSE_TX_ALMOST_FULL      3
+
+/* MAC function configuration default settings */
+#define ALTERA_TSE_TX_IPG_LENGTH       12
+
+#define GET_BIT_VALUE(v, bit)          (((v) >> (bit)) & 0x1)
+
+/* MAC Command_Config Register Bit Definitions
+ */
+#define MAC_CMDCFG_TX_ENA                      BIT(0)
+#define MAC_CMDCFG_RX_ENA                      BIT(1)
+#define MAC_CMDCFG_XON_GEN                     BIT(2)
+#define MAC_CMDCFG_ETH_SPEED                   BIT(3)
+#define MAC_CMDCFG_PROMIS_EN                   BIT(4)
+#define MAC_CMDCFG_PAD_EN                      BIT(5)
+#define MAC_CMDCFG_CRC_FWD                     BIT(6)
+#define MAC_CMDCFG_PAUSE_FWD                   BIT(7)
+#define MAC_CMDCFG_PAUSE_IGNORE                        BIT(8)
+#define MAC_CMDCFG_TX_ADDR_INS                 BIT(9)
+#define MAC_CMDCFG_HD_ENA                      BIT(10)
+#define MAC_CMDCFG_EXCESS_COL                  BIT(11)
+#define MAC_CMDCFG_LATE_COL                    BIT(12)
+#define MAC_CMDCFG_SW_RESET                    BIT(13)
+#define MAC_CMDCFG_MHASH_SEL                   BIT(14)
+#define MAC_CMDCFG_LOOP_ENA                    BIT(15)
+#define MAC_CMDCFG_TX_ADDR_SEL(v)              (((v) & 0x7) << 16)
+#define MAC_CMDCFG_MAGIC_ENA                   BIT(19)
+#define MAC_CMDCFG_SLEEP                       BIT(20)
+#define MAC_CMDCFG_WAKEUP                      BIT(21)
+#define MAC_CMDCFG_XOFF_GEN                    BIT(22)
+#define MAC_CMDCFG_CNTL_FRM_ENA                        BIT(23)
+#define MAC_CMDCFG_NO_LGTH_CHECK               BIT(24)
+#define MAC_CMDCFG_ENA_10                      BIT(25)
+#define MAC_CMDCFG_RX_ERR_DISC                 BIT(26)
+#define MAC_CMDCFG_DISABLE_READ_TIMEOUT                BIT(27)
+#define MAC_CMDCFG_CNT_RESET                   BIT(31)
+
+#define MAC_CMDCFG_TX_ENA_GET(v)               GET_BIT_VALUE(v, 0)
+#define MAC_CMDCFG_RX_ENA_GET(v)               GET_BIT_VALUE(v, 1)
+#define MAC_CMDCFG_XON_GEN_GET(v)              GET_BIT_VALUE(v, 2)
+#define MAC_CMDCFG_ETH_SPEED_GET(v)            GET_BIT_VALUE(v, 3)
+#define MAC_CMDCFG_PROMIS_EN_GET(v)            GET_BIT_VALUE(v, 4)
+#define MAC_CMDCFG_PAD_EN_GET(v)               GET_BIT_VALUE(v, 5)
+#define MAC_CMDCFG_CRC_FWD_GET(v)              GET_BIT_VALUE(v, 6)
+#define MAC_CMDCFG_PAUSE_FWD_GET(v)            GET_BIT_VALUE(v, 7)
+#define MAC_CMDCFG_PAUSE_IGNORE_GET(v)         GET_BIT_VALUE(v, 8)
+#define MAC_CMDCFG_TX_ADDR_INS_GET(v)          GET_BIT_VALUE(v, 9)
+#define MAC_CMDCFG_HD_ENA_GET(v)               GET_BIT_VALUE(v, 10)
+#define MAC_CMDCFG_EXCESS_COL_GET(v)           GET_BIT_VALUE(v, 11)
+#define MAC_CMDCFG_LATE_COL_GET(v)             GET_BIT_VALUE(v, 12)
+#define MAC_CMDCFG_SW_RESET_GET(v)             GET_BIT_VALUE(v, 13)
+#define MAC_CMDCFG_MHASH_SEL_GET(v)            GET_BIT_VALUE(v, 14)
+#define MAC_CMDCFG_LOOP_ENA_GET(v)             GET_BIT_VALUE(v, 15)
+#define MAC_CMDCFG_TX_ADDR_SEL_GET(v)          (((v) >> 16) & 0x7)
+#define MAC_CMDCFG_MAGIC_ENA_GET(v)            GET_BIT_VALUE(v, 19)
+#define MAC_CMDCFG_SLEEP_GET(v)                        GET_BIT_VALUE(v, 20)
+#define MAC_CMDCFG_WAKEUP_GET(v)               GET_BIT_VALUE(v, 21)
+#define MAC_CMDCFG_XOFF_GEN_GET(v)             GET_BIT_VALUE(v, 22)
+#define MAC_CMDCFG_CNTL_FRM_ENA_GET(v)         GET_BIT_VALUE(v, 23)
+#define MAC_CMDCFG_NO_LGTH_CHECK_GET(v)                GET_BIT_VALUE(v, 24)
+#define MAC_CMDCFG_ENA_10_GET(v)               GET_BIT_VALUE(v, 25)
+#define MAC_CMDCFG_RX_ERR_DISC_GET(v)          GET_BIT_VALUE(v, 26)
+#define MAC_CMDCFG_DISABLE_READ_TIMEOUT_GET(v) GET_BIT_VALUE(v, 27)
+#define MAC_CMDCFG_CNT_RESET_GET(v)            GET_BIT_VALUE(v, 31)
+
+/* MDIO registers within MAC register Space
+ */
+struct altera_tse_mdio {
+       u32 control;    /* PHY device operation control register */
+       u32 status;     /* PHY device operation status register */
+       u32 phy_id1;    /* Bits 31:16 of PHY identifier */
+       u32 phy_id2;    /* Bits 15:0 of PHY identifier */
+       u32 auto_negotiation_advertisement;     /* Auto-negotiation
+                                                        * advertisement
+                                                        * register
+                                                        */
+       u32 remote_partner_base_page_ability;
+
+       u32 reg6;
+       u32 reg7;
+       u32 reg8;
+       u32 reg9;
+       u32 rega;
+       u32 regb;
+       u32 regc;
+       u32 regd;
+       u32 rege;
+       u32 regf;
+       u32 reg10;
+       u32 reg11;
+       u32 reg12;
+       u32 reg13;
+       u32 reg14;
+       u32 reg15;
+       u32 reg16;
+       u32 reg17;
+       u32 reg18;
+       u32 reg19;
+       u32 reg1a;
+       u32 reg1b;
+       u32 reg1c;
+       u32 reg1d;
+       u32 reg1e;
+       u32 reg1f;
+};
+
+/* MAC register Space. Note that some of these registers may or may not be
+ * present depending upon options chosen by the user when the core was
+ * configured and built. Please consult the Altera Triple Speed Ethernet User
+ * Guide for details.
+ */
+struct altera_tse_mac {
+       /* Bits 15:0: MegaCore function revision (0x0800). Bit 31:16: Customer
+        * specific revision
+        */
+       u32 megacore_revision;
+       /* Provides a memory location for user applications to test the device
+        * memory operation.
+        */
+       u32 scratch_pad;
+       /* The host processor uses this register to control and configure the
+        * MAC block
+        */
+       u32 command_config;
+       /* 32-bit primary MAC address word 0 bits 0 to 31 of the primary
+        * MAC address
+        */
+       u32 mac_addr_0;
+       /* 32-bit primary MAC address word 1 bits 32 to 47 of the primary
+        * MAC address
+        */
+       u32 mac_addr_1;
+       /* 14-bit maximum frame length. The MAC receive logic */
+       u32 frm_length;
+       /* The pause quanta is used in each pause frame sent to a remote
+        * Ethernet device, in increments of 512 Ethernet bit times
+        */
+       u32 pause_quanta;
+       /* 12-bit receive FIFO section-empty threshold */
+       u32 rx_section_empty;
+       /* 12-bit receive FIFO section-full threshold */
+       u32 rx_section_full;
+       /* 12-bit transmit FIFO section-empty threshold */
+       u32 tx_section_empty;
+       /* 12-bit transmit FIFO section-full threshold */
+       u32 tx_section_full;
+       /* 12-bit receive FIFO almost-empty threshold */
+       u32 rx_almost_empty;
+       /* 12-bit receive FIFO almost-full threshold */
+       u32 rx_almost_full;
+       /* 12-bit transmit FIFO almost-empty threshold */
+       u32 tx_almost_empty;
+       /* 12-bit transmit FIFO almost-full threshold */
+       u32 tx_almost_full;
+       /* MDIO address of PHY Device 0. Bits 0 to 4 hold a 5-bit PHY address */
+       u32 mdio_phy0_addr;
+       /* MDIO address of PHY Device 1. Bits 0 to 4 hold a 5-bit PHY address */
+       u32 mdio_phy1_addr;
+
+       /* Bits [15:0]: 16-bit holdoff quanta */
+       u32 holdoff_quant;
+
+       /* only if 100/1000 BaseX PCS, reserved otherwise */
+       u32 reserved1[5];
+
+       /* Minimum IPG between consecutive transmit frame in terms of bytes */
+       u32 tx_ipg_length;
+
+       /* IEEE 802.3 oEntity Managed Object Support */
+
+       /* The MAC addresses */
+       u32 mac_id_1;
+       u32 mac_id_2;
+
+       /* Number of frames transmitted without error including pause frames */
+       u32 frames_transmitted_ok;
+       /* Number of frames received without error including pause frames */
+       u32 frames_received_ok;
+       /* Number of frames received with a CRC error */
+       u32 frames_check_sequence_errors;
+       /* Frame received with an alignment error */
+       u32 alignment_errors;
+       /* Sum of payload and padding octets of frames transmitted without
+        * error
+        */
+       u32 octets_transmitted_ok;
+       /* Sum of payload and padding octets of frames received without error */
+       u32 octets_received_ok;
+
+       /* IEEE 802.3 oPausedEntity Managed Object Support */
+
+       /* Number of transmitted pause frames */
+       u32 tx_pause_mac_ctrl_frames;
+       /* Number of Received pause frames */
+       u32 rx_pause_mac_ctrl_frames;
+
+       /* IETF MIB (MIB-II) Object Support */
+
+       /* Number of frames received with error */
+       u32 if_in_errors;
+       /* Number of frames transmitted with error */
+       u32 if_out_errors;
+       /* Number of valid received unicast frames */
+       u32 if_in_ucast_pkts;
+       /* Number of valid received multicasts frames (without pause) */
+       u32 if_in_multicast_pkts;
+       /* Number of valid received broadcast frames */
+       u32 if_in_broadcast_pkts;
+       u32 if_out_discards;
+       /* The number of valid unicast frames transmitted */
+       u32 if_out_ucast_pkts;
+       /* The number of valid multicast frames transmitted,
+        * excluding pause frames
+        */
+       u32 if_out_multicast_pkts;
+       u32 if_out_broadcast_pkts;
+
+       /* IETF RMON MIB Object Support */
+
+       /* Counts the number of dropped packets due to internal errors
+        * of the MAC client.
+        */
+       u32 ether_stats_drop_events;
+       /* Total number of bytes received. Good and bad frames. */
+       u32 ether_stats_octets;
+       /* Total number of packets received. Counts good and bad packets. */
+       u32 ether_stats_pkts;
+       /* Number of packets received with less than 64 bytes. */
+       u32 ether_stats_undersize_pkts;
+       /* The number of frames received that are longer than the
+        * value configured in the frm_length register
+        */
+       u32 ether_stats_oversize_pkts;
+       /* Number of received packet with 64 bytes */
+       u32 ether_stats_pkts_64_octets;
+       /* Frames (good and bad) with 65 to 127 bytes */
+       u32 ether_stats_pkts_65to127_octets;
+       /* Frames (good and bad) with 128 to 255 bytes */
+       u32 ether_stats_pkts_128to255_octets;
+       /* Frames (good and bad) with 256 to 511 bytes */
+       u32 ether_stats_pkts_256to511_octets;
+       /* Frames (good and bad) with 512 to 1023 bytes */
+       u32 ether_stats_pkts_512to1023_octets;
+       /* Frames (good and bad) with 1024 to 1518 bytes */
+       u32 ether_stats_pkts_1024to1518_octets;
+
+       /* Any frame length from 1519 to the maximum length configured in the
+        * frm_length register, if it is greater than 1518
+        */
+       u32 ether_stats_pkts_1519tox_octets;
+       /* Too long frames with CRC error */
+       u32 ether_stats_jabbers;
+       /* Too short frames with CRC error */
+       u32 ether_stats_fragments;
+
+       u32 reserved2;
+
+       /* FIFO control register */
+       u32 tx_cmd_stat;
+       u32 rx_cmd_stat;
+
+       /* Extended Statistics Counters */
+       u32 msb_octets_transmitted_ok;
+       u32 msb_octets_received_ok;
+       u32 msb_ether_stats_octets;
+
+       u32 reserved3;
+
+       /* Multicast address resolution table, mapped in the controller address
+        * space
+        */
+       u32 hash_table[64];
+
+       /* Registers 0 to 31 within PHY device 0/1 connected to the MDIO PHY
+        * management interface
+        */
+       struct altera_tse_mdio mdio_phy0;
+       struct altera_tse_mdio mdio_phy1;
+
+       /* 4 Supplemental MAC Addresses */
+       u32 supp_mac_addr_0_0;
+       u32 supp_mac_addr_0_1;
+       u32 supp_mac_addr_1_0;
+       u32 supp_mac_addr_1_1;
+       u32 supp_mac_addr_2_0;
+       u32 supp_mac_addr_2_1;
+       u32 supp_mac_addr_3_0;
+       u32 supp_mac_addr_3_1;
+
+       u32 reserved4[8];
+
+       /* IEEE 1588v2 Feature */
+       u32 tx_period;
+       u32 tx_adjust_fns;
+       u32 tx_adjust_ns;
+       u32 rx_period;
+       u32 rx_adjust_fns;
+       u32 rx_adjust_ns;
+
+       u32 reserved5[42];
+};
+
+/* Transmit and Receive Command Registers Bit Definitions
+ */
+#define ALTERA_TSE_TX_CMD_STAT_OMIT_CRC                BIT(17)
+#define ALTERA_TSE_TX_CMD_STAT_TX_SHIFT16      BIT(18)
+#define ALTERA_TSE_RX_CMD_STAT_RX_SHIFT16      BIT(25)
+
+/* Wrapper around a pointer to a socket buffer,
+ * so a DMA handle can be stored along with the buffer
+ */
+struct tse_buffer {
+       struct list_head lh;
+       struct sk_buff *skb;
+       dma_addr_t dma_addr;
+       u32 len;
+       int mapped_as_page;
+};
+
+struct altera_tse_private;
+
+#define ALTERA_DTYPE_SGDMA 1
+#define ALTERA_DTYPE_MSGDMA 2
+
+/* standard DMA interface for SGDMA and MSGDMA */
+struct altera_dmaops {
+       int altera_dtype;
+       int dmamask;
+       void (*reset_dma)(struct altera_tse_private *);
+       void (*enable_txirq)(struct altera_tse_private *);
+       void (*enable_rxirq)(struct altera_tse_private *);
+       void (*disable_txirq)(struct altera_tse_private *);
+       void (*disable_rxirq)(struct altera_tse_private *);
+       void (*clear_txirq)(struct altera_tse_private *);
+       void (*clear_rxirq)(struct altera_tse_private *);
+       int (*tx_buffer)(struct altera_tse_private *, struct tse_buffer *);
+       u32 (*tx_completions)(struct altera_tse_private *);
+       int (*add_rx_desc)(struct altera_tse_private *, struct tse_buffer *);
+       u32 (*get_rx_status)(struct altera_tse_private *);
+       int (*init_dma)(struct altera_tse_private *);
+       void (*uninit_dma)(struct altera_tse_private *);
+};
+
+/* This structure is private to each device.
+ */
+struct altera_tse_private {
+       struct net_device *dev;
+       struct device *device;
+       struct napi_struct napi;
+
+       /* MAC address space */
+       struct altera_tse_mac __iomem *mac_dev;
+
+       /* TSE Revision */
+       u32     revision;
+
+       /* mSGDMA Rx Dispatcher address space */
+       void __iomem *rx_dma_csr;
+       void __iomem *rx_dma_desc;
+       void __iomem *rx_dma_resp;
+
+       /* mSGDMA Tx Dispatcher address space */
+       void __iomem *tx_dma_csr;
+       void __iomem *tx_dma_desc;
+
+       /* Rx buffers queue */
+       struct tse_buffer *rx_ring;
+       u32 rx_cons;
+       u32 rx_prod;
+       u32 rx_ring_size;
+       u32 rx_dma_buf_sz;
+
+       /* Tx ring buffer */
+       struct tse_buffer *tx_ring;
+       u32 tx_prod;
+       u32 tx_cons;
+       u32 tx_ring_size;
+
+       /* Interrupts */
+       u32 tx_irq;
+       u32 rx_irq;
+
+       /* RX/TX MAC FIFO configs */
+       u32 tx_fifo_depth;
+       u32 rx_fifo_depth;
+       u32 max_mtu;
+
+       /* Hash filter settings */
+       u32 hash_filter;
+       u32 added_unicast;
+
+       /* Descriptor memory info for managing SGDMA */
+       u32 txdescmem;
+       u32 rxdescmem;
+       dma_addr_t rxdescmem_busaddr;
+       dma_addr_t txdescmem_busaddr;
+       u32 txctrlreg;
+       u32 rxctrlreg;
+       dma_addr_t rxdescphys;
+       dma_addr_t txdescphys;
+
+       struct list_head txlisthd;
+       struct list_head rxlisthd;
+
+       /* MAC command_config register protection */
+       spinlock_t mac_cfg_lock;
+       /* Tx path protection */
+       spinlock_t tx_lock;
+       /* Rx DMA & interrupt control protection */
+       spinlock_t rxdma_irq_lock;
+
+       /* PHY */
+       int phy_addr;           /* PHY's MDIO address, -1 for autodetection */
+       phy_interface_t phy_iface;
+       struct mii_bus *mdio;
+       struct phy_device *phydev;
+       int oldspeed;
+       int oldduplex;
+       int oldlink;
+
+       /* ethtool msglvl option */
+       u32 msg_enable;
+
+       struct altera_dmaops *dmaops;
+};
+
+/* Function prototypes
+ */
+void altera_tse_set_ethtool_ops(struct net_device *);
+
+#endif /* __ALTERA_TSE_H__ */
diff --git a/drivers/net/ethernet/altera/altera_tse_ethtool.c b/drivers/net/ethernet/altera/altera_tse_ethtool.c
new file mode 100644 (file)
index 0000000..63ac5f4
--- /dev/null
@@ -0,0 +1,227 @@
+/* Ethtool support for Altera Triple-Speed Ethernet MAC driver
+ * Copyright (C) 2008-2014 Altera Corporation. All rights reserved
+ *
+ * Contributors:
+ *   Dalon Westergreen
+ *   Thomas Chou
+ *   Ian Abbott
+ *   Yuriy Kozlov
+ *   Tobias Klauser
+ *   Andriy Smolskyy
+ *   Roman Bulgakov
+ *   Dmytro Mytarchuk
+ *
+ * Original driver contributed by SLS.
+ * Major updates contributed by GlobalLogic
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/ethtool.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+
+#include "altera_tse.h"
+
+#define TSE_STATS_LEN  31
+#define TSE_NUM_REGS   128
+
+/* ethtool statistics names. The order of entries here must match the
+ * order in which tse_fill_stats() writes counters into its buffer
+ * (TSE_STATS_LEN entries total).
+ * NOTE(review): the last entry is named "rx_runts" but tse_fill_stats()
+ * fills that slot from the ether_stats_fragments counter — confirm the
+ * label is intended.
+ */
+static char const stat_gstrings[][ETH_GSTRING_LEN] = {
+       "tx_packets",
+       "rx_packets",
+       "rx_crc_errors",
+       "rx_align_errors",
+       "tx_bytes",
+       "rx_bytes",
+       "tx_pause",
+       "rx_pause",
+       "rx_errors",
+       "tx_errors",
+       "rx_unicast",
+       "rx_multicast",
+       "rx_broadcast",
+       "tx_discards",
+       "tx_unicast",
+       "tx_multicast",
+       "tx_broadcast",
+       "ether_drops",
+       "rx_total_bytes",
+       "rx_total_packets",
+       "rx_undersize",
+       "rx_oversize",
+       "rx_64_bytes",
+       "rx_65_127_bytes",
+       "rx_128_255_bytes",
+       "rx_256_511_bytes",
+       "rx_512_1023_bytes",
+       "rx_1024_1518_bytes",
+       "rx_gte_1519_bytes",
+       "rx_jabbers",
+       "rx_runts",
+};
+
+/* ethtool get_drvinfo: report driver name/version and expose the MAC
+ * megacore revision register as the "firmware" version.
+ */
+static void tse_get_drvinfo(struct net_device *dev,
+                           struct ethtool_drvinfo *info)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       u32 rev = ioread32(&priv->mac_dev->megacore_revision);
+
+       /* Use bounded copies into the fixed-size ethtool_drvinfo fields;
+        * strcpy/sprintf would overflow silently if the strings ever grow.
+        */
+       strlcpy(info->driver, "Altera TSE MAC IP Driver", sizeof(info->driver));
+       strlcpy(info->version, "v8.0", sizeof(info->version));
+       snprintf(info->fw_version, ETHTOOL_FWVERS_LEN, "v%d.%d",
+                rev & 0xFFFF, (rev & 0xFFFF0000) >> 16);
+       snprintf(info->bus_info, sizeof(info->bus_info), "platform");
+}
+
+/* Fill in a buffer with the strings which correspond to the
+ * stats
+ *
+ * The stringset argument is ignored: ETH_SS_STATS is the only set for
+ * which tse_sset_count() reports a non-error count.
+ */
+static void tse_gstrings(struct net_device *dev, u32 stringset, u8 *buf)
+{
+       memcpy(buf, stat_gstrings, TSE_STATS_LEN * ETH_GSTRING_LEN);
+}
+
+/* Copy TSE_STATS_LEN hardware counters into buf, in the same order as
+ * the names in stat_gstrings[]. Plain counters are single 32-bit
+ * register reads; the octet counters are assembled from msb/lsb
+ * register pairs into 64-bit values.
+ */
+static void tse_fill_stats(struct net_device *dev, struct ethtool_stats *dummy,
+                          u64 *buf)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       struct altera_tse_mac *mac = priv->mac_dev;
+       u64 ext;
+
+       buf[0] = ioread32(&mac->frames_transmitted_ok);
+       buf[1] = ioread32(&mac->frames_received_ok);
+       buf[2] = ioread32(&mac->frames_check_sequence_errors);
+       buf[3] = ioread32(&mac->alignment_errors);
+
+       /* Extended aOctetsTransmittedOK counter */
+       ext = (u64) ioread32(&mac->msb_octets_transmitted_ok) << 32;
+       ext |= ioread32(&mac->octets_transmitted_ok);
+       buf[4] = ext;
+
+       /* Extended aOctetsReceivedOK counter */
+       ext = (u64) ioread32(&mac->msb_octets_received_ok) << 32;
+       ext |= ioread32(&mac->octets_received_ok);
+       buf[5] = ext;
+
+       buf[6] = ioread32(&mac->tx_pause_mac_ctrl_frames);
+       buf[7] = ioread32(&mac->rx_pause_mac_ctrl_frames);
+       buf[8] = ioread32(&mac->if_in_errors);
+       buf[9] = ioread32(&mac->if_out_errors);
+       buf[10] = ioread32(&mac->if_in_ucast_pkts);
+       buf[11] = ioread32(&mac->if_in_multicast_pkts);
+       buf[12] = ioread32(&mac->if_in_broadcast_pkts);
+       buf[13] = ioread32(&mac->if_out_discards);
+       buf[14] = ioread32(&mac->if_out_ucast_pkts);
+       buf[15] = ioread32(&mac->if_out_multicast_pkts);
+       buf[16] = ioread32(&mac->if_out_broadcast_pkts);
+       buf[17] = ioread32(&mac->ether_stats_drop_events);
+
+       /* Extended etherStatsOctets counter */
+       ext = (u64) ioread32(&mac->msb_ether_stats_octets) << 32;
+       ext |= ioread32(&mac->ether_stats_octets);
+       buf[18] = ext;
+
+       buf[19] = ioread32(&mac->ether_stats_pkts);
+       buf[20] = ioread32(&mac->ether_stats_undersize_pkts);
+       buf[21] = ioread32(&mac->ether_stats_oversize_pkts);
+       buf[22] = ioread32(&mac->ether_stats_pkts_64_octets);
+       buf[23] = ioread32(&mac->ether_stats_pkts_65to127_octets);
+       buf[24] = ioread32(&mac->ether_stats_pkts_128to255_octets);
+       buf[25] = ioread32(&mac->ether_stats_pkts_256to511_octets);
+       buf[26] = ioread32(&mac->ether_stats_pkts_512to1023_octets);
+       buf[27] = ioread32(&mac->ether_stats_pkts_1024to1518_octets);
+       buf[28] = ioread32(&mac->ether_stats_pkts_1519tox_octets);
+       buf[29] = ioread32(&mac->ether_stats_jabbers);
+       /* NOTE(review): slot 30 is labeled "rx_runts" in stat_gstrings[]
+        * but reads the fragments counter — confirm intended.
+        */
+       buf[30] = ioread32(&mac->ether_stats_fragments);
+}
+
+/* Report the number of strings in a string set; only the statistics
+ * set is supported.
+ */
+static int tse_sset_count(struct net_device *dev, int sset)
+{
+       if (sset == ETH_SS_STATS)
+               return TSE_STATS_LEN;
+
+       return -EOPNOTSUPP;
+}
+
+/* ethtool: return the driver's current message-level bitmask */
+static u32 tse_get_msglevel(struct net_device *dev)
+{
+       struct altera_tse_private *tse_priv = netdev_priv(dev);
+
+       return tse_priv->msg_enable;
+}
+
+/* ethtool: store a new message-level bitmask */
+static void tse_set_msglevel(struct net_device *dev, uint32_t data)
+{
+       struct altera_tse_private *tse_priv = netdev_priv(dev);
+
+       tse_priv->msg_enable = data;
+}
+
+/* Size in bytes of the register dump produced by tse_get_regs() */
+static int tse_reglen(struct net_device *dev)
+{
+       return sizeof(u32) * TSE_NUM_REGS;
+}
+
+/* Dump the first TSE_NUM_REGS 32-bit words of the MAC register space
+ * into regbuf for "ethtool -d". The MAC block is treated as a flat
+ * array of u32 registers starting at priv->mac_dev.
+ */
+static void tse_get_regs(struct net_device *dev, struct ethtool_regs *regs,
+                        void *regbuf)
+{
+       int i;
+       struct altera_tse_private *priv = netdev_priv(dev);
+       u32 *tse_mac_regs = (u32 *)priv->mac_dev;
+       u32 *buf = regbuf;
+
+       for (i = 0; i < TSE_NUM_REGS; i++)
+               buf[i] = ioread32(&tse_mac_regs[i]);
+}
+
+/* ethtool get_settings: delegate to the attached PHY, if any */
+static int tse_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+
+       if (priv->phydev == NULL)
+               return -ENODEV;
+
+       return phy_ethtool_gset(priv->phydev, cmd);
+}
+
+/* ethtool set_settings: delegate to the attached PHY, if any */
+static int tse_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+
+       if (priv->phydev == NULL)
+               return -ENODEV;
+
+       return phy_ethtool_sset(priv->phydev, cmd);
+}
+
+/* ethtool operations table wired up in altera_tse_set_ethtool_ops() */
+static const struct ethtool_ops tse_ethtool_ops = {
+       .get_drvinfo = tse_get_drvinfo,
+       .get_regs_len = tse_reglen,
+       .get_regs = tse_get_regs,
+       .get_link = ethtool_op_get_link,
+       .get_settings = tse_get_settings,
+       .set_settings = tse_set_settings,
+       .get_strings = tse_gstrings,
+       .get_sset_count = tse_sset_count,
+       .get_ethtool_stats = tse_fill_stats,
+       .get_msglevel = tse_get_msglevel,
+       .set_msglevel = tse_set_msglevel,
+};
+
+/* Attach the TSE ethtool operations to a freshly-created netdev;
+ * called from the main driver during probe.
+ */
+void altera_tse_set_ethtool_ops(struct net_device *netdev)
+{
+       SET_ETHTOOL_OPS(netdev, &tse_ethtool_ops);
+}
diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c
new file mode 100644 (file)
index 0000000..6006ef2
--- /dev/null
@@ -0,0 +1,1543 @@
+/* Altera Triple-Speed Ethernet MAC driver
+ * Copyright (C) 2008-2014 Altera Corporation. All rights reserved
+ *
+ * Contributors:
+ *   Dalon Westergreen
+ *   Thomas Chou
+ *   Ian Abbott
+ *   Yuriy Kozlov
+ *   Tobias Klauser
+ *   Andriy Smolskyy
+ *   Roman Bulgakov
+ *   Dmytro Mytarchuk
+ *   Matthew Gerlach
+ *
+ * Original driver contributed by SLS.
+ * Major updates contributed by GlobalLogic
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/atomic.h>
+#include <linux/delay.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/of_device.h>
+#include <linux/of_mdio.h>
+#include <linux/of_net.h>
+#include <linux/of_platform.h>
+#include <linux/phy.h>
+#include <linux/platform_device.h>
+#include <linux/skbuff.h>
+#include <asm/cacheflush.h>
+
+#include "altera_utils.h"
+#include "altera_tse.h"
+#include "altera_sgdma.h"
+#include "altera_msgdma.h"
+
+/* Starts at ~0 so the first atomic increment-and-return yields 0;
+ * presumably used to number device/MDIO-bus instances — confirm at
+ * the probe path.
+ */
+static atomic_t instance_count = ATOMIC_INIT(~0);
+/* Module parameters */
+static int debug = -1;
+module_param(debug, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug, "Message Level (-1: default, 0: no output, 16: all)");
+
+static const u32 default_msg_level = (NETIF_MSG_DRV | NETIF_MSG_PROBE |
+                                       NETIF_MSG_LINK | NETIF_MSG_IFUP |
+                                       NETIF_MSG_IFDOWN);
+
+#define RX_DESCRIPTORS 64
+static int dma_rx_num = RX_DESCRIPTORS;
+module_param(dma_rx_num, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dma_rx_num, "Number of descriptors in the RX list");
+
+#define TX_DESCRIPTORS 64
+static int dma_tx_num = TX_DESCRIPTORS;
+module_param(dma_tx_num, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dma_tx_num, "Number of descriptors in the TX list");
+
+
+/* Sentinel for priv->phy_addr: scan the MDIO bus for the first PHY
+ * instead of using a fixed address (see connect_local_phy()).
+ */
+#define POLL_PHY (-1)
+
+/* Make sure DMA buffer size is larger than the max frame size
+ * plus some alignment offset and a VLAN header. If the max frame size is
+ * 1518, a VLAN header would be additional 4 bytes and additional
+ * headroom for alignment is 2 bytes, 2048 is just fine.
+ */
+#define ALTERA_RXDMABUFFER_SIZE        2048
+
+/* Allow network stack to resume queueing packets after we've
+ * finished transmitting at least 1/4 of the packets in the queue.
+ */
+#define TSE_TX_THRESH(x)       (x->tx_ring_size / 4)
+
+#define TXQUEUESTOP_THRESHHOLD 2
+
+/* Forward declaration; the table is defined near the driver's
+ * platform-device registration at the bottom of this file.
+ */
+static struct of_device_id altera_tse_ids[];
+
+/* Number of free slots in the Tx ring. One slot is deliberately kept
+ * unused (the "- 1") so that tx_prod == tx_cons unambiguously means
+ * "ring empty".
+ */
+static inline u32 tse_tx_avail(struct altera_tse_private *priv)
+{
+       return priv->tx_cons + priv->tx_ring_size - priv->tx_prod - 1;
+}
+
+/* MDIO specific functions
+ */
+/* Read a PHY register: latch the PHY address into mdio_phy0_addr, then
+ * read the register from the MAC's memory-mapped MDIO window (one
+ * 32-bit word per MDIO register; the low 16 bits carry the data).
+ */
+static int altera_tse_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
+{
+       struct altera_tse_mac *mac = (struct altera_tse_mac *)bus->priv;
+       unsigned int *mdio_regs = (unsigned int *)&mac->mdio_phy0;
+       u32 data;
+
+       /* set MDIO address */
+       iowrite32((mii_id & 0x1f), &mac->mdio_phy0_addr);
+
+       /* get the data */
+       data = ioread32(&mdio_regs[regnum]) & 0xffff;
+       return data;
+}
+
+/* Write a PHY register: latch the PHY address into mdio_phy0_addr,
+ * then write the 16-bit value into the corresponding word of the
+ * MAC's memory-mapped MDIO window. Always returns 0.
+ */
+static int altera_tse_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
+                                u16 value)
+{
+       struct altera_tse_mac *mac = (struct altera_tse_mac *)bus->priv;
+       unsigned int *mdio_regs = (unsigned int *)&mac->mdio_phy0;
+
+       /* set MDIO address */
+       iowrite32((mii_id & 0x1f), &mac->mdio_phy0_addr);
+
+       /* write the data */
+       iowrite32((u32) value, &mdio_regs[regnum]);
+       return 0;
+}
+
+/* Create and register an MDIO bus for the device, if the device tree
+ * declares an "altr,tse-mdio" subnode. Returns 0 when no such subnode
+ * exists (not an error), 0 on success, or a negative errno; on success
+ * the bus is stored in priv->mdio. Errors unwind with goto cleanup.
+ */
+static int altera_tse_mdio_create(struct net_device *dev, unsigned int id)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       int ret;
+       int i;
+       struct device_node *mdio_node = NULL;
+       struct mii_bus *mdio = NULL;
+       struct device_node *child_node = NULL;
+
+       /* Look for an MDIO subnode of this device's OF node */
+       for_each_child_of_node(priv->device->of_node, child_node) {
+               if (of_device_is_compatible(child_node, "altr,tse-mdio")) {
+                       mdio_node = child_node;
+                       break;
+               }
+       }
+
+       if (mdio_node) {
+               netdev_dbg(dev, "FOUND MDIO subnode\n");
+       } else {
+               netdev_dbg(dev, "NO MDIO subnode\n");
+               return 0;
+       }
+
+       mdio = mdiobus_alloc();
+       if (mdio == NULL) {
+               netdev_err(dev, "Error allocating MDIO bus\n");
+               return -ENOMEM;
+       }
+
+       mdio->name = ALTERA_TSE_RESOURCE_NAME;
+       mdio->read = &altera_tse_mdio_read;
+       mdio->write = &altera_tse_mdio_write;
+       snprintf(mdio->id, MII_BUS_ID_SIZE, "%s-%u", mdio->name, id);
+
+       /* No dedicated PHY interrupt lines: poll every address */
+       mdio->irq = kcalloc(PHY_MAX_ADDR, sizeof(int), GFP_KERNEL);
+       if (mdio->irq == NULL) {
+               ret = -ENOMEM;
+               goto out_free_mdio;
+       }
+       for (i = 0; i < PHY_MAX_ADDR; i++)
+               mdio->irq[i] = PHY_POLL;
+
+       mdio->priv = priv->mac_dev;
+       mdio->parent = priv->device;
+
+       ret = of_mdiobus_register(mdio, mdio_node);
+       if (ret != 0) {
+               netdev_err(dev, "Cannot register MDIO bus %s\n",
+                          mdio->id);
+               goto out_free_mdio_irq;
+       }
+
+       if (netif_msg_drv(priv))
+               netdev_info(dev, "MDIO bus %s: created\n", mdio->id);
+
+       priv->mdio = mdio;
+       return 0;
+out_free_mdio_irq:
+       kfree(mdio->irq);
+out_free_mdio:
+       mdiobus_free(mdio);
+       mdio = NULL;
+       return ret;
+}
+
+/* Tear down the MDIO bus created by altera_tse_mdio_create(); safe to
+ * call when no bus was created (priv->mdio == NULL).
+ */
+static void altera_tse_mdio_destroy(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+
+       if (priv->mdio == NULL)
+               return;
+
+       if (netif_msg_drv(priv))
+               netdev_info(dev, "MDIO bus %s: removed\n",
+                           priv->mdio->id);
+
+       mdiobus_unregister(priv->mdio);
+       kfree(priv->mdio->irq);
+       mdiobus_free(priv->mdio);
+       priv->mdio = NULL;
+}
+
+/* Allocate an skb of the given length for one Rx ring slot and DMA-map
+ * it for device writes. Returns 0 on success, -ENOMEM if the skb
+ * allocation fails, -EINVAL if DMA mapping fails (skb is freed).
+ */
+static int tse_init_rx_buffer(struct altera_tse_private *priv,
+                             struct tse_buffer *rxbuffer, int len)
+{
+       rxbuffer->skb = netdev_alloc_skb_ip_align(priv->dev, len);
+       if (!rxbuffer->skb)
+               return -ENOMEM;
+
+       rxbuffer->dma_addr = dma_map_single(priv->device, rxbuffer->skb->data,
+                                               len,
+                                               DMA_FROM_DEVICE);
+
+       if (dma_mapping_error(priv->device, rxbuffer->dma_addr)) {
+               netdev_err(priv->dev, "%s: DMA mapping error\n", __func__);
+               dev_kfree_skb_any(rxbuffer->skb);
+               return -EINVAL;
+       }
+       rxbuffer->len = len;
+       return 0;
+}
+
+/* Unmap and free one Rx ring slot's skb, if present, and reset the
+ * slot so it can be refilled later.
+ */
+static void tse_free_rx_buffer(struct altera_tse_private *priv,
+                              struct tse_buffer *rxbuffer)
+{
+       struct sk_buff *skb = rxbuffer->skb;
+       dma_addr_t dma_addr = rxbuffer->dma_addr;
+
+       if (skb != NULL) {
+               if (dma_addr)
+                       dma_unmap_single(priv->device, dma_addr,
+                                        rxbuffer->len,
+                                        DMA_FROM_DEVICE);
+               dev_kfree_skb_any(skb);
+               rxbuffer->skb = NULL;
+               rxbuffer->dma_addr = 0;
+       }
+}
+
+/* Unmap and free Tx buffer resources
+ *
+ * Handles both head-fragment mappings (dma_unmap_single) and paged
+ * fragment mappings (dma_unmap_page), then frees the skb if one is
+ * attached; the slot is reset for reuse.
+ */
+static void tse_free_tx_buffer(struct altera_tse_private *priv,
+                              struct tse_buffer *buffer)
+{
+       if (buffer->dma_addr) {
+               if (buffer->mapped_as_page)
+                       dma_unmap_page(priv->device, buffer->dma_addr,
+                                      buffer->len, DMA_TO_DEVICE);
+               else
+                       dma_unmap_single(priv->device, buffer->dma_addr,
+                                        buffer->len, DMA_TO_DEVICE);
+               buffer->dma_addr = 0;
+       }
+       if (buffer->skb) {
+               dev_kfree_skb_any(buffer->skb);
+               buffer->skb = NULL;
+       }
+}
+
+/* Allocate the Rx and Tx ring bookkeeping arrays and pre-fill every
+ * Rx slot with a mapped skb. Returns 0 on success or -ENOMEM /
+ * tse_init_rx_buffer()'s error; partial Rx allocations are unwound
+ * and both rings freed on failure.
+ */
+static int alloc_init_skbufs(struct altera_tse_private *priv)
+{
+       unsigned int rx_descs = priv->rx_ring_size;
+       unsigned int tx_descs = priv->tx_ring_size;
+       int ret = -ENOMEM;
+       int i;
+
+       /* Create Rx ring buffer */
+       priv->rx_ring = kcalloc(rx_descs, sizeof(struct tse_buffer),
+                               GFP_KERNEL);
+       if (!priv->rx_ring)
+               goto err_rx_ring;
+
+       /* Create Tx ring buffer */
+       priv->tx_ring = kcalloc(tx_descs, sizeof(struct tse_buffer),
+                               GFP_KERNEL);
+       if (!priv->tx_ring)
+               goto err_tx_ring;
+
+       priv->tx_cons = 0;
+       priv->tx_prod = 0;
+
+       /* Init Rx ring */
+       for (i = 0; i < rx_descs; i++) {
+               ret = tse_init_rx_buffer(priv, &priv->rx_ring[i],
+                                        priv->rx_dma_buf_sz);
+               if (ret)
+                       goto err_init_rx_buffers;
+       }
+
+       priv->rx_cons = 0;
+       priv->rx_prod = 0;
+
+       return 0;
+err_init_rx_buffers:
+       while (--i >= 0)
+               tse_free_rx_buffer(priv, &priv->rx_ring[i]);
+       kfree(priv->tx_ring);
+err_tx_ring:
+       kfree(priv->rx_ring);
+err_rx_ring:
+       return ret;
+}
+
+/* Release every skb held by the Rx and Tx rings, then free the Tx ring
+ * bookkeeping array.
+ * NOTE(review): priv->rx_ring is allocated in alloc_init_skbufs() but
+ * is not kfree'd here — confirm it is released elsewhere, otherwise it
+ * leaks on every interface close.
+ */
+static void free_skbufs(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       unsigned int rx_descs = priv->rx_ring_size;
+       unsigned int tx_descs = priv->tx_ring_size;
+       int i;
+
+       /* Release the DMA TX/RX socket buffers */
+       for (i = 0; i < rx_descs; i++)
+               tse_free_rx_buffer(priv, &priv->rx_ring[i]);
+       for (i = 0; i < tx_descs; i++)
+               tse_free_tx_buffer(priv, &priv->tx_ring[i]);
+
+
+       kfree(priv->tx_ring);
+}
+
+/* Reallocate the skb for the reception process
+ *
+ * Walks producer slots that the consumer has already drained
+ * (rx_cons > rx_prod), allocating a fresh mapped skb for each empty
+ * slot and handing it back to the DMA engine. Stops early if an skb
+ * allocation or mapping fails; the remaining slots are retried on the
+ * next call.
+ */
+static inline void tse_rx_refill(struct altera_tse_private *priv)
+{
+       unsigned int rxsize = priv->rx_ring_size;
+       unsigned int entry;
+       int ret;
+
+       for (; priv->rx_cons - priv->rx_prod > 0;
+                       priv->rx_prod++) {
+               entry = priv->rx_prod % rxsize;
+               if (likely(priv->rx_ring[entry].skb == NULL)) {
+                       ret = tse_init_rx_buffer(priv, &priv->rx_ring[entry],
+                               priv->rx_dma_buf_sz);
+                       if (unlikely(ret != 0))
+                               break;
+                       priv->dmaops->add_rx_desc(priv, &priv->rx_ring[entry]);
+               }
+       }
+}
+
+/* Pull out the VLAN tag and fix up the packet
+ *
+ * When hardware VLAN Rx offload is advertised and the frame carries a
+ * tag, move the MAC addresses forward over the 4-byte VLAN header,
+ * strip it from the data, and record the tag via hwaccel so the stack
+ * sees an untagged frame with metadata.
+ */
+static inline void tse_rx_vlan(struct net_device *dev, struct sk_buff *skb)
+{
+       struct ethhdr *eth_hdr;
+       u16 vid;
+       if ((dev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
+           !__vlan_get_tag(skb, &vid)) {
+               eth_hdr = (struct ethhdr *)skb->data;
+               memmove(skb->data + VLAN_HLEN, eth_hdr, ETH_ALEN * 2);
+               skb_pull(skb, VLAN_HLEN);
+               __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vid);
+       }
+}
+
+/* Receive a packet: retrieve and pass over to upper levels
+ *
+ * Processes at most "limit" frames (the NAPI budget) and returns the
+ * number handled, then refills the Rx ring.
+ */
+static int tse_rx(struct altera_tse_private *priv, int limit)
+{
+       unsigned int count = 0;
+       unsigned int next_entry;
+       struct sk_buff *skb;
+       unsigned int entry = priv->rx_cons % priv->rx_ring_size;
+       u32 rxstatus;
+       u16 pktlength;
+       u16 pktstatus;
+
+       /* Stop once the NAPI budget is consumed; ignoring "limit" would
+        * allow a single poll cycle to do unbounded work and starve
+        * other devices sharing the softirq.
+        */
+       while ((count < limit) &&
+              ((rxstatus = priv->dmaops->get_rx_status(priv)) != 0)) {
+               pktstatus = rxstatus >> 16;
+               pktlength = rxstatus & 0xffff;
+
+               if ((pktstatus & 0xFF) || (pktlength == 0))
+                       netdev_err(priv->dev,
+                                  "RCV pktstatus %08X pktlength %08X\n",
+                                  pktstatus, pktlength);
+
+               count++;
+               next_entry = (++priv->rx_cons) % priv->rx_ring_size;
+
+               skb = priv->rx_ring[entry].skb;
+               if (unlikely(!skb)) {
+                       netdev_err(priv->dev,
+                                  "%s: Inconsistent Rx descriptor chain\n",
+                                  __func__);
+                       priv->dev->stats.rx_dropped++;
+                       break;
+               }
+               priv->rx_ring[entry].skb = NULL;
+
+               skb_put(skb, pktlength);
+
+               /* make cache consistent with receive packet buffer */
+               dma_sync_single_for_cpu(priv->device,
+                                       priv->rx_ring[entry].dma_addr,
+                                       priv->rx_ring[entry].len,
+                                       DMA_FROM_DEVICE);
+
+               dma_unmap_single(priv->device, priv->rx_ring[entry].dma_addr,
+                                priv->rx_ring[entry].len, DMA_FROM_DEVICE);
+
+               if (netif_msg_pktdata(priv)) {
+                       netdev_info(priv->dev, "frame received %d bytes\n",
+                                   pktlength);
+                       print_hex_dump(KERN_ERR, "data: ", DUMP_PREFIX_OFFSET,
+                                      16, 1, skb->data, pktlength, true);
+               }
+
+               tse_rx_vlan(priv->dev, skb);
+
+               skb->protocol = eth_type_trans(skb, priv->dev);
+               skb_checksum_none_assert(skb);
+
+               napi_gro_receive(&priv->napi, skb);
+
+               priv->dev->stats.rx_packets++;
+               priv->dev->stats.rx_bytes += pktlength;
+
+               entry = next_entry;
+       }
+
+       tse_rx_refill(priv);
+       return count;
+}
+
+/* Reclaim resources after transmission completes
+ *
+ * Under tx_lock: ask the DMA backend how many transmissions finished,
+ * free that many buffers from the consumer side of the Tx ring, and
+ * wake the queue once at least TSE_TX_THRESH slots are free again
+ * (re-checked under netif_tx_lock to avoid racing the xmit path).
+ * Returns the number of buffers reclaimed.
+ */
+static int tse_tx_complete(struct altera_tse_private *priv)
+{
+       unsigned int txsize = priv->tx_ring_size;
+       u32 ready;
+       unsigned int entry;
+       struct tse_buffer *tx_buff;
+       int txcomplete = 0;
+
+       spin_lock(&priv->tx_lock);
+
+       ready = priv->dmaops->tx_completions(priv);
+
+       /* Free sent buffers */
+       while (ready && (priv->tx_cons != priv->tx_prod)) {
+               entry = priv->tx_cons % txsize;
+               tx_buff = &priv->tx_ring[entry];
+
+               if (netif_msg_tx_done(priv))
+                       netdev_dbg(priv->dev, "%s: curr %d, dirty %d\n",
+                                  __func__, priv->tx_prod, priv->tx_cons);
+
+               if (likely(tx_buff->skb))
+                       priv->dev->stats.tx_packets++;
+
+               tse_free_tx_buffer(priv, tx_buff);
+               priv->tx_cons++;
+
+               txcomplete++;
+               ready--;
+       }
+
+       if (unlikely(netif_queue_stopped(priv->dev) &&
+                    tse_tx_avail(priv) > TSE_TX_THRESH(priv))) {
+               netif_tx_lock(priv->dev);
+               if (netif_queue_stopped(priv->dev) &&
+                   tse_tx_avail(priv) > TSE_TX_THRESH(priv)) {
+                       if (netif_msg_tx_done(priv))
+                               netdev_dbg(priv->dev, "%s: restart transmit\n",
+                                          __func__);
+                       netif_wake_queue(priv->dev);
+               }
+               netif_tx_unlock(priv->dev);
+       }
+
+       spin_unlock(&priv->tx_lock);
+       return txcomplete;
+}
+
+/* NAPI polling function
+ *
+ * Reclaim Tx completions, receive up to "budget" packets, and return
+ * the Rx work done. Per the NAPI contract, napi complete and interrupt
+ * re-enable happen if and only if less than the full budget was used;
+ * the original code returned early (without completing NAPI) whenever
+ * any Tx work was done, and counted Tx work against the Rx budget.
+ */
+static int tse_poll(struct napi_struct *napi, int budget)
+{
+       struct altera_tse_private *priv =
+                       container_of(napi, struct altera_tse_private, napi);
+       int rxcomplete = 0;
+       unsigned long int flags;
+
+       tse_tx_complete(priv);
+
+       rxcomplete = tse_rx(priv, budget);
+
+       if (rxcomplete < budget) {
+               napi_gro_flush(napi, false);
+               __napi_complete(napi);
+
+               netdev_dbg(priv->dev,
+                          "NAPI Complete, did %d packets with budget %d\n",
+                          rxcomplete, budget);
+
+               /* Re-arm the DMA interrupts now that polling is done */
+               spin_lock_irqsave(&priv->rxdma_irq_lock, flags);
+               priv->dmaops->enable_rxirq(priv);
+               priv->dmaops->enable_txirq(priv);
+               spin_unlock_irqrestore(&priv->rxdma_irq_lock, flags);
+       }
+       return rxcomplete;
+}
+
+/* DMA TX & RX FIFO interrupt routing
+ *
+ * Shared handler for the Rx and Tx DMA interrupts: mask both sources,
+ * schedule NAPI to do the actual work, and acknowledge the pending
+ * IRQs — all under rxdma_irq_lock so it cannot race tse_poll()'s
+ * re-enable path.
+ */
+static irqreturn_t altera_isr(int irq, void *dev_id)
+{
+       struct net_device *dev = dev_id;
+       struct altera_tse_private *priv;
+       unsigned long int flags;
+
+
+       if (unlikely(!dev)) {
+               pr_err("%s: invalid dev pointer\n", __func__);
+               return IRQ_NONE;
+       }
+       priv = netdev_priv(dev);
+
+       /* turn off desc irqs and enable napi rx */
+       spin_lock_irqsave(&priv->rxdma_irq_lock, flags);
+
+       if (likely(napi_schedule_prep(&priv->napi))) {
+               priv->dmaops->disable_rxirq(priv);
+               priv->dmaops->disable_txirq(priv);
+               __napi_schedule(&priv->napi);
+       }
+
+       /* reset IRQs */
+       priv->dmaops->clear_rxirq(priv);
+       priv->dmaops->clear_txirq(priv);
+
+       spin_unlock_irqrestore(&priv->rxdma_irq_lock, flags);
+
+       return IRQ_HANDLED;
+}
+
+/* Transmit a packet (called by the kernel). Dispatches
+ * either the SGDMA method for transmitting or the
+ * MSGDMA method, assumes no scatter/gather support,
+ * implying an assumption that there's only one
+ * physically contiguous fragment starting at
+ * skb->data, for length of skb_headlen(skb).
+ */
+static int tse_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       unsigned int txsize = priv->tx_ring_size;
+       unsigned int entry;
+       struct tse_buffer *buffer = NULL;
+       int nfrags = skb_shinfo(skb)->nr_frags;
+       unsigned int nopaged_len = skb_headlen(skb);
+       enum netdev_tx ret = NETDEV_TX_OK;
+       dma_addr_t dma_addr;
+
+       spin_lock_bh(&priv->tx_lock);
+
+       if (unlikely(tse_tx_avail(priv) < nfrags + 1)) {
+               if (!netif_queue_stopped(dev)) {
+                       netif_stop_queue(dev);
+                       /* This is a hard error, log it. */
+                       netdev_err(priv->dev,
+                                  "%s: Tx list full when queue awake\n",
+                                  __func__);
+               }
+               ret = NETDEV_TX_BUSY;
+               goto out;
+       }
+
+       /* Map the first skb fragment */
+       entry = priv->tx_prod % txsize;
+       buffer = &priv->tx_ring[entry];
+
+       dma_addr = dma_map_single(priv->device, skb->data, nopaged_len,
+                                 DMA_TO_DEVICE);
+       if (dma_mapping_error(priv->device, dma_addr)) {
+               netdev_err(priv->dev, "%s: DMA mapping error\n", __func__);
+               /* NETDEV_TX_OK tells the stack the skb was consumed, so
+                * it must be freed here or it leaks; the packet is
+                * dropped.
+                */
+               dev_kfree_skb_any(skb);
+               dev->stats.tx_dropped++;
+               ret = NETDEV_TX_OK;
+               goto out;
+       }
+
+       buffer->skb = skb;
+       buffer->dma_addr = dma_addr;
+       buffer->len = nopaged_len;
+
+       /* Push data out of the cache hierarchy into main memory */
+       dma_sync_single_for_device(priv->device, buffer->dma_addr,
+                                  buffer->len, DMA_TO_DEVICE);
+
+       priv->dmaops->tx_buffer(priv, buffer);
+
+       skb_tx_timestamp(skb);
+
+       priv->tx_prod++;
+       dev->stats.tx_bytes += skb->len;
+
+       if (unlikely(tse_tx_avail(priv) <= TXQUEUESTOP_THRESHHOLD)) {
+               if (netif_msg_hw(priv))
+                       netdev_dbg(priv->dev, "%s: stop transmitted packets\n",
+                                  __func__);
+               netif_stop_queue(dev);
+       }
+
+out:
+       spin_unlock_bh(&priv->tx_lock);
+
+       return ret;
+}
+
+/* Called every time the controller might need to be made
+ * aware of new link state.  The PHY code conveys this
+ * information through variables in the phydev structure, and this
+ * function converts those variables into the appropriate
+ * register values, and can bring down the device if needed.
+ */
+static void altera_tse_adjust_link(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       struct phy_device *phydev = priv->phydev;
+       int new_state = 0;
+
+       /* only change config if there is a link */
+       spin_lock(&priv->mac_cfg_lock);
+       if (phydev->link) {
+               /* Read old config */
+               u32 cfg_reg = ioread32(&priv->mac_dev->command_config);
+
+               /* Check duplex */
+               if (phydev->duplex != priv->oldduplex) {
+                       new_state = 1;
+                       if (!(phydev->duplex))
+                               cfg_reg |= MAC_CMDCFG_HD_ENA;
+                       else
+                               cfg_reg &= ~MAC_CMDCFG_HD_ENA;
+
+                       netdev_dbg(priv->dev, "%s: Link duplex = 0x%x\n",
+                                  dev->name, phydev->duplex);
+
+                       priv->oldduplex = phydev->duplex;
+               }
+
+               /* Check speed: program the ETH_SPEED / ENA_10 bit pair
+                * for gigabit, 100M or 10M operation.
+                */
+               if (phydev->speed != priv->oldspeed) {
+                       new_state = 1;
+                       switch (phydev->speed) {
+                       case 1000:
+                               cfg_reg |= MAC_CMDCFG_ETH_SPEED;
+                               cfg_reg &= ~MAC_CMDCFG_ENA_10;
+                               break;
+                       case 100:
+                               cfg_reg &= ~MAC_CMDCFG_ETH_SPEED;
+                               cfg_reg &= ~MAC_CMDCFG_ENA_10;
+                               break;
+                       case 10:
+                               cfg_reg &= ~MAC_CMDCFG_ETH_SPEED;
+                               cfg_reg |= MAC_CMDCFG_ENA_10;
+                               break;
+                       default:
+                               if (netif_msg_link(priv))
+                                       netdev_warn(dev, "Speed (%d) is not 10/100/1000!\n",
+                                                   phydev->speed);
+                               break;
+                       }
+                       priv->oldspeed = phydev->speed;
+               }
+               iowrite32(cfg_reg, &priv->mac_dev->command_config);
+
+               if (!priv->oldlink) {
+                       new_state = 1;
+                       priv->oldlink = 1;
+               }
+       } else if (priv->oldlink) {
+               /* Link went down: reset cached state so the next link-up
+                * reprograms the MAC.
+                */
+               new_state = 1;
+               priv->oldlink = 0;
+               priv->oldspeed = 0;
+               priv->oldduplex = -1;
+       }
+
+       if (new_state && netif_msg_link(priv))
+               phy_print_status(phydev);
+
+       spin_unlock(&priv->mac_cfg_lock);
+}
+/* Attach to a PHY on the locally-created MDIO bus, either at the fixed
+ * address from the device tree or (POLL_PHY) the first PHY found by
+ * scanning the bus. Returns the connected phy_device, or NULL on any
+ * failure — callers test the result with a plain NULL check.
+ */
+static struct phy_device *connect_local_phy(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       struct phy_device *phydev = NULL;
+       char phy_id_fmt[MII_BUS_ID_SIZE + 3];
+       int ret;
+
+       if (priv->phy_addr != POLL_PHY) {
+               snprintf(phy_id_fmt, MII_BUS_ID_SIZE + 3, PHY_ID_FMT,
+                        priv->mdio->id, priv->phy_addr);
+
+               netdev_dbg(dev, "trying to attach to %s\n", phy_id_fmt);
+
+               phydev = phy_connect(dev, phy_id_fmt, &altera_tse_adjust_link,
+                                    priv->phy_iface);
+               if (IS_ERR(phydev)) {
+                       /* phy_connect() returns an ERR_PTR, never NULL;
+                        * normalize to NULL so callers' !phydev checks
+                        * don't let an error pointer through.
+                        */
+                       netdev_err(dev, "Could not attach to PHY\n");
+                       phydev = NULL;
+               }
+       } else {
+               phydev = phy_find_first(priv->mdio);
+               if (phydev == NULL) {
+                       netdev_err(dev, "No PHY found\n");
+                       return phydev;
+               }
+
+               ret = phy_connect_direct(dev, phydev, &altera_tse_adjust_link,
+                               priv->phy_iface);
+               if (ret != 0) {
+                       netdev_err(dev, "Could not attach to PHY\n");
+                       phydev = NULL;
+               }
+       }
+       return phydev;
+}
+
+/* Initialize driver's PHY state, and attach to the PHY
+ *
+ * Prefers an explicit "phy-handle" device tree reference; falls back to
+ * the locally registered MDIO bus when no handle is present. On success
+ * the connected PHY is stored in priv->phydev.
+ *
+ * Returns 0 on success, -ENODEV when no usable PHY can be attached.
+ */
+static int init_phy(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       struct phy_device *phydev;
+       struct device_node *phynode;
+
+       /* No link yet; adjust_link callback will update these. */
+       priv->oldlink = 0;
+       priv->oldspeed = 0;
+       priv->oldduplex = -1;
+
+       phynode = of_parse_phandle(priv->device->of_node, "phy-handle", 0);
+
+       if (!phynode) {
+               netdev_dbg(dev, "no phy-handle found\n");
+               if (!priv->mdio) {
+                       netdev_err(dev,
+                                  "No phy-handle nor local mdio specified\n");
+                       return -ENODEV;
+               }
+               phydev = connect_local_phy(dev);
+       } else {
+               netdev_dbg(dev, "phy-handle found\n");
+               phydev = of_phy_connect(dev, phynode,
+                       &altera_tse_adjust_link, 0, priv->phy_iface);
+       }
+
+       /* Fix: the connect paths may hand back an ERR_PTR value as well as
+        * NULL; catch both so the dereferences below are always safe.
+        */
+       if (IS_ERR_OR_NULL(phydev)) {
+               netdev_err(dev, "Could not find the PHY\n");
+               return -ENODEV;
+       }
+
+       /* Stop Advertising 1000BASE Capability if interface is not GMII
+        * Note: Checkpatch throws CHECKs for the camel case defines below,
+        * it's ok to ignore.
+        */
+       if ((priv->phy_iface == PHY_INTERFACE_MODE_MII) ||
+           (priv->phy_iface == PHY_INTERFACE_MODE_RMII))
+               phydev->advertising &= ~(SUPPORTED_1000baseT_Half |
+                                        SUPPORTED_1000baseT_Full);
+
+       /* Broken HW is sometimes missing the pull-up resistor on the
+        * MDIO line, which results in reads to non-existent devices returning
+        * 0 rather than 0xffff. Catch this here and treat 0 as a non-existent
+        * device as well.
+        * Note: phydev->phy_id is the result of reading the UID PHY registers.
+        */
+       if (phydev->phy_id == 0) {
+               netdev_err(dev, "Bad PHY UID 0x%08x\n", phydev->phy_id);
+               phy_disconnect(phydev);
+               return -ENODEV;
+       }
+
+       netdev_dbg(dev, "attached to PHY %d UID 0x%08x Link = %d\n",
+                  phydev->addr, phydev->phy_id, phydev->link);
+
+       priv->phydev = phydev;
+       return 0;
+}
+
+/* Program the primary station address registers from a 6-octet address.
+ * Octets 0-3 go into mac_addr_0, octets 4-5 into the low half of
+ * mac_addr_1.
+ */
+static void tse_update_mac_addr(struct altera_tse_private *priv, u8 *addr)
+{
+       struct altera_tse_mac *mac = priv->mac_dev;
+       u32 lo_word;
+       u32 hi_word;
+
+       lo_word = addr[0] | (addr[1] << 8) | (addr[2] << 16) | (addr[3] << 24);
+       hi_word = (addr[4] | (addr[5] << 8)) & 0xffff;
+
+       /* Set primary MAC address */
+       iowrite32(lo_word, &mac->mac_addr_0);
+       iowrite32(hi_word, &mac->mac_addr_1);
+}
+
+/* MAC software reset.
+ * When reset is triggered, the MAC function completes the current
+ * transmission or reception, and subsequently disables the transmit and
+ * receive logic, flushes the receive FIFO buffer, and resets the statistics
+ * counters.
+ *
+ * Returns 0 on success, -1 if the SW_RESET bit did not self-clear within
+ * ALTERA_TSE_SW_RESET_WATCHDOG_CNTR polls (callers only test non-zero).
+ */
+static int reset_mac(struct altera_tse_private *priv)
+{
+       void __iomem *cmd_cfg_reg = &priv->mac_dev->command_config;
+       int counter;
+       u32 dat;
+
+       /* Disable Rx/Tx and request a self-clearing SW reset plus a
+        * statistics counter reset.
+        */
+       dat = ioread32(cmd_cfg_reg);
+       dat &= ~(MAC_CMDCFG_TX_ENA | MAC_CMDCFG_RX_ENA);
+       dat |= MAC_CMDCFG_SW_RESET | MAC_CMDCFG_CNT_RESET;
+       iowrite32(dat, cmd_cfg_reg);
+
+       /* Poll for the hardware to clear SW_RESET, 1us per attempt */
+       counter = 0;
+       while (counter++ < ALTERA_TSE_SW_RESET_WATCHDOG_CNTR) {
+               if (tse_bit_is_clear(cmd_cfg_reg, MAC_CMDCFG_SW_RESET))
+                       break;
+               udelay(1);
+       }
+
+       if (counter >= ALTERA_TSE_SW_RESET_WATCHDOG_CNTR) {
+               /* Timed out: withdraw the reset request manually */
+               dat = ioread32(cmd_cfg_reg);
+               dat &= ~MAC_CMDCFG_SW_RESET;
+               iowrite32(dat, cmd_cfg_reg);
+               return -1;
+       }
+       return 0;
+}
+
+/* Initialize MAC core registers
+ *
+ * Programs the Rx/Tx FIFO thresholds from the probed depths, the station
+ * address, the maximum frame length derived from the current MTU, and the
+ * command_config options (padding/CRC stripping, error-frame discard).
+ * Rx and Tx remain disabled here; tse_set_mac() enables them once the
+ * datapath is ready. Always returns 0.
+ */
+static int init_mac(struct altera_tse_private *priv)
+{
+       struct altera_tse_mac *mac = priv->mac_dev;
+       unsigned int cmd = 0;
+       u32 frm_length;
+
+       /* Setup Rx FIFO */
+       iowrite32(priv->rx_fifo_depth - ALTERA_TSE_RX_SECTION_EMPTY,
+                 &mac->rx_section_empty);
+       iowrite32(ALTERA_TSE_RX_SECTION_FULL, &mac->rx_section_full);
+       iowrite32(ALTERA_TSE_RX_ALMOST_EMPTY, &mac->rx_almost_empty);
+       iowrite32(ALTERA_TSE_RX_ALMOST_FULL, &mac->rx_almost_full);
+
+       /* Setup Tx FIFO */
+       iowrite32(priv->tx_fifo_depth - ALTERA_TSE_TX_SECTION_EMPTY,
+                 &mac->tx_section_empty);
+       iowrite32(ALTERA_TSE_TX_SECTION_FULL, &mac->tx_section_full);
+       iowrite32(ALTERA_TSE_TX_ALMOST_EMPTY, &mac->tx_almost_empty);
+       iowrite32(ALTERA_TSE_TX_ALMOST_FULL, &mac->tx_almost_full);
+
+       /* MAC Address Configuration */
+       tse_update_mac_addr(priv, priv->dev->dev_addr);
+
+       /* MAC Function Configuration */
+       frm_length = ETH_HLEN + priv->dev->mtu + ETH_FCS_LEN;
+       iowrite32(frm_length, &mac->frm_length);
+       iowrite32(ALTERA_TSE_TX_IPG_LENGTH, &mac->tx_ipg_length);
+
+       /* Disable RX/TX shift 16 for alignment of all received frames on 16-bit
+        * start address
+        */
+       tse_clear_bit(&mac->rx_cmd_stat, ALTERA_TSE_RX_CMD_STAT_RX_SHIFT16);
+       tse_clear_bit(&mac->tx_cmd_stat, ALTERA_TSE_TX_CMD_STAT_TX_SHIFT16 |
+                                        ALTERA_TSE_TX_CMD_STAT_OMIT_CRC);
+
+       /* Set the MAC options */
+       cmd = ioread32(&mac->command_config);
+       cmd |= MAC_CMDCFG_PAD_EN;       /* Padding Removal on Receive */
+       cmd &= ~MAC_CMDCFG_CRC_FWD;     /* CRC Removal */
+       cmd |= MAC_CMDCFG_RX_ERR_DISC;  /* Automatically discard frames
+                                        * with CRC errors
+                                        */
+       cmd |= MAC_CMDCFG_CNTL_FRM_ENA;
+       cmd &= ~MAC_CMDCFG_TX_ENA;      /* Rx/Tx enabled later by tse_open() */
+       cmd &= ~MAC_CMDCFG_RX_ENA;
+       iowrite32(cmd, &mac->command_config);
+
+       if (netif_msg_hw(priv))
+               dev_dbg(priv->device,
+                       "MAC post-initialization: CMD_CONFIG = 0x%08x\n", cmd);
+
+       return 0;
+}
+
+/* Start/stop MAC transmission logic
+ * Atomically (with respect to the register, not the lock -- callers hold
+ * mac_cfg_lock) flips both the Rx and Tx enable bits together.
+ */
+static void tse_set_mac(struct altera_tse_private *priv, bool enable)
+{
+       struct altera_tse_mac *mac = priv->mac_dev;
+       const u32 rxtx_mask = MAC_CMDCFG_TX_ENA | MAC_CMDCFG_RX_ENA;
+       u32 cmd = ioread32(&mac->command_config);
+
+       if (enable)
+               cmd |= rxtx_mask;
+       else
+               cmd &= ~rxtx_mask;
+
+       iowrite32(cmd, &mac->command_config);
+}
+
+/* Change the MTU
+ *
+ * The interface must be down, and the requested value must lie between the
+ * minimum Ethernet payload (ETH_ZLEN + FCS) and the device's configured
+ * maximum. On success the core is notified so offload features can be
+ * re-evaluated.
+ */
+static int tse_change_mtu(struct net_device *dev, int new_mtu)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       unsigned int min_mtu = ETH_ZLEN + ETH_FCS_LEN;
+       unsigned int max_mtu = priv->max_mtu;
+
+       if (netif_running(dev)) {
+               netdev_err(dev, "must be stopped to change its MTU\n");
+               return -EBUSY;
+       }
+
+       if (new_mtu < min_mtu || new_mtu > max_mtu) {
+               netdev_err(dev, "invalid MTU, max MTU is: %u\n", max_mtu);
+               return -EINVAL;
+       }
+
+       dev->mtu = new_mtu;
+       netdev_update_features(dev);
+
+       return 0;
+}
+
+/* Program the 64-entry multicast hash filter from the device's mc list.
+ *
+ * The hardware hash of an address is a 6-bit value where bit i is the XOR
+ * (parity) of the eight bits of address octet (5 - i). All bins are first
+ * cleared, then one bin is enabled per configured multicast address.
+ * Caller holds priv->mac_cfg_lock.
+ */
+static void altera_tse_set_mcfilter(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       struct altera_tse_mac *mac = (struct altera_tse_mac *)priv->mac_dev;
+       int i;
+       struct netdev_hw_addr *ha;
+
+       /* clear the hash filter */
+       for (i = 0; i < 64; i++)
+               iowrite32(0, &(mac->hash_table[i]));
+
+       netdev_for_each_mc_addr(ha, dev) {
+               unsigned int hash = 0;
+               int mac_octet;
+
+               /* Build the 6-bit bin index, one parity bit per octet,
+                * most significant octet first.
+                */
+               for (mac_octet = 5; mac_octet >= 0; mac_octet--) {
+                       unsigned char xor_bit = 0;
+                       unsigned char octet = ha->addr[mac_octet];
+                       unsigned int bitshift;
+
+                       for (bitshift = 0; bitshift < 8; bitshift++)
+                               xor_bit ^= ((octet >> bitshift) & 0x01);
+
+                       hash = (hash << 1) | xor_bit;
+               }
+               iowrite32(1, &(mac->hash_table[hash]));
+       }
+}
+
+
+/* Accept all multicast traffic by enabling every bin of the 64-entry
+ * hardware hash table. Caller holds priv->mac_cfg_lock.
+ */
+static void altera_tse_set_mcfilterall(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       struct altera_tse_mac *mac = (struct altera_tse_mac *)priv->mac_dev;
+       int entry;
+
+       for (entry = 0; entry < 64; entry++)
+               iowrite32(1, &mac->hash_table[entry]);
+}
+
+/* Set or clear the multicast filter for this adaptor
+ *
+ * Hash-filter variant, used when the hardware multicast hash table is
+ * present. Fix: MAC_CMDCFG_PROMIS_EN must be cleared when IFF_PROMISC is
+ * dropped -- previously the bit was only ever set, so once promiscuous
+ * mode was entered it stuck on and the hash filter never took effect
+ * again.
+ */
+static void tse_set_rx_mode_hashfilter(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       struct altera_tse_mac *mac = priv->mac_dev;
+
+       spin_lock(&priv->mac_cfg_lock);
+
+       if (dev->flags & IFF_PROMISC)
+               tse_set_bit(&mac->command_config, MAC_CMDCFG_PROMIS_EN);
+       else
+               tse_clear_bit(&mac->command_config, MAC_CMDCFG_PROMIS_EN);
+
+       if (dev->flags & IFF_ALLMULTI)
+               altera_tse_set_mcfilterall(dev);
+       else
+               altera_tse_set_mcfilter(dev);
+
+       spin_unlock(&priv->mac_cfg_lock);
+}
+
+/* Set or clear the multicast filter for this adaptor
+ *
+ * No-hash-table variant: anything beyond plain unicast reception
+ * (promisc, allmulti, or any configured multicast/secondary-unicast
+ * addresses) is satisfied by switching the MAC into promiscuous mode.
+ */
+static void tse_set_rx_mode(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       struct altera_tse_mac *mac = priv->mac_dev;
+       bool want_promisc;
+
+       want_promisc = (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) ||
+                      !netdev_mc_empty(dev) || !netdev_uc_empty(dev);
+
+       spin_lock(&priv->mac_cfg_lock);
+
+       if (want_promisc)
+               tse_set_bit(&mac->command_config, MAC_CMDCFG_PROMIS_EN);
+       else
+               tse_clear_bit(&mac->command_config, MAC_CMDCFG_PROMIS_EN);
+
+       spin_unlock(&priv->mac_cfg_lock);
+}
+
+/* Open and initialize the interface
+ *
+ * Bring-up order: DMA init, MAC reset + register init (under mac_cfg_lock),
+ * DMA reset, descriptor/skb ring allocation, Rx/Tx IRQ registration, DMA
+ * interrupt enable and Rx descriptor priming (under rxdma_irq_lock), MAC
+ * Rx/Tx enable, PHY start, then NAPI and the Tx queue. The error labels
+ * unwind in reverse order of acquisition.
+ */
+static int tse_open(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       int ret = 0;
+       int i;
+       unsigned long int flags;
+
+       /* Reset and configure TSE MAC and probe associated PHY */
+       ret = priv->dmaops->init_dma(priv);
+       if (ret != 0) {
+               netdev_err(dev, "Cannot initialize DMA\n");
+               goto phy_error;
+       }
+
+       if (netif_msg_ifup(priv))
+               netdev_warn(dev, "device MAC address %pM\n",
+                           dev->dev_addr);
+
+       /* Unexpected core revisions are logged but not treated as fatal */
+       if ((priv->revision < 0xd00) || (priv->revision > 0xe00))
+               netdev_warn(dev, "TSE revision %x\n", priv->revision);
+
+       spin_lock(&priv->mac_cfg_lock);
+       ret = reset_mac(priv);
+       if (ret)
+               netdev_err(dev, "Cannot reset MAC core (error: %d)\n", ret);
+
+       ret = init_mac(priv);
+       spin_unlock(&priv->mac_cfg_lock);
+       if (ret) {
+               netdev_err(dev, "Cannot init MAC core (error: %d)\n", ret);
+               goto alloc_skbuf_error;
+       }
+
+       priv->dmaops->reset_dma(priv);
+
+       /* Create and initialize the TX/RX descriptors chains. */
+       priv->rx_ring_size = dma_rx_num;
+       priv->tx_ring_size = dma_tx_num;
+       ret = alloc_init_skbufs(priv);
+       if (ret) {
+               netdev_err(dev, "DMA descriptors initialization failed\n");
+               goto alloc_skbuf_error;
+       }
+
+
+       /* Register RX interrupt */
+       ret = request_irq(priv->rx_irq, altera_isr, IRQF_SHARED,
+                         dev->name, dev);
+       if (ret) {
+               netdev_err(dev, "Unable to register RX interrupt %d\n",
+                          priv->rx_irq);
+               goto init_error;
+       }
+
+       /* Register TX interrupt */
+       ret = request_irq(priv->tx_irq, altera_isr, IRQF_SHARED,
+                         dev->name, dev);
+       if (ret) {
+               netdev_err(dev, "Unable to register TX interrupt %d\n",
+                          priv->tx_irq);
+               goto tx_request_irq_error;
+       }
+
+       /* Enable DMA interrupts */
+       spin_lock_irqsave(&priv->rxdma_irq_lock, flags);
+       priv->dmaops->enable_rxirq(priv);
+       priv->dmaops->enable_txirq(priv);
+
+       /* Setup RX descriptor chain */
+       for (i = 0; i < priv->rx_ring_size; i++)
+               priv->dmaops->add_rx_desc(priv, &priv->rx_ring[i]);
+
+       spin_unlock_irqrestore(&priv->rxdma_irq_lock, flags);
+
+       /* Start MAC Rx/Tx */
+       spin_lock(&priv->mac_cfg_lock);
+       tse_set_mac(priv, true);
+       spin_unlock(&priv->mac_cfg_lock);
+
+       if (priv->phydev)
+               phy_start(priv->phydev);
+
+       napi_enable(&priv->napi);
+       netif_start_queue(dev);
+
+       return 0;
+
+tx_request_irq_error:
+       free_irq(priv->rx_irq, dev);
+init_error:
+       free_skbufs(dev);
+alloc_skbuf_error:
+       if (priv->phydev) {
+               phy_disconnect(priv->phydev);
+               priv->phydev = NULL;
+       }
+phy_error:
+       return ret;
+}
+
+/* Stop TSE MAC interface and put the device in an inactive state
+ *
+ * Teardown mirrors tse_open(): PHY first, then queue/NAPI, DMA interrupt
+ * disable, IRQ release, and finally MAC reset, DMA reset and buffer
+ * release while holding both mac_cfg_lock and tx_lock so no concurrent
+ * transmit or reconfiguration can touch the hardware.
+ */
+static int tse_shutdown(struct net_device *dev)
+{
+       struct altera_tse_private *priv = netdev_priv(dev);
+       int ret;
+       unsigned long int flags;
+
+       /* Stop and disconnect the PHY */
+       if (priv->phydev) {
+               phy_stop(priv->phydev);
+               phy_disconnect(priv->phydev);
+               priv->phydev = NULL;
+       }
+
+       netif_stop_queue(dev);
+       napi_disable(&priv->napi);
+
+       /* Disable DMA interrupts */
+       spin_lock_irqsave(&priv->rxdma_irq_lock, flags);
+       priv->dmaops->disable_rxirq(priv);
+       priv->dmaops->disable_txirq(priv);
+       spin_unlock_irqrestore(&priv->rxdma_irq_lock, flags);
+
+       /* Free the IRQ lines */
+       free_irq(priv->rx_irq, dev);
+       free_irq(priv->tx_irq, dev);
+
+       /* disable and reset the MAC, empties fifo */
+       spin_lock(&priv->mac_cfg_lock);
+       spin_lock(&priv->tx_lock);
+
+       ret = reset_mac(priv);
+       if (ret)
+               netdev_err(dev, "Cannot reset MAC core (error: %d)\n", ret);
+       priv->dmaops->reset_dma(priv);
+       free_skbufs(dev);
+
+       spin_unlock(&priv->tx_lock);
+       spin_unlock(&priv->mac_cfg_lock);
+
+       priv->dmaops->uninit_dma(priv);
+
+       return 0;
+}
+
+/* Deliberately not const: ndo_set_rx_mode is patched at probe time to the
+ * hash-filter variant when the instance advertises a multicast hash table.
+ * NOTE(review): the ops struct is driver-global, so with multiple probed
+ * devices the last probe's choice wins -- confirm all instances share the
+ * same hash-filter capability, or move to per-device ops.
+ */
+static struct net_device_ops altera_tse_netdev_ops = {
+       .ndo_open               = tse_open,
+       .ndo_stop               = tse_shutdown,
+       .ndo_start_xmit         = tse_start_xmit,
+       .ndo_set_mac_address    = eth_mac_addr,
+       .ndo_set_rx_mode        = tse_set_rx_mode,
+       .ndo_change_mtu         = tse_change_mtu,
+       .ndo_validate_addr      = eth_validate_addr,
+};
+
+
+/* Look up the MEM resource named @name on @pdev, request the region and
+ * ioremap it. On success *res holds the resource and *ptr the mapped
+ * address; both are device-managed, so no explicit teardown is needed.
+ * Returns 0 or a negative errno.
+ */
+static int request_and_map(struct platform_device *pdev, const char *name,
+                          struct resource **res, void __iomem **ptr)
+{
+       struct device *dev = &pdev->dev;
+       struct resource *region;
+
+       *res = platform_get_resource_byname(pdev, IORESOURCE_MEM, name);
+       if (!*res) {
+               dev_err(dev, "resource %s not defined\n", name);
+               return -ENODEV;
+       }
+
+       region = devm_request_mem_region(dev, (*res)->start,
+                                        resource_size(*res), dev_name(dev));
+       if (!region) {
+               dev_err(dev, "unable to request %s\n", name);
+               return -EBUSY;
+       }
+
+       *ptr = devm_ioremap_nocache(dev, region->start,
+                                   resource_size(region));
+       if (!*ptr) {
+               dev_err(dev, "ioremap_nocache of %s failed!", name);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/* Probe Altera TSE MAC device
+ *
+ * Maps the DMA and MAC register regions named in the device tree, applies
+ * per-DMA-type setup (SGDMA descriptor memory vs. MSGDMA response and
+ * descriptor spaces), reads FIFO depths and filtering capabilities,
+ * creates the MDIO bus, registers the net_device and finally attaches the
+ * PHY.
+ *
+ * Fixes folded in:
+ *  - "tx-fifo-depth" was read into priv->rx_fifo_depth, leaving
+ *    tx_fifo_depth uninitialized and clobbering the Rx value;
+ *  - several error paths jumped to out_free with a stale ret of 0 (the
+ *    last successful request_and_map), making probe report success after
+ *    freeing the netdev;
+ *  - IRQ lookups only recognized -ENXIO as failure instead of any
+ *    negative errno.
+ */
+static int altera_tse_probe(struct platform_device *pdev)
+{
+       struct net_device *ndev;
+       int ret = -ENODEV;
+       struct resource *control_port;
+       struct resource *dma_res;
+       struct altera_tse_private *priv;
+       const unsigned char *macaddr;
+       struct device_node *np = pdev->dev.of_node;
+       void __iomem *descmap;
+       const struct of_device_id *of_id = NULL;
+
+       ndev = alloc_etherdev(sizeof(struct altera_tse_private));
+       if (!ndev) {
+               dev_err(&pdev->dev, "Could not allocate network device\n");
+               return -ENODEV;
+       }
+
+       SET_NETDEV_DEV(ndev, &pdev->dev);
+
+       priv = netdev_priv(ndev);
+       priv->device = &pdev->dev;
+       priv->dev = ndev;
+       priv->msg_enable = netif_msg_init(debug, default_msg_level);
+
+       /* Select the DMA backend from the matched compatible string */
+       of_id = of_match_device(altera_tse_ids, &pdev->dev);
+
+       if (of_id)
+               priv->dmaops = (struct altera_dmaops *)of_id->data;
+
+       if (priv->dmaops &&
+           priv->dmaops->altera_dtype == ALTERA_DTYPE_SGDMA) {
+               /* Get the mapped address to the SGDMA descriptor memory */
+               ret = request_and_map(pdev, "s1", &dma_res, &descmap);
+               if (ret)
+                       goto out_free;
+
+               /* Start of that memory is for transmit descriptors */
+               priv->tx_dma_desc = descmap;
+
+               /* First half is for tx descriptors, other half for rx */
+               priv->txdescmem = resource_size(dma_res)/2;
+
+               priv->txdescmem_busaddr = (dma_addr_t)dma_res->start;
+
+               priv->rx_dma_desc = (void __iomem *)((uintptr_t)(descmap +
+                                                    priv->txdescmem));
+               priv->rxdescmem = resource_size(dma_res)/2;
+               priv->rxdescmem_busaddr = dma_res->start;
+               priv->rxdescmem_busaddr += priv->txdescmem;
+
+               /* The SGDMA only addresses 32 bits; descriptor memory above
+                * 4GB is unusable. Fail with a real error code instead of
+                * falling through with the stale 0 from request_and_map().
+                */
+               if (upper_32_bits(priv->rxdescmem_busaddr)) {
+                       dev_dbg(priv->device,
+                               "SGDMA bus addresses greater than 32-bits\n");
+                       ret = -EINVAL;
+                       goto out_free;
+               }
+               if (upper_32_bits(priv->txdescmem_busaddr)) {
+                       dev_dbg(priv->device,
+                               "SGDMA bus addresses greater than 32-bits\n");
+                       ret = -EINVAL;
+                       goto out_free;
+               }
+       } else if (priv->dmaops &&
+                  priv->dmaops->altera_dtype == ALTERA_DTYPE_MSGDMA) {
+               ret = request_and_map(pdev, "rx_resp", &dma_res,
+                                     &priv->rx_dma_resp);
+               if (ret)
+                       goto out_free;
+
+               ret = request_and_map(pdev, "tx_desc", &dma_res,
+                                     &priv->tx_dma_desc);
+               if (ret)
+                       goto out_free;
+
+               priv->txdescmem = resource_size(dma_res);
+               priv->txdescmem_busaddr = dma_res->start;
+
+               ret = request_and_map(pdev, "rx_desc", &dma_res,
+                                     &priv->rx_dma_desc);
+               if (ret)
+                       goto out_free;
+
+               priv->rxdescmem = resource_size(dma_res);
+               priv->rxdescmem_busaddr = dma_res->start;
+
+       } else {
+               /* no DMA ops bound or unknown type; ret is still -ENODEV */
+               goto out_free;
+       }
+
+       if (!dma_set_mask(priv->device, DMA_BIT_MASK(priv->dmaops->dmamask))) {
+               dma_set_coherent_mask(priv->device,
+                                     DMA_BIT_MASK(priv->dmaops->dmamask));
+       } else if (!dma_set_mask(priv->device, DMA_BIT_MASK(32))) {
+               dma_set_coherent_mask(priv->device, DMA_BIT_MASK(32));
+       } else {
+               /* Fix: previously fell through with a stale ret of 0 */
+               ret = -EIO;
+               goto out_free;
+       }
+
+       /* MAC address space */
+       ret = request_and_map(pdev, "control_port", &control_port,
+                             (void __iomem **)&priv->mac_dev);
+       if (ret)
+               goto out_free;
+
+       /* xSGDMA Rx Dispatcher address space */
+       ret = request_and_map(pdev, "rx_csr", &dma_res,
+                             &priv->rx_dma_csr);
+       if (ret)
+               goto out_free;
+
+
+       /* xSGDMA Tx Dispatcher address space */
+       ret = request_and_map(pdev, "tx_csr", &dma_res,
+                             &priv->tx_dma_csr);
+       if (ret)
+               goto out_free;
+
+
+       /* Rx IRQ -- any negative errno is a failure, not just -ENXIO */
+       priv->rx_irq = platform_get_irq_byname(pdev, "rx_irq");
+       if (priv->rx_irq < 0) {
+               dev_err(&pdev->dev, "cannot obtain Rx IRQ\n");
+               ret = -ENXIO;
+               goto out_free;
+       }
+
+       /* Tx IRQ */
+       priv->tx_irq = platform_get_irq_byname(pdev, "tx_irq");
+       if (priv->tx_irq < 0) {
+               dev_err(&pdev->dev, "cannot obtain Tx IRQ\n");
+               ret = -ENXIO;
+               goto out_free;
+       }
+
+       /* get FIFO depths from device tree */
+       if (of_property_read_u32(pdev->dev.of_node, "rx-fifo-depth",
+                                &priv->rx_fifo_depth)) {
+               dev_err(&pdev->dev, "cannot obtain rx-fifo-depth\n");
+               ret = -ENXIO;
+               goto out_free;
+       }
+
+       /* Fix: this read previously targeted &priv->rx_fifo_depth, leaving
+        * tx_fifo_depth uninitialized and clobbering the Rx value.
+        */
+       if (of_property_read_u32(pdev->dev.of_node, "tx-fifo-depth",
+                                &priv->tx_fifo_depth)) {
+               dev_err(&pdev->dev, "cannot obtain tx-fifo-depth\n");
+               ret = -ENXIO;
+               goto out_free;
+       }
+
+       /* get hash filter settings for this instance */
+       priv->hash_filter =
+               of_property_read_bool(pdev->dev.of_node,
+                                     "altr,has-hash-multicast-filter");
+
+       /* get supplemental address settings for this instance */
+       priv->added_unicast =
+               of_property_read_bool(pdev->dev.of_node,
+                                     "altr,has-supplementary-unicast");
+
+       /* Max MTU is 1500, ETH_DATA_LEN */
+       priv->max_mtu = ETH_DATA_LEN;
+
+       /* Get the max mtu from the device tree. Note that the
+        * "max-frame-size" parameter is actually max mtu. Definition
+        * in the ePAPR v1.1 spec and usage differ, so go with usage.
+        */
+       of_property_read_u32(pdev->dev.of_node, "max-frame-size",
+                            &priv->max_mtu);
+
+       /* The DMA buffer size already accounts for an alignment bias
+        * to avoid unaligned access exceptions for the NIOS processor,
+        */
+       priv->rx_dma_buf_sz = ALTERA_RXDMABUFFER_SIZE;
+
+       /* get default MAC address from device tree */
+       macaddr = of_get_mac_address(pdev->dev.of_node);
+       if (macaddr)
+               ether_addr_copy(ndev->dev_addr, macaddr);
+       else
+               eth_hw_addr_random(ndev);
+
+       priv->phy_iface = of_get_phy_mode(np);
+
+       /* try to get PHY address from device tree, use PHY autodetection if
+        * no valid address is given
+        */
+       if (of_property_read_u32(pdev->dev.of_node, "phy-addr",
+                                &priv->phy_addr)) {
+               priv->phy_addr = POLL_PHY;
+       }
+
+       if (!((priv->phy_addr == POLL_PHY) ||
+             ((priv->phy_addr >= 0) && (priv->phy_addr < PHY_MAX_ADDR)))) {
+               dev_err(&pdev->dev, "invalid phy-addr specified %d\n",
+                       priv->phy_addr);
+               ret = -ENXIO;   /* Fix: was leaking a success ret of 0 */
+               goto out_free;
+       }
+
+       /* Create/attach to MDIO bus */
+       ret = altera_tse_mdio_create(ndev,
+                                    atomic_add_return(1, &instance_count));
+
+       if (ret)
+               goto out_free;
+
+       /* initialize netdev */
+       ether_setup(ndev);
+       ndev->mem_start = control_port->start;
+       ndev->mem_end = control_port->end;
+       ndev->netdev_ops = &altera_tse_netdev_ops;
+       altera_tse_set_ethtool_ops(ndev);
+
+       altera_tse_netdev_ops.ndo_set_rx_mode = tse_set_rx_mode;
+
+       if (priv->hash_filter)
+               altera_tse_netdev_ops.ndo_set_rx_mode =
+                       tse_set_rx_mode_hashfilter;
+
+       /* Scatter/gather IO is not supported,
+        * so it is turned off
+        */
+       ndev->hw_features &= ~NETIF_F_SG;
+       ndev->features |= ndev->hw_features | NETIF_F_HIGHDMA;
+
+       /* VLAN offloading of tagging, stripping and filtering is not
+        * supported by hardware, but driver will accommodate the
+        * extra 4-byte VLAN tag for processing by upper layers
+        */
+       ndev->features |= NETIF_F_HW_VLAN_CTAG_RX;
+
+       /* setup NAPI interface */
+       netif_napi_add(ndev, &priv->napi, tse_poll, NAPI_POLL_WEIGHT);
+
+       spin_lock_init(&priv->mac_cfg_lock);
+       spin_lock_init(&priv->tx_lock);
+       spin_lock_init(&priv->rxdma_irq_lock);
+
+       ret = register_netdev(ndev);
+       if (ret) {
+               dev_err(&pdev->dev, "failed to register TSE net device\n");
+               goto out_free_mdio;
+       }
+
+       platform_set_drvdata(pdev, ndev);
+
+       priv->revision = ioread32(&priv->mac_dev->megacore_revision);
+
+       if (netif_msg_probe(priv))
+               dev_info(&pdev->dev, "Altera TSE MAC version %d.%d at 0x%08lx irq %d/%d\n",
+                        (priv->revision >> 8) & 0xff,
+                        priv->revision & 0xff,
+                        (unsigned long) control_port->start, priv->rx_irq,
+                        priv->tx_irq);
+
+       ret = init_phy(ndev);
+       if (ret != 0) {
+               netdev_err(ndev, "Cannot attach to PHY (error: %d)\n", ret);
+               goto out_free_mdio;
+       }
+       return 0;
+
+out_free_mdio:
+       altera_tse_mdio_destroy(ndev);
+out_free:
+       free_netdev(ndev);
+       return ret;
+}
+
+/* Remove Altera TSE MAC device
+ *
+ * NOTE(review): the MDIO bus is destroyed before unregister_netdev();
+ * confirm no PHY/MDIO access can occur in that window (tse_shutdown
+ * disconnects the PHY, but unregister has not yet stopped the device).
+ */
+static int altera_tse_remove(struct platform_device *pdev)
+{
+       struct net_device *ndev = platform_get_drvdata(pdev);
+
+       platform_set_drvdata(pdev, NULL);
+       altera_tse_mdio_destroy(ndev);
+       unregister_netdev(ndev);
+       free_netdev(ndev);
+
+       return 0;
+}
+
+/* DMA backend dispatch table for the scatter-gather DMA (SGDMA) core.
+ * 32-bit addressing only; selected via the "altr,tse-1.0" compatibles.
+ */
+struct altera_dmaops altera_dtype_sgdma = {
+       .altera_dtype = ALTERA_DTYPE_SGDMA,
+       .dmamask = 32,
+       .reset_dma = sgdma_reset,
+       .enable_txirq = sgdma_enable_txirq,
+       .enable_rxirq = sgdma_enable_rxirq,
+       .disable_txirq = sgdma_disable_txirq,
+       .disable_rxirq = sgdma_disable_rxirq,
+       .clear_txirq = sgdma_clear_txirq,
+       .clear_rxirq = sgdma_clear_rxirq,
+       .tx_buffer = sgdma_tx_buffer,
+       .tx_completions = sgdma_tx_completions,
+       .add_rx_desc = sgdma_add_rx_desc,
+       .get_rx_status = sgdma_rx_status,
+       .init_dma = sgdma_initialize,
+       .uninit_dma = sgdma_uninitialize,
+};
+
+/* DMA backend dispatch table for the modular scatter-gather DMA (MSGDMA)
+ * core. Supports 64-bit addressing; selected via "altr,tse-msgdma-1.0".
+ */
+struct altera_dmaops altera_dtype_msgdma = {
+       .altera_dtype = ALTERA_DTYPE_MSGDMA,
+       .dmamask = 64,
+       .reset_dma = msgdma_reset,
+       .enable_txirq = msgdma_enable_txirq,
+       .enable_rxirq = msgdma_enable_rxirq,
+       .disable_txirq = msgdma_disable_txirq,
+       .disable_rxirq = msgdma_disable_rxirq,
+       .clear_txirq = msgdma_clear_txirq,
+       .clear_rxirq = msgdma_clear_rxirq,
+       .tx_buffer = msgdma_tx_buffer,
+       .tx_completions = msgdma_tx_completions,
+       .add_rx_desc = msgdma_add_rx_desc,
+       .get_rx_status = msgdma_rx_status,
+       .init_dma = msgdma_initialize,
+       .uninit_dma = msgdma_uninitialize,
+};
+
+/* Device tree match table; .data selects the DMA backend dispatch table.
+ * The uppercase "ALTR" compatible is retained for legacy device trees.
+ */
+static struct of_device_id altera_tse_ids[] = {
+       { .compatible = "altr,tse-msgdma-1.0", .data = &altera_dtype_msgdma, },
+       { .compatible = "altr,tse-1.0", .data = &altera_dtype_sgdma, },
+       { .compatible = "ALTR,tse-1.0", .data = &altera_dtype_sgdma, },
+       {},
+};
+MODULE_DEVICE_TABLE(of, altera_tse_ids);
+
+/* Platform driver glue. The explicit NULL .suspend/.resume members were
+ * removed: unset callbacks default to NULL, so listing them added nothing.
+ */
+static struct platform_driver altera_tse_driver = {
+       .probe          = altera_tse_probe,
+       .remove         = altera_tse_remove,
+       .driver         = {
+               .name   = ALTERA_TSE_RESOURCE_NAME,
+               .owner  = THIS_MODULE,
+               .of_match_table = altera_tse_ids,
+       },
+};
+
+module_platform_driver(altera_tse_driver);
+
+MODULE_AUTHOR("Altera Corporation");
+MODULE_DESCRIPTION("Altera Triple Speed Ethernet MAC driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/ethernet/altera/altera_utils.c b/drivers/net/ethernet/altera/altera_utils.c
new file mode 100644 (file)
index 0000000..70fa13f
--- /dev/null
@@ -0,0 +1,44 @@
+/* Altera TSE SGDMA and MSGDMA Linux driver
+ * Copyright (C) 2014 Altera Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "altera_tse.h"
+#include "altera_utils.h"
+
+/* Read-modify-write: turn on the bits of @bit_mask in the 32-bit register
+ * at @ioaddr.
+ */
+void tse_set_bit(void __iomem *ioaddr, u32 bit_mask)
+{
+       iowrite32(ioread32(ioaddr) | bit_mask, ioaddr);
+}
+
+/* Read-modify-write: turn off the bits of @bit_mask in the 32-bit register
+ * at @ioaddr.
+ */
+void tse_clear_bit(void __iomem *ioaddr, u32 bit_mask)
+{
+       iowrite32(ioread32(ioaddr) & ~bit_mask, ioaddr);
+}
+
+/* Return 1 if any bit of @bit_mask is set in the register at @ioaddr,
+ * 0 otherwise.
+ */
+int tse_bit_is_set(void __iomem *ioaddr, u32 bit_mask)
+{
+       return !!(ioread32(ioaddr) & bit_mask);
+}
+
+/* Return 1 if no bit of @bit_mask is set in the register at @ioaddr,
+ * 0 otherwise.
+ */
+int tse_bit_is_clear(void __iomem *ioaddr, u32 bit_mask)
+{
+       return !(ioread32(ioaddr) & bit_mask);
+}
diff --git a/drivers/net/ethernet/altera/altera_utils.h b/drivers/net/ethernet/altera/altera_utils.h
new file mode 100644 (file)
index 0000000..ce1db36
--- /dev/null
@@ -0,0 +1,27 @@
+/* Altera TSE SGDMA and MSGDMA Linux driver
+ * Copyright (C) 2014 Altera Corporation. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* NOTE(review): this include sits before the guard; harmless for a
+ * guarded header, but conventionally it belongs inside the guard.
+ */
+#include <linux/kernel.h>
+
+#ifndef __ALTERA_UTILS_H__
+#define __ALTERA_UTILS_H__
+
+/* Read-modify-write helpers for 32-bit memory-mapped registers. */
+void tse_set_bit(void __iomem *ioaddr, u32 bit_mask);
+void tse_clear_bit(void __iomem *ioaddr, u32 bit_mask);
+/* Return 1/0 depending on whether any bit in bit_mask is set / all clear. */
+int tse_bit_is_set(void __iomem *ioaddr, u32 bit_mask);
+int tse_bit_is_clear(void __iomem *ioaddr, u32 bit_mask);
+
+#endif /* __ALTERA_UTILS_H__*/
index 3f044e736de8d6b7b809a0d4cf8be1780540388f..eafad410e59a798fb127f6026a5bcd1722081c48 100644 (file)
@@ -7076,12 +7076,14 @@ static DEFINE_PCI_DEVICE_TABLE(e1000_pci_tbl) = {
 MODULE_DEVICE_TABLE(pci, e1000_pci_tbl);
 
 static const struct dev_pm_ops e1000_pm_ops = {
+#ifdef CONFIG_PM_SLEEP
        .suspend        = e1000e_pm_suspend,
        .resume         = e1000e_pm_resume,
        .freeze         = e1000e_pm_freeze,
        .thaw           = e1000e_pm_thaw,
        .poweroff       = e1000e_pm_suspend,
        .restore        = e1000e_pm_resume,
+#endif
        SET_RUNTIME_PM_OPS(e1000e_pm_runtime_suspend, e1000e_pm_runtime_resume,
                           e1000e_pm_runtime_idle)
 };
index c66a11e31e6f3774f795ae91b56089881928cea3..3daaf205eabc034492412b498fcb25a297145bce 100644 (file)
@@ -6578,10 +6578,9 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
        np = netdev_priv(netdev);
        np->vsi = vsi;
 
-       netdev->hw_enc_features = NETIF_F_IP_CSUM        |
+       netdev->hw_enc_features |= NETIF_F_IP_CSUM       |
                                  NETIF_F_GSO_UDP_TUNNEL |
-                                 NETIF_F_TSO            |
-                                 NETIF_F_SG;
+                                 NETIF_F_TSO;
 
        netdev->features = NETIF_F_SG                  |
                           NETIF_F_IP_CSUM             |
index 78618af271cf1b76d35e59b3000602ba86bac504..c688a0fc5c2965a2ddf42d935b1fabd6902ace23 100644 (file)
@@ -160,7 +160,7 @@ i40e_status i40evf_aq_queue_shutdown(struct i40e_hw *hw,
 }
 
 
-/* The i40e_ptype_lookup table is used to convert from the 8-bit ptype in the
+/* The i40evf_ptype_lookup table is used to convert from the 8-bit ptype in the
  * hardware to a bit-field that can be used by SW to more easily determine the
  * packet type.
  *
@@ -173,10 +173,10 @@ i40e_status i40evf_aq_queue_shutdown(struct i40e_hw *hw,
  *
  * Typical work flow:
  *
- * IF NOT i40e_ptype_lookup[ptype].known
+ * IF NOT i40evf_ptype_lookup[ptype].known
  * THEN
  *      Packet is unknown
- * ELSE IF i40e_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP
+ * ELSE IF i40evf_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP
  *      Use the rest of the fields to look at the tunnels, inner protocols, etc
  * ELSE
  *      Use the enum i40e_rx_l2_ptype to decode the packet type
@@ -205,7 +205,7 @@ i40e_status i40evf_aq_queue_shutdown(struct i40e_hw *hw,
 #define I40E_RX_PTYPE_INNER_PROT_TS    I40E_RX_PTYPE_INNER_PROT_TIMESYNC
 
 /* Lookup table mapping the HW PTYPE to the bit field for decoding */
-struct i40e_rx_ptype_decoded i40e_ptype_lookup[] = {
+struct i40e_rx_ptype_decoded i40evf_ptype_lookup[] = {
        /* L2 Packet types */
        I40E_PTT_UNUSED_ENTRY(0),
        I40E_PTT(1,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
index 33c99051cc968d772d6512a10a0804a67c5383b9..862fcdf52675dabb0ea7372f971c49490db29c1a 100644 (file)
@@ -63,11 +63,11 @@ i40e_status i40evf_aq_queue_shutdown(struct i40e_hw *hw,
 
 i40e_status i40e_set_mac_type(struct i40e_hw *hw);
 
-extern struct i40e_rx_ptype_decoded i40e_ptype_lookup[];
+extern struct i40e_rx_ptype_decoded i40evf_ptype_lookup[];
 
 static inline struct i40e_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype)
 {
-       return  i40e_ptype_lookup[ptype];
+       return  i40evf_ptype_lookup[ptype];
 }
 
 /* prototype for functions used for SW locks */
index d6b11522fed7a3096a365cf97795234b9eccf2d8..ac06492fb8163c950d476c0980b108f990b08ee4 100644 (file)
@@ -4347,8 +4347,7 @@ enum latency_range {
  *  were determined based on theoretical maximum wire speed and testing
  *  data, in order to minimize response time while increasing bulk
  *  throughput.
- *  This functionality is controlled by the InterruptThrottleRate module
- *  parameter (see igb_param.c)
+ *  This functionality is controlled by ethtool's coalescing settings.
  *  NOTE:  This function is called only when operating in a multiqueue
  *         receive environment.
  **/
@@ -4422,8 +4421,7 @@ clear_counts:
  *  based on theoretical maximum wire speed and thresholds were set based
  *  on testing data as well as attempting to minimize response time
  *  while increasing bulk throughput.
- *  this functionality is controlled by the InterruptThrottleRate module
- *  parameter (see igb_param.c)
+ *  This functionality is controlled by ethtool's coalescing settings.
  *  NOTE:  These calculations are only valid when operating in a single-
  *         queue environment.
  **/
index 236a4414173a7c4c3be4642990d2c3d84f9310f6..efaca6d5e85b35b3e3144a1105b12051eca8d8c4 100644 (file)
@@ -400,7 +400,8 @@ static void sh_eth_select_mii(struct net_device *ndev)
                value = 0x0;
                break;
        default:
-               pr_warn("PHY interface mode was not setup. Set to MII.\n");
+               netdev_warn(ndev,
+                           "PHY interface mode was not setup. Set to MII.\n");
                value = 0x1;
                break;
        }
@@ -854,7 +855,7 @@ static int sh_eth_check_reset(struct net_device *ndev)
                cnt--;
        }
        if (cnt <= 0) {
-               pr_err("Device reset failed\n");
+               netdev_err(ndev, "Device reset failed\n");
                ret = -ETIMEDOUT;
        }
        return ret;
@@ -1556,8 +1557,7 @@ ignore_link:
                /* Unused write back interrupt */
                if (intr_status & EESR_TABT) {  /* Transmit Abort int */
                        ndev->stats.tx_aborted_errors++;
-                       if (netif_msg_tx_err(mdp))
-                               dev_err(&ndev->dev, "Transmit Abort\n");
+                       netif_err(mdp, tx_err, ndev, "Transmit Abort\n");
                }
        }
 
@@ -1566,45 +1566,38 @@ ignore_link:
                if (intr_status & EESR_RFRMER) {
                        /* Receive Frame Overflow int */
                        ndev->stats.rx_frame_errors++;
-                       if (netif_msg_rx_err(mdp))
-                               dev_err(&ndev->dev, "Receive Abort\n");
+                       netif_err(mdp, rx_err, ndev, "Receive Abort\n");
                }
        }
 
        if (intr_status & EESR_TDE) {
                /* Transmit Descriptor Empty int */
                ndev->stats.tx_fifo_errors++;
-               if (netif_msg_tx_err(mdp))
-                       dev_err(&ndev->dev, "Transmit Descriptor Empty\n");
+               netif_err(mdp, tx_err, ndev, "Transmit Descriptor Empty\n");
        }
 
        if (intr_status & EESR_TFE) {
                /* FIFO under flow */
                ndev->stats.tx_fifo_errors++;
-               if (netif_msg_tx_err(mdp))
-                       dev_err(&ndev->dev, "Transmit FIFO Under flow\n");
+               netif_err(mdp, tx_err, ndev, "Transmit FIFO Under flow\n");
        }
 
        if (intr_status & EESR_RDE) {
                /* Receive Descriptor Empty int */
                ndev->stats.rx_over_errors++;
-
-               if (netif_msg_rx_err(mdp))
-                       dev_err(&ndev->dev, "Receive Descriptor Empty\n");
+               netif_err(mdp, rx_err, ndev, "Receive Descriptor Empty\n");
        }
 
        if (intr_status & EESR_RFE) {
                /* Receive FIFO Overflow int */
                ndev->stats.rx_fifo_errors++;
-               if (netif_msg_rx_err(mdp))
-                       dev_err(&ndev->dev, "Receive FIFO Overflow\n");
+               netif_err(mdp, rx_err, ndev, "Receive FIFO Overflow\n");
        }
 
        if (!mdp->cd->no_ade && (intr_status & EESR_ADE)) {
                /* Address Error */
                ndev->stats.tx_fifo_errors++;
-               if (netif_msg_tx_err(mdp))
-                       dev_err(&ndev->dev, "Address Error\n");
+               netif_err(mdp, tx_err, ndev, "Address Error\n");
        }
 
        mask = EESR_TWB | EESR_TABT | EESR_ADE | EESR_TDE | EESR_TFE;
@@ -1615,9 +1608,9 @@ ignore_link:
                u32 edtrr = sh_eth_read(ndev, EDTRR);
 
                /* dmesg */
-               dev_err(&ndev->dev, "TX error. status=%8.8x cur_tx=%8.8x dirty_tx=%8.8x state=%8.8x EDTRR=%8.8x.\n",
-                       intr_status, mdp->cur_tx, mdp->dirty_tx,
-                       (u32)ndev->state, edtrr);
+               netdev_err(ndev, "TX error. status=%8.8x cur_tx=%8.8x dirty_tx=%8.8x state=%8.8x EDTRR=%8.8x.\n",
+                          intr_status, mdp->cur_tx, mdp->dirty_tx,
+                          (u32)ndev->state, edtrr);
                /* dirty buffer free */
                sh_eth_txfree(ndev);
 
@@ -1662,9 +1655,9 @@ static irqreturn_t sh_eth_interrupt(int irq, void *netdev)
                                     EESIPR);
                        __napi_schedule(&mdp->napi);
                } else {
-                       dev_warn(&ndev->dev,
-                                "ignoring interrupt, status 0x%08lx, mask 0x%08lx.\n",
-                                intr_status, intr_enable);
+                       netdev_warn(ndev,
+                                   "ignoring interrupt, status 0x%08lx, mask 0x%08lx.\n",
+                                   intr_status, intr_enable);
                }
        }
 
@@ -1793,12 +1786,12 @@ static int sh_eth_phy_init(struct net_device *ndev)
        }
 
        if (IS_ERR(phydev)) {
-               dev_err(&ndev->dev, "failed to connect PHY\n");
+               netdev_err(ndev, "failed to connect PHY\n");
                return PTR_ERR(phydev);
        }
 
-       dev_info(&ndev->dev, "attached PHY %d (IRQ %d) to driver %s\n",
-                phydev->addr, phydev->irq, phydev->drv->name);
+       netdev_info(ndev, "attached PHY %d (IRQ %d) to driver %s\n",
+                   phydev->addr, phydev->irq, phydev->drv->name);
 
        mdp->phydev = phydev;
 
@@ -1979,12 +1972,12 @@ static int sh_eth_set_ringparam(struct net_device *ndev,
 
        ret = sh_eth_ring_init(ndev);
        if (ret < 0) {
-               dev_err(&ndev->dev, "%s: sh_eth_ring_init failed.\n", __func__);
+               netdev_err(ndev, "%s: sh_eth_ring_init failed.\n", __func__);
                return ret;
        }
        ret = sh_eth_dev_init(ndev, false);
        if (ret < 0) {
-               dev_err(&ndev->dev, "%s: sh_eth_dev_init failed.\n", __func__);
+               netdev_err(ndev, "%s: sh_eth_dev_init failed.\n", __func__);
                return ret;
        }
 
@@ -2025,7 +2018,7 @@ static int sh_eth_open(struct net_device *ndev)
        ret = request_irq(ndev->irq, sh_eth_interrupt,
                          mdp->cd->irq_flags, ndev->name, ndev);
        if (ret) {
-               dev_err(&ndev->dev, "Can not assign IRQ number\n");
+               netdev_err(ndev, "Can not assign IRQ number\n");
                goto out_napi_off;
        }
 
@@ -2063,10 +2056,9 @@ static void sh_eth_tx_timeout(struct net_device *ndev)
 
        netif_stop_queue(ndev);
 
-       if (netif_msg_timer(mdp)) {
-               dev_err(&ndev->dev, "%s: transmit timed out, status %8.8x, resetting...\n",
-                       ndev->name, (int)sh_eth_read(ndev, EESR));
-       }
+       netif_err(mdp, timer, ndev,
+                 "transmit timed out, status %8.8x, resetting...\n",
+                 (int)sh_eth_read(ndev, EESR));
 
        /* tx_errors count up */
        ndev->stats.tx_errors++;
@@ -2101,8 +2093,7 @@ static int sh_eth_start_xmit(struct sk_buff *skb, struct net_device *ndev)
        spin_lock_irqsave(&mdp->lock, flags);
        if ((mdp->cur_tx - mdp->dirty_tx) >= (mdp->num_tx_ring - 4)) {
                if (!sh_eth_txfree(ndev)) {
-                       if (netif_msg_tx_queued(mdp))
-                               dev_warn(&ndev->dev, "TxFD exhausted.\n");
+                       netif_warn(mdp, tx_queued, ndev, "TxFD exhausted.\n");
                        netif_stop_queue(ndev);
                        spin_unlock_irqrestore(&mdp->lock, flags);
                        return NETDEV_TX_BUSY;
@@ -2272,7 +2263,7 @@ static int sh_eth_tsu_busy(struct net_device *ndev)
                udelay(10);
                timeout--;
                if (timeout <= 0) {
-                       dev_err(&ndev->dev, "%s: timeout\n", __func__);
+                       netdev_err(ndev, "%s: timeout\n", __func__);
                        return -ETIMEDOUT;
                }
        }
@@ -2703,7 +2694,6 @@ static const u16 *sh_eth_get_register_offset(int register_type)
                reg_offset = sh_eth_offset_fast_sh3_sh2;
                break;
        default:
-               pr_err("Unknown register type (%d)\n", register_type);
                break;
        }
 
@@ -2859,6 +2849,12 @@ static int sh_eth_drv_probe(struct platform_device *pdev)
                mdp->cd = (struct sh_eth_cpu_data *)match->data;
        }
        mdp->reg_offset = sh_eth_get_register_offset(mdp->cd->register_type);
+       if (!mdp->reg_offset) {
+               dev_err(&pdev->dev, "Unknown register type (%d)\n",
+                       mdp->cd->register_type);
+               ret = -EINVAL;
+               goto out_release;
+       }
        sh_eth_set_default_cpu_data(mdp->cd);
 
        /* set function */
@@ -2919,8 +2915,8 @@ static int sh_eth_drv_probe(struct platform_device *pdev)
        }
 
        /* print device information */
-       pr_info("Base address at 0x%x, %pM, IRQ %d.\n",
-               (u32)ndev->base_addr, ndev->dev_addr, ndev->irq);
+       netdev_info(ndev, "Base address at 0x%x, %pM, IRQ %d.\n",
+                   (u32)ndev->base_addr, ndev->dev_addr, ndev->irq);
 
        platform_set_drvdata(pdev, ndev);
 
index 5bc1a2d02dc125ebf78ba35a40012baa3f0715b9..9d93fa1205788e371a67cae6a72e3e9412a661c4 100644 (file)
@@ -1022,7 +1022,7 @@ static int rhine_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        /* The chip-specific entries in the device structure. */
        dev->netdev_ops = &rhine_netdev_ops;
-       dev->ethtool_ops = &netdev_ethtool_ops,
+       dev->ethtool_ops = &netdev_ethtool_ops;
        dev->watchdog_timeo = TX_TIMEOUT;
 
        netif_napi_add(dev, &rp->napi, rhine_napipoll, 64);
index ae38a98d072e1c32763723fdb6c5b3a14ff10561..e8004ef73bc1c6a900f57b741d1780391cad769f 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/spi/spi.h>
 #include <linux/spi/at86rf230.h>
 #include <linux/skbuff.h>
+#include <linux/of_gpio.h>
 
 #include <net/mac802154.h>
 #include <net/wpan-phy.h>
@@ -574,7 +575,7 @@ at86rf230_start(struct ieee802154_dev *dev)
        if (rc)
                return rc;
 
-       rc = at86rf230_state(dev, STATE_FORCE_TX_ON);
+       rc = at86rf230_state(dev, STATE_TX_ON);
        if (rc)
                return rc;
 
@@ -914,8 +915,8 @@ static void at86rf230_irqwork(struct work_struct *work)
        status &= ~IRQ_TRX_UR; /* FIXME: possibly handle ???*/
 
        if (status & IRQ_TRX_END) {
-               spin_lock_irqsave(&lp->lock, flags);
                status &= ~IRQ_TRX_END;
+               spin_lock_irqsave(&lp->lock, flags);
                if (lp->is_tx) {
                        lp->is_tx = 0;
                        spin_unlock_irqrestore(&lp->lock, flags);
@@ -1035,6 +1036,40 @@ static int at86rf230_hw_init(struct at86rf230_local *lp)
        return 0;
 }
 
+static struct at86rf230_platform_data *
+at86rf230_get_pdata(struct spi_device *spi)
+{
+       struct at86rf230_platform_data *pdata;
+       const char *irq_type;
+
+       if (!IS_ENABLED(CONFIG_OF) || !spi->dev.of_node)
+               return spi->dev.platform_data;
+
+       pdata = devm_kzalloc(&spi->dev, sizeof(*pdata), GFP_KERNEL);
+       if (!pdata)
+               goto done;
+
+       pdata->rstn = of_get_named_gpio(spi->dev.of_node, "reset-gpio", 0);
+       pdata->slp_tr = of_get_named_gpio(spi->dev.of_node, "sleep-gpio", 0);
+
+       pdata->irq_type = IRQF_TRIGGER_RISING;
+       of_property_read_string(spi->dev.of_node, "irq-type", &irq_type);
+       if (!strcmp(irq_type, "level-high"))
+               pdata->irq_type = IRQF_TRIGGER_HIGH;
+       else if (!strcmp(irq_type, "level-low"))
+               pdata->irq_type = IRQF_TRIGGER_LOW;
+       else if (!strcmp(irq_type, "edge-rising"))
+               pdata->irq_type = IRQF_TRIGGER_RISING;
+       else if (!strcmp(irq_type, "edge-falling"))
+               pdata->irq_type = IRQF_TRIGGER_FALLING;
+       else
+               dev_warn(&spi->dev, "wrong irq-type specified using edge-rising\n");
+
+       spi->dev.platform_data = pdata;
+done:
+       return pdata;
+}
+
 static int at86rf230_probe(struct spi_device *spi)
 {
        struct at86rf230_platform_data *pdata;
@@ -1053,15 +1088,17 @@ static int at86rf230_probe(struct spi_device *spi)
                return -EINVAL;
        }
 
-       pdata = spi->dev.platform_data;
+       pdata = at86rf230_get_pdata(spi);
        if (!pdata) {
                dev_err(&spi->dev, "no platform_data\n");
                return -EINVAL;
        }
 
-       rc = gpio_request(pdata->rstn, "rstn");
-       if (rc)
-               return rc;
+       if (gpio_is_valid(pdata->rstn)) {
+               rc = gpio_request(pdata->rstn, "rstn");
+               if (rc)
+                       return rc;
+       }
 
        if (gpio_is_valid(pdata->slp_tr)) {
                rc = gpio_request(pdata->slp_tr, "slp_tr");
@@ -1069,9 +1106,11 @@ static int at86rf230_probe(struct spi_device *spi)
                        goto err_slp_tr;
        }
 
-       rc = gpio_direction_output(pdata->rstn, 1);
-       if (rc)
-               goto err_gpio_dir;
+       if (gpio_is_valid(pdata->rstn)) {
+               rc = gpio_direction_output(pdata->rstn, 1);
+               if (rc)
+                       goto err_gpio_dir;
+       }
 
        if (gpio_is_valid(pdata->slp_tr)) {
                rc = gpio_direction_output(pdata->slp_tr, 0);
@@ -1080,11 +1119,13 @@ static int at86rf230_probe(struct spi_device *spi)
        }
 
        /* Reset */
-       msleep(1);
-       gpio_set_value(pdata->rstn, 0);
-       msleep(1);
-       gpio_set_value(pdata->rstn, 1);
-       msleep(1);
+       if (gpio_is_valid(pdata->rstn)) {
+               udelay(1);
+               gpio_set_value(pdata->rstn, 0);
+               udelay(1);
+               gpio_set_value(pdata->rstn, 1);
+               usleep_range(120, 240);
+       }
 
        rc = __at86rf230_detect_device(spi, &man_id, &part, &version);
        if (rc < 0)
@@ -1198,7 +1239,8 @@ err_gpio_dir:
        if (gpio_is_valid(pdata->slp_tr))
                gpio_free(pdata->slp_tr);
 err_slp_tr:
-       gpio_free(pdata->rstn);
+       if (gpio_is_valid(pdata->rstn))
+               gpio_free(pdata->rstn);
        return rc;
 }
 
@@ -1214,7 +1256,8 @@ static int at86rf230_remove(struct spi_device *spi)
 
        if (gpio_is_valid(pdata->slp_tr))
                gpio_free(pdata->slp_tr);
-       gpio_free(pdata->rstn);
+       if (gpio_is_valid(pdata->rstn))
+               gpio_free(pdata->rstn);
 
        mutex_destroy(&lp->bmux);
        ieee802154_free_device(lp->dev);
@@ -1223,8 +1266,19 @@ static int at86rf230_remove(struct spi_device *spi)
        return 0;
 }
 
+#if IS_ENABLED(CONFIG_OF)
+static struct of_device_id at86rf230_of_match[] = {
+       { .compatible = "atmel,at86rf230", },
+       { .compatible = "atmel,at86rf231", },
+       { .compatible = "atmel,at86rf233", },
+       { .compatible = "atmel,at86rf212", },
+       { },
+};
+#endif
+
 static struct spi_driver at86rf230_driver = {
        .driver = {
+               .of_match_table = of_match_ptr(at86rf230_of_match),
                .name   = "at86rf230",
                .owner  = THIS_MODULE,
        },
index b8d8c805fd757d61bdd941661626d79022483851..4b6d12c7b803b7955848fa9309d725ba09da4d79 100644 (file)
@@ -1979,9 +1979,6 @@ struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
 int netdev_get_name(struct net *net, char *name, int ifindex);
 int dev_restart(struct net_device *dev);
-#ifdef CONFIG_NETPOLL_TRAP
-int netpoll_trap(void);
-#endif
 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb);
 
 static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
@@ -2186,12 +2183,6 @@ static inline void netif_tx_start_all_queues(struct net_device *dev)
 
 static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 {
-#ifdef CONFIG_NETPOLL_TRAP
-       if (netpoll_trap()) {
-               netif_tx_start_queue(dev_queue);
-               return;
-       }
-#endif
        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state))
                __netif_schedule(dev_queue->qdisc);
 }
@@ -2435,10 +2426,6 @@ static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
 static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
 {
        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
-#ifdef CONFIG_NETPOLL_TRAP
-       if (netpoll_trap())
-               return;
-#endif
        netif_tx_stop_queue(txq);
 }
 
@@ -2473,10 +2460,6 @@ static inline bool netif_subqueue_stopped(const struct net_device *dev,
 static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
 {
        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
-#ifdef CONFIG_NETPOLL_TRAP
-       if (netpoll_trap())
-               return;
-#endif
        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state))
                __netif_schedule(txq->qdisc);
 }
index 0c7d01eae56cf8626d2d1fc55242b3cbae0ba1ad..96afc29184bee810a1ec550933cfb15998f42efb 100644 (file)
@@ -39,11 +39,13 @@ enum ip_set_feature {
        IPSET_TYPE_NAME = (1 << IPSET_TYPE_NAME_FLAG),
        IPSET_TYPE_IFACE_FLAG = 5,
        IPSET_TYPE_IFACE = (1 << IPSET_TYPE_IFACE_FLAG),
-       IPSET_TYPE_NOMATCH_FLAG = 6,
+       IPSET_TYPE_MARK_FLAG = 6,
+       IPSET_TYPE_MARK = (1 << IPSET_TYPE_MARK_FLAG),
+       IPSET_TYPE_NOMATCH_FLAG = 7,
        IPSET_TYPE_NOMATCH = (1 << IPSET_TYPE_NOMATCH_FLAG),
        /* Strictly speaking not a feature, but a flag for dumping:
         * this settype must be dumped last */
-       IPSET_DUMP_LAST_FLAG = 7,
+       IPSET_DUMP_LAST_FLAG = 8,
        IPSET_DUMP_LAST = (1 << IPSET_DUMP_LAST_FLAG),
 };
 
@@ -63,6 +65,7 @@ enum ip_set_extension {
 #define SET_WITH_TIMEOUT(s)    ((s)->extensions & IPSET_EXT_TIMEOUT)
 #define SET_WITH_COUNTER(s)    ((s)->extensions & IPSET_EXT_COUNTER)
 #define SET_WITH_COMMENT(s)    ((s)->extensions & IPSET_EXT_COMMENT)
+#define SET_WITH_FORCEADD(s)   ((s)->flags & IPSET_CREATE_FLAG_FORCEADD)
 
 /* Extension id, in size order */
 enum ip_set_ext_id {
@@ -171,8 +174,6 @@ struct ip_set_type {
        char name[IPSET_MAXNAMELEN];
        /* Protocol version */
        u8 protocol;
-       /* Set features to control swapping */
-       u8 features;
        /* Set type dimension */
        u8 dimension;
        /*
@@ -182,6 +183,8 @@ struct ip_set_type {
        u8 family;
        /* Type revisions */
        u8 revision_min, revision_max;
+       /* Set features to control swapping */
+       u16 features;
 
        /* Create set */
        int (*create)(struct net *net, struct ip_set *set,
@@ -217,6 +220,8 @@ struct ip_set {
        u8 revision;
        /* Extensions */
        u8 extensions;
+       /* Create flags */
+       u8 flags;
        /* Default timeout value, if enabled */
        u32 timeout;
        /* Element data size */
@@ -251,6 +256,8 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
                cadt_flags |= IPSET_FLAG_WITH_COUNTERS;
        if (SET_WITH_COMMENT(set))
                cadt_flags |= IPSET_FLAG_WITH_COMMENT;
+       if (SET_WITH_FORCEADD(set))
+               cadt_flags |= IPSET_FLAG_WITH_FORCEADD;
 
        if (!cadt_flags)
                return 0;
index 28c74367e900ac679aa5feba98b7629ea6338af3..e955d47306259c5867d80831bd78d99490849b0b 100644 (file)
@@ -44,6 +44,27 @@ int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
 
 void nfnl_lock(__u8 subsys_id);
 void nfnl_unlock(__u8 subsys_id);
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_nfnl_is_held(__u8 subsys_id);
+#else
+static inline int lockdep_nfnl_is_held(__u8 subsys_id)
+{
+       return 1;
+}
+#endif /* CONFIG_PROVE_LOCKING */
+
+/*
+ * nfnl_dereference - fetch RCU pointer when updates are prevented by subsys mutex
+ *
+ * @p: The pointer to read, prior to dereferencing
+ * @ss: The nfnetlink subsystem ID
+ *
+ * Return the value of the specified RCU-protected pointer, but omit
+ * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because
+ * caller holds the NFNL subsystem mutex.
+ */
+#define nfnl_dereference(p, ss)                                        \
+       rcu_dereference_protected(p, lockdep_nfnl_is_held(ss))
 
 #define MODULE_ALIAS_NFNL_SUBSYS(subsys) \
        MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys))
index fbfdb9d8d3a7f59b4788bdfd1cdb26c86f0a81ca..1b475a5a72393e8e4f600da1a36073ed845d6bb5 100644 (file)
@@ -24,27 +24,20 @@ struct netpoll {
        struct net_device *dev;
        char dev_name[IFNAMSIZ];
        const char *name;
-       void (*rx_skb_hook)(struct netpoll *np, int source, struct sk_buff *skb,
-                           int offset, int len);
 
        union inet_addr local_ip, remote_ip;
        bool ipv6;
        u16 local_port, remote_port;
        u8 remote_mac[ETH_ALEN];
 
-       struct list_head rx; /* rx_np list element */
        struct work_struct cleanup_work;
 };
 
 struct netpoll_info {
        atomic_t refcnt;
 
-       unsigned long rx_flags;
-       spinlock_t rx_lock;
        struct semaphore dev_lock;
-       struct list_head rx_np; /* netpolls that registered an rx_skb_hook */
 
-       struct sk_buff_head neigh_tx; /* list of neigh requests to reply to */
        struct sk_buff_head txq;
 
        struct delayed_work tx_work;
@@ -66,12 +59,9 @@ void netpoll_print_options(struct netpoll *np);
 int netpoll_parse_options(struct netpoll *np, char *opt);
 int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp);
 int netpoll_setup(struct netpoll *np);
-int netpoll_trap(void);
-void netpoll_set_trap(int trap);
 void __netpoll_cleanup(struct netpoll *np);
 void __netpoll_free_async(struct netpoll *np);
 void netpoll_cleanup(struct netpoll *np);
-int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo);
 void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
                             struct net_device *dev);
 static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
@@ -82,46 +72,7 @@ static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
        local_irq_restore(flags);
 }
 
-
-
 #ifdef CONFIG_NETPOLL
-static inline bool netpoll_rx_on(struct sk_buff *skb)
-{
-       struct netpoll_info *npinfo = rcu_dereference_bh(skb->dev->npinfo);
-
-       return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags);
-}
-
-static inline bool netpoll_rx(struct sk_buff *skb)
-{
-       struct netpoll_info *npinfo;
-       unsigned long flags;
-       bool ret = false;
-
-       local_irq_save(flags);
-
-       if (!netpoll_rx_on(skb))
-               goto out;
-
-       npinfo = rcu_dereference_bh(skb->dev->npinfo);
-       spin_lock(&npinfo->rx_lock);
-       /* check rx_flags again with the lock held */
-       if (npinfo->rx_flags && __netpoll_rx(skb, npinfo))
-               ret = true;
-       spin_unlock(&npinfo->rx_lock);
-
-out:
-       local_irq_restore(flags);
-       return ret;
-}
-
-static inline int netpoll_receive_skb(struct sk_buff *skb)
-{
-       if (!list_empty(&skb->dev->napi_list))
-               return netpoll_rx(skb);
-       return 0;
-}
-
 static inline void *netpoll_poll_lock(struct napi_struct *napi)
 {
        struct net_device *dev = napi->dev;
@@ -150,18 +101,6 @@ static inline bool netpoll_tx_running(struct net_device *dev)
 }
 
 #else
-static inline bool netpoll_rx(struct sk_buff *skb)
-{
-       return false;
-}
-static inline bool netpoll_rx_on(struct sk_buff *skb)
-{
-       return false;
-}
-static inline int netpoll_receive_skb(struct sk_buff *skb)
-{
-       return 0;
-}
 static inline void *netpoll_poll_lock(struct napi_struct *napi)
 {
        return NULL;
index b2ac6246b7e0abe156b26a06bf9a4463331baa6e..37252f71a38037d0e969699dc7376e6dcae1c5ec 100644 (file)
@@ -73,10 +73,17 @@ struct nf_conn_help {
 
 struct nf_conn {
        /* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
-           plus 1 for any connection(s) we are `master' for */
+        * plus 1 for any connection(s) we are `master' for
+        *
+        * Hint, SKB address this struct and refcnt via skb->nfct and
+        * helpers nf_conntrack_get() and nf_conntrack_put().
+        * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt,
+        * beware nf_ct_get() is different and don't inc refcnt.
+        */
        struct nf_conntrack ct_general;
 
-       spinlock_t lock;
+       spinlock_t      lock;
+       u16             cpu;
 
        /* XXX should I move this to the tail ? - Y.K */
        /* These are my tuples; original and reply */
index 15308b8eb5b5218330c3043c46a02537d00779a2..cc0c1882760270f21bd1cc7080cece06afd518d8 100644 (file)
@@ -77,6 +77,13 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
             const struct nf_conntrack_l3proto *l3proto,
             const struct nf_conntrack_l4proto *proto);
 
-extern spinlock_t nf_conntrack_lock ;
+#ifdef CONFIG_LOCKDEP
+# define CONNTRACK_LOCKS 8
+#else
+# define CONNTRACK_LOCKS 1024
+#endif
+extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+
+extern spinlock_t nf_conntrack_expect_lock;
 
 #endif /* _NF_CONNTRACK_CORE_H */
index c985695283b3a34d5d78aa99b062ab4dfb839c18..dec6336bf850f77928f54f4cd04d37b3ebe3e16a 100644 (file)
@@ -7,6 +7,8 @@
 
 #include <uapi/linux/netfilter/xt_connlabel.h>
 
+#define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)
+
 struct nf_conn_labels {
        u8 words;
        unsigned long bits[];
@@ -29,7 +31,7 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
        u8 words;
 
        words = ACCESS_ONCE(net->ct.label_words);
-       if (words == 0 || WARN_ON_ONCE(words > 8))
+       if (words == 0)
                return NULL;
 
        cl_ext = nf_ct_ext_add_length(ct, NF_CT_EXT_LABELS,
index e7e14ffe0f6a0e0f545af45864aa248980250da2..e6bc14d8fa9a9a4b324fac9df5a47e64277f2aa8 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/list.h>
 #include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netlink.h>
@@ -288,7 +289,8 @@ struct nft_expr_ops {
        int                             (*init)(const struct nft_ctx *ctx,
                                                const struct nft_expr *expr,
                                                const struct nlattr * const tb[]);
-       void                            (*destroy)(const struct nft_expr *expr);
+       void                            (*destroy)(const struct nft_ctx *ctx,
+                                                  const struct nft_expr *expr);
        int                             (*dump)(struct sk_buff *skb,
                                                const struct nft_expr *expr);
        int                             (*validate)(const struct nft_ctx *ctx,
@@ -325,13 +327,15 @@ static inline void *nft_expr_priv(const struct nft_expr *expr)
  *     @handle: rule handle
  *     @genmask: generation mask
  *     @dlen: length of expression data
+ *     @ulen: length of user data (used for comments)
  *     @data: expression data
  */
 struct nft_rule {
        struct list_head                list;
-       u64                             handle:46,
+       u64                             handle:42,
                                        genmask:2,
-                                       dlen:16;
+                                       dlen:12,
+                                       ulen:8;
        unsigned char                   data[]
                __attribute__((aligned(__alignof__(struct nft_expr))));
 };
@@ -340,19 +344,13 @@ struct nft_rule {
  *     struct nft_rule_trans - nf_tables rule update in transaction
  *
  *     @list: used internally
+ *     @ctx: rule context
  *     @rule: rule that needs to be updated
- *     @chain: chain that this rule belongs to
- *     @table: table for which this chain applies
- *     @nlh: netlink header of the message that contain this update
- *     @family: family expressesed as AF_*
  */
 struct nft_rule_trans {
        struct list_head                list;
+       struct nft_ctx                  ctx;
        struct nft_rule                 *rule;
-       const struct nft_chain          *chain;
-       const struct nft_table          *table;
-       const struct nlmsghdr           *nlh;
-       u8                              family;
 };
 
 static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule)
@@ -370,6 +368,11 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
        return (struct nft_expr *)&rule->data[rule->dlen];
 }
 
+static inline void *nft_userdata(const struct nft_rule *rule)
+{
+       return (void *)&rule->data[rule->dlen];
+}
+
 /*
  * The last pointer isn't really necessary, but the compiler isn't able to
  * determine that the result of nft_expr_last() is always the same since it
@@ -521,6 +524,9 @@ void nft_unregister_chain_type(const struct nf_chain_type *);
 int nft_register_expr(struct nft_expr_type *);
 void nft_unregister_expr(struct nft_expr_type *);
 
+#define nft_dereference(p)                                     \
+       nfnl_dereference(p, NFNL_SUBSYS_NFTABLES)
+
 #define MODULE_ALIAS_NFT_FAMILY(family)        \
        MODULE_ALIAS("nft-afinfo-" __stringify(family))
 
index fbcc7fa536dc4ab49440d4566e63fedf8b860d2a..773cce308bc61ce312c4a5ecab0464ba84e1ce41 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/list_nulls.h>
 #include <linux/atomic.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/seqlock.h>
 
 struct ctl_table_header;
 struct nf_conntrack_ecache;
@@ -62,6 +63,13 @@ struct nf_ip_net {
 #endif
 };
 
+struct ct_pcpu {
+       spinlock_t              lock;
+       struct hlist_nulls_head unconfirmed;
+       struct hlist_nulls_head dying;
+       struct hlist_nulls_head tmpl;
+};
+
 struct netns_ct {
        atomic_t                count;
        unsigned int            expect_count;
@@ -83,12 +91,11 @@ struct netns_ct {
        int                     sysctl_checksum;
 
        unsigned int            htable_size;
+       seqcount_t              generation;
        struct kmem_cache       *nf_conntrack_cachep;
        struct hlist_nulls_head *hash;
        struct hlist_head       *expect_hash;
-       struct hlist_nulls_head unconfirmed;
-       struct hlist_nulls_head dying;
-       struct hlist_nulls_head tmpl;
+       struct ct_pcpu __percpu *pcpu_lists;
        struct ip_conntrack_stat __percpu *stat;
        struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
        struct nf_exp_event_notifier __rcu *nf_expect_event_cb;
index 23bfd4591e8b1a299cf74447d9d8db0d349044e0..32682ae47b3fe6d88e7750120089efc791b38894 100644 (file)
@@ -121,7 +121,7 @@ struct xfrm_state_walk {
        u8                      dying;
        u8                      proto;
        u32                     seq;
-       struct xfrm_filter      *filter;
+       struct xfrm_address_filter *filter;
 };
 
 /* Full description of state of transformer. */
@@ -349,6 +349,16 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo);
 struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
 void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
 
+struct xfrm_input_afinfo {
+       unsigned int            family;
+       struct module           *owner;
+       int                     (*callback)(struct sk_buff *skb, u8 protocol,
+                                           int err);
+};
+
+int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo);
+int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo);
+
 void xfrm_state_delete_tunnel(struct xfrm_state *x);
 
 struct xfrm_type {
@@ -1364,18 +1374,22 @@ struct xfrm4_protocol {
        int priority;
 };
 
-/* XFRM tunnel handlers.  */
-struct xfrm_tunnel {
+struct xfrm6_protocol {
        int (*handler)(struct sk_buff *skb);
-       int (*err_handler)(struct sk_buff *skb, u32 info);
+       int (*cb_handler)(struct sk_buff *skb, int err);
+       int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
+                          u8 type, u8 code, int offset, __be32 info);
 
-       struct xfrm_tunnel __rcu *next;
+       struct xfrm6_protocol __rcu *next;
        int priority;
 };
 
-struct xfrm_tunnel_notifier {
+/* XFRM tunnel handlers.  */
+struct xfrm_tunnel {
        int (*handler)(struct sk_buff *skb);
-       struct xfrm_tunnel_notifier __rcu *next;
+       int (*err_handler)(struct sk_buff *skb, u32 info);
+
+       struct xfrm_tunnel __rcu *next;
        int priority;
 };
 
@@ -1392,11 +1406,14 @@ void xfrm4_init(void);
 int xfrm_state_init(struct net *net);
 void xfrm_state_fini(struct net *net);
 void xfrm4_state_init(void);
+void xfrm4_protocol_init(void);
 #ifdef CONFIG_XFRM
 int xfrm6_init(void);
 void xfrm6_fini(void);
 int xfrm6_state_init(void);
 void xfrm6_state_fini(void);
+int xfrm6_protocol_init(void);
+void xfrm6_protocol_fini(void);
 #else
 static inline int xfrm6_init(void)
 {
@@ -1423,7 +1440,7 @@ static inline void xfrm_sysctl_fini(struct net *net)
 #endif
 
 void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto,
-                         struct xfrm_filter *filter);
+                         struct xfrm_address_filter *filter);
 int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
                    int (*func)(struct xfrm_state *, int, void*), void *);
 void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net);
@@ -1531,8 +1548,6 @@ int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char prot
 int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family);
 int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family);
 void xfrm4_local_error(struct sk_buff *skb, u32 mtu);
-int xfrm6_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler);
-int xfrm6_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler);
 int xfrm6_extract_header(struct sk_buff *skb);
 int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi);
@@ -1541,6 +1556,9 @@ int xfrm6_rcv(struct sk_buff *skb);
 int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
                     xfrm_address_t *saddr, u8 proto);
 void xfrm6_local_error(struct sk_buff *skb, u32 mtu);
+int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err);
+int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol);
+int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol);
 int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family);
 int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family);
 __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr);
@@ -1784,18 +1802,6 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m)
        return ret;
 }
 
-static inline int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family,
-                             u8 protocol, int err)
-{
-       switch(family) {
-#ifdef CONFIG_INET
-       case AF_INET:
-               return xfrm4_rcv_cb(skb, protocol, err);
-#endif
-       }
-       return 0;
-}
-
 static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
                                    unsigned int family)
 {
index 25d3b2f79c022e92cfd8f851e0f9ae8a2f7b731a..78c2f2e799208a467b75c8035b4d3362365362bb 100644 (file)
@@ -82,6 +82,8 @@ enum {
        IPSET_ATTR_PROTO,       /* 7 */
        IPSET_ATTR_CADT_FLAGS,  /* 8 */
        IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO,     /* 9 */
+       IPSET_ATTR_MARK,        /* 10 */
+       IPSET_ATTR_MARKMASK,    /* 11 */
        /* Reserve empty slots */
        IPSET_ATTR_CADT_MAX = 16,
        /* Create-only specific attributes */
@@ -144,6 +146,7 @@ enum ipset_errno {
        IPSET_ERR_IPADDR_IPV6,
        IPSET_ERR_COUNTER,
        IPSET_ERR_COMMENT,
+       IPSET_ERR_INVALID_MARKMASK,
 
        /* Type specific error codes */
        IPSET_ERR_TYPE_SPECIFIC = 4352,
@@ -182,9 +185,18 @@ enum ipset_cadt_flags {
        IPSET_FLAG_WITH_COUNTERS = (1 << IPSET_FLAG_BIT_WITH_COUNTERS),
        IPSET_FLAG_BIT_WITH_COMMENT = 4,
        IPSET_FLAG_WITH_COMMENT = (1 << IPSET_FLAG_BIT_WITH_COMMENT),
+       IPSET_FLAG_BIT_WITH_FORCEADD = 5,
+       IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD),
        IPSET_FLAG_CADT_MAX     = 15,
 };
 
+/* The flag bits which correspond to the non-extension create flags */
+enum ipset_create_flags {
+       IPSET_CREATE_FLAG_BIT_FORCEADD = 0,
+       IPSET_CREATE_FLAG_FORCEADD = (1 << IPSET_CREATE_FLAG_BIT_FORCEADD),
+       IPSET_CREATE_FLAG_BIT_MAX = 7,
+};
+
 /* Commands with settype-specific attributes */
 enum ipset_adt {
        IPSET_ADD,
index 83c985a6170bd52938764732f45a70d09ae2a28e..c88ccbfda5f1b111a5fa43e1d1803bcccf95b521 100644 (file)
@@ -1,7 +1,8 @@
 #ifndef _LINUX_NF_TABLES_H
 #define _LINUX_NF_TABLES_H
 
-#define NFT_CHAIN_MAXNAMELEN 32
+#define NFT_CHAIN_MAXNAMELEN   32
+#define NFT_USERDATA_MAXLEN    256
 
 enum nft_registers {
        NFT_REG_VERDICT,
@@ -156,6 +157,7 @@ enum nft_chain_attributes {
  * @NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes)
  * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes)
  * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64)
+ * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN)
  */
 enum nft_rule_attributes {
        NFTA_RULE_UNSPEC,
@@ -165,6 +167,7 @@ enum nft_rule_attributes {
        NFTA_RULE_EXPRESSIONS,
        NFTA_RULE_COMPAT,
        NFTA_RULE_POSITION,
+       NFTA_RULE_USERDATA,
        __NFTA_RULE_MAX
 };
 #define NFTA_RULE_MAX          (__NFTA_RULE_MAX - 1)
@@ -601,6 +604,7 @@ enum nft_ct_keys {
        NFT_CT_PROTOCOL,
        NFT_CT_PROTO_SRC,
        NFT_CT_PROTO_DST,
+       NFT_CT_LABELS,
 };
 
 /**
index 6550c679584f46b39f1a2f22733460ba294b66f7..25e5dd916ba491feba3f1d570e8712283a95645e 100644 (file)
@@ -299,7 +299,7 @@ enum xfrm_attr_type_t {
        XFRMA_REPLAY_ESN_VAL,   /* struct xfrm_replay_esn */
        XFRMA_SA_EXTRA_FLAGS,   /* __u32 */
        XFRMA_PROTO,            /* __u8 */
-       XFRMA_FILTER,           /* struct xfrm_filter */
+       XFRMA_ADDRESS_FILTER,   /* struct xfrm_address_filter */
        __XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
@@ -476,7 +476,7 @@ struct xfrm_user_mapping {
        __be16                          new_sport;
 };
 
-struct xfrm_filter {
+struct xfrm_address_filter {
        xfrm_address_t                  saddr;
        xfrm_address_t                  daddr;
        __u16                           family;
index 587f9fb85d73b40ea3ec6401d591394ff6973d2c..55f8e64c03a2cb8aa5364addc2ac7908f754f20a 100644 (file)
@@ -3231,10 +3231,6 @@ static int netif_rx_internal(struct sk_buff *skb)
 {
        int ret;
 
-       /* if netpoll wants it, pretend we never saw it */
-       if (netpoll_rx(skb))
-               return NET_RX_DROP;
-
        net_timestamp_check(netdev_tstamp_prequeue, skb);
 
        trace_netif_rx(skb);
@@ -3520,10 +3516,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 
        trace_netif_receive_skb(skb);
 
-       /* if we've gotten here through NAPI, check netpoll */
-       if (netpoll_receive_skb(skb))
-               goto out;
-
        orig_dev = skb->dev;
 
        skb_reset_network_header(skb);
@@ -3650,7 +3642,6 @@ drop:
 
 unlock:
        rcu_read_unlock();
-out:
        return ret;
 }
 
@@ -3875,7 +3866,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
        int same_flow;
        enum gro_result ret;
 
-       if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
+       if (!(skb->dev->features & NETIF_F_GRO))
                goto normal;
 
        if (skb_is_gso(skb) || skb_has_frag_list(skb))
index a664f7829a6d16db4579789ca5b8c40654cc78d7..7291dde934690e1acbb62d89d86897630ed9d0b2 100644 (file)
 
 static struct sk_buff_head skb_pool;
 
-static atomic_t trapped;
-
 DEFINE_STATIC_SRCU(netpoll_srcu);
 
 #define USEC_PER_POLL  50
-#define NETPOLL_RX_ENABLED  1
-#define NETPOLL_RX_DROP     2
 
 #define MAX_SKB_SIZE                                                   \
        (sizeof(struct ethhdr) +                                        \
@@ -61,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu);
         MAX_UDP_CHUNK)
 
 static void zap_completion_queue(void);
-static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo);
 static void netpoll_async_cleanup(struct work_struct *work);
 
 static unsigned int carrier_timeout = 4;
@@ -109,25 +104,6 @@ static void queue_process(struct work_struct *work)
        }
 }
 
-static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
-                           unsigned short ulen, __be32 saddr, __be32 daddr)
-{
-       __wsum psum;
-
-       if (uh->check == 0 || skb_csum_unnecessary(skb))
-               return 0;
-
-       psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
-
-       if (skb->ip_summed == CHECKSUM_COMPLETE &&
-           !csum_fold(csum_add(psum, skb->csum)))
-               return 0;
-
-       skb->csum = psum;
-
-       return __skb_checksum_complete(skb);
-}
-
 /*
  * Check whether delayed processing was scheduled for our NIC. If so,
  * we attempt to grab the poll lock and use ->poll() to pump the card.
@@ -138,14 +114,8 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
  * trylock here and interrupts are already disabled in the softirq
  * case. Further, we test the poll_owner to avoid recursion on UP
  * systems where the lock doesn't exist.
- *
- * In cases where there is bi-directional communications, reading only
- * one message at a time can lead to packets being dropped by the
- * network adapter, forcing superfluous retries and possibly timeouts.
- * Thus, we set our budget to greater than 1.
  */
-static int poll_one_napi(struct netpoll_info *npinfo,
-                        struct napi_struct *napi, int budget)
+static int poll_one_napi(struct napi_struct *napi, int budget)
 {
        int work;
 
@@ -156,52 +126,35 @@ static int poll_one_napi(struct netpoll_info *npinfo,
        if (!test_bit(NAPI_STATE_SCHED, &napi->state))
                return budget;
 
-       npinfo->rx_flags |= NETPOLL_RX_DROP;
-       atomic_inc(&trapped);
        set_bit(NAPI_STATE_NPSVC, &napi->state);
 
        work = napi->poll(napi, budget);
+       WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
        trace_napi_poll(napi);
 
        clear_bit(NAPI_STATE_NPSVC, &napi->state);
-       atomic_dec(&trapped);
-       npinfo->rx_flags &= ~NETPOLL_RX_DROP;
 
        return budget - work;
 }
 
-static void poll_napi(struct net_device *dev)
+static void poll_napi(struct net_device *dev, int budget)
 {
        struct napi_struct *napi;
-       int budget = 16;
 
        list_for_each_entry(napi, &dev->napi_list, dev_list) {
                if (napi->poll_owner != smp_processor_id() &&
                    spin_trylock(&napi->poll_lock)) {
-                       budget = poll_one_napi(rcu_dereference_bh(dev->npinfo),
-                                              napi, budget);
+                       budget = poll_one_napi(napi, budget);
                        spin_unlock(&napi->poll_lock);
-
-                       if (!budget)
-                               break;
                }
        }
 }
 
-static void service_neigh_queue(struct netpoll_info *npi)
-{
-       if (npi) {
-               struct sk_buff *skb;
-
-               while ((skb = skb_dequeue(&npi->neigh_tx)))
-                       netpoll_neigh_reply(skb, npi);
-       }
-}
-
 static void netpoll_poll_dev(struct net_device *dev)
 {
        const struct net_device_ops *ops;
        struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
+       int budget = 0;
 
        /* Don't do any rx activity if the dev_lock mutex is held
         * the dev_open/close paths use this to block netpoll activity
@@ -224,27 +177,10 @@ static void netpoll_poll_dev(struct net_device *dev)
        /* Process pending work on NIC */
        ops->ndo_poll_controller(dev);
 
-       poll_napi(dev);
+       poll_napi(dev, budget);
 
        up(&ni->dev_lock);
 
-       if (dev->flags & IFF_SLAVE) {
-               if (ni) {
-                       struct net_device *bond_dev;
-                       struct sk_buff *skb;
-                       struct netpoll_info *bond_ni;
-
-                       bond_dev = netdev_master_upper_dev_get_rcu(dev);
-                       bond_ni = rcu_dereference_bh(bond_dev->npinfo);
-                       while ((skb = skb_dequeue(&ni->neigh_tx))) {
-                               skb->dev = bond_dev;
-                               skb_queue_tail(&bond_ni->neigh_tx, skb);
-                       }
-               }
-       }
-
-       service_neigh_queue(ni);
-
        zap_completion_queue();
 }
 
@@ -529,384 +465,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 }
 EXPORT_SYMBOL(netpoll_send_udp);
 
-static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo)
-{
-       int size, type = ARPOP_REPLY;
-       __be32 sip, tip;
-       unsigned char *sha;
-       struct sk_buff *send_skb;
-       struct netpoll *np, *tmp;
-       unsigned long flags;
-       int hlen, tlen;
-       int hits = 0, proto;
-
-       if (list_empty(&npinfo->rx_np))
-               return;
-
-       /* Before checking the packet, we do some early
-          inspection whether this is interesting at all */
-       spin_lock_irqsave(&npinfo->rx_lock, flags);
-       list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
-               if (np->dev == skb->dev)
-                       hits++;
-       }
-       spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-
-       /* No netpoll struct is using this dev */
-       if (!hits)
-               return;
-
-       proto = ntohs(eth_hdr(skb)->h_proto);
-       if (proto == ETH_P_ARP) {
-               struct arphdr *arp;
-               unsigned char *arp_ptr;
-               /* No arp on this interface */
-               if (skb->dev->flags & IFF_NOARP)
-                       return;
-
-               if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
-                       return;
-
-               skb_reset_network_header(skb);
-               skb_reset_transport_header(skb);
-               arp = arp_hdr(skb);
-
-               if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
-                    arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
-                   arp->ar_pro != htons(ETH_P_IP) ||
-                   arp->ar_op != htons(ARPOP_REQUEST))
-                       return;
-
-               arp_ptr = (unsigned char *)(arp+1);
-               /* save the location of the src hw addr */
-               sha = arp_ptr;
-               arp_ptr += skb->dev->addr_len;
-               memcpy(&sip, arp_ptr, 4);
-               arp_ptr += 4;
-               /* If we actually cared about dst hw addr,
-                  it would get copied here */
-               arp_ptr += skb->dev->addr_len;
-               memcpy(&tip, arp_ptr, 4);
-
-               /* Should we ignore arp? */
-               if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
-                       return;
-
-               size = arp_hdr_len(skb->dev);
-
-               spin_lock_irqsave(&npinfo->rx_lock, flags);
-               list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
-                       if (tip != np->local_ip.ip)
-                               continue;
-
-                       hlen = LL_RESERVED_SPACE(np->dev);
-                       tlen = np->dev->needed_tailroom;
-                       send_skb = find_skb(np, size + hlen + tlen, hlen);
-                       if (!send_skb)
-                               continue;
-
-                       skb_reset_network_header(send_skb);
-                       arp = (struct arphdr *) skb_put(send_skb, size);
-                       send_skb->dev = skb->dev;
-                       send_skb->protocol = htons(ETH_P_ARP);
-
-                       /* Fill the device header for the ARP frame */
-                       if (dev_hard_header(send_skb, skb->dev, ETH_P_ARP,
-                                           sha, np->dev->dev_addr,
-                                           send_skb->len) < 0) {
-                               kfree_skb(send_skb);
-                               continue;
-                       }
-
-                       /*
-                        * Fill out the arp protocol part.
-                        *
-                        * we only support ethernet device type,
-                        * which (according to RFC 1390) should
-                        * always equal 1 (Ethernet).
-                        */
-
-                       arp->ar_hrd = htons(np->dev->type);
-                       arp->ar_pro = htons(ETH_P_IP);
-                       arp->ar_hln = np->dev->addr_len;
-                       arp->ar_pln = 4;
-                       arp->ar_op = htons(type);
-
-                       arp_ptr = (unsigned char *)(arp + 1);
-                       memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
-                       arp_ptr += np->dev->addr_len;
-                       memcpy(arp_ptr, &tip, 4);
-                       arp_ptr += 4;
-                       memcpy(arp_ptr, sha, np->dev->addr_len);
-                       arp_ptr += np->dev->addr_len;
-                       memcpy(arp_ptr, &sip, 4);
-
-                       netpoll_send_skb(np, send_skb);
-
-                       /* If there are several rx_skb_hooks for the same
-                        * address we're fine by sending a single reply
-                        */
-                       break;
-               }
-               spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-       } else if( proto == ETH_P_IPV6) {
-#if IS_ENABLED(CONFIG_IPV6)
-               struct nd_msg *msg;
-               u8 *lladdr = NULL;
-               struct ipv6hdr *hdr;
-               struct icmp6hdr *icmp6h;
-               const struct in6_addr *saddr;
-               const struct in6_addr *daddr;
-               struct inet6_dev *in6_dev = NULL;
-               struct in6_addr *target;
-
-               in6_dev = in6_dev_get(skb->dev);
-               if (!in6_dev || !in6_dev->cnf.accept_ra)
-                       return;
-
-               if (!pskb_may_pull(skb, skb->len))
-                       return;
-
-               msg = (struct nd_msg *)skb_transport_header(skb);
-
-               __skb_push(skb, skb->data - skb_transport_header(skb));
-
-               if (ipv6_hdr(skb)->hop_limit != 255)
-                       return;
-               if (msg->icmph.icmp6_code != 0)
-                       return;
-               if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
-                       return;
-
-               saddr = &ipv6_hdr(skb)->saddr;
-               daddr = &ipv6_hdr(skb)->daddr;
-
-               size = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
-
-               spin_lock_irqsave(&npinfo->rx_lock, flags);
-               list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
-                       if (!ipv6_addr_equal(daddr, &np->local_ip.in6))
-                               continue;
-
-                       hlen = LL_RESERVED_SPACE(np->dev);
-                       tlen = np->dev->needed_tailroom;
-                       send_skb = find_skb(np, size + hlen + tlen, hlen);
-                       if (!send_skb)
-                               continue;
-
-                       send_skb->protocol = htons(ETH_P_IPV6);
-                       send_skb->dev = skb->dev;
-
-                       skb_reset_network_header(send_skb);
-                       hdr = (struct ipv6hdr *) skb_put(send_skb, sizeof(struct ipv6hdr));
-                       *(__be32*)hdr = htonl(0x60000000);
-                       hdr->payload_len = htons(size);
-                       hdr->nexthdr = IPPROTO_ICMPV6;
-                       hdr->hop_limit = 255;
-                       hdr->saddr = *saddr;
-                       hdr->daddr = *daddr;
-
-                       icmp6h = (struct icmp6hdr *) skb_put(send_skb, sizeof(struct icmp6hdr));
-                       icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
-                       icmp6h->icmp6_router = 0;
-                       icmp6h->icmp6_solicited = 1;
-
-                       target = (struct in6_addr *) skb_put(send_skb, sizeof(struct in6_addr));
-                       *target = msg->target;
-                       icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size,
-                                                             IPPROTO_ICMPV6,
-                                                             csum_partial(icmp6h,
-                                                                          size, 0));
-
-                       if (dev_hard_header(send_skb, skb->dev, ETH_P_IPV6,
-                                           lladdr, np->dev->dev_addr,
-                                           send_skb->len) < 0) {
-                               kfree_skb(send_skb);
-                               continue;
-                       }
-
-                       netpoll_send_skb(np, send_skb);
-
-                       /* If there are several rx_skb_hooks for the same
-                        * address, we're fine by sending a single reply
-                        */
-                       break;
-               }
-               spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-#endif
-       }
-}
-
-static bool pkt_is_ns(struct sk_buff *skb)
-{
-       struct nd_msg *msg;
-       struct ipv6hdr *hdr;
-
-       if (skb->protocol != htons(ETH_P_ARP))
-               return false;
-       if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)))
-               return false;
-
-       msg = (struct nd_msg *)skb_transport_header(skb);
-       __skb_push(skb, skb->data - skb_transport_header(skb));
-       hdr = ipv6_hdr(skb);
-
-       if (hdr->nexthdr != IPPROTO_ICMPV6)
-               return false;
-       if (hdr->hop_limit != 255)
-               return false;
-       if (msg->icmph.icmp6_code != 0)
-               return false;
-       if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
-               return false;
-
-       return true;
-}
-
-int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
-{
-       int proto, len, ulen, data_len;
-       int hits = 0, offset;
-       const struct iphdr *iph;
-       struct udphdr *uh;
-       struct netpoll *np, *tmp;
-       uint16_t source;
-
-       if (list_empty(&npinfo->rx_np))
-               goto out;
-
-       if (skb->dev->type != ARPHRD_ETHER)
-               goto out;
-
-       /* check if netpoll clients need ARP */
-       if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) {
-               skb_queue_tail(&npinfo->neigh_tx, skb);
-               return 1;
-       } else if (pkt_is_ns(skb) && atomic_read(&trapped)) {
-               skb_queue_tail(&npinfo->neigh_tx, skb);
-               return 1;
-       }
-
-       if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
-               skb = vlan_untag(skb);
-               if (unlikely(!skb))
-                       goto out;
-       }
-
-       proto = ntohs(eth_hdr(skb)->h_proto);
-       if (proto != ETH_P_IP && proto != ETH_P_IPV6)
-               goto out;
-       if (skb->pkt_type == PACKET_OTHERHOST)
-               goto out;
-       if (skb_shared(skb))
-               goto out;
-
-       if (proto == ETH_P_IP) {
-               if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-                       goto out;
-               iph = (struct iphdr *)skb->data;
-               if (iph->ihl < 5 || iph->version != 4)
-                       goto out;
-               if (!pskb_may_pull(skb, iph->ihl*4))
-                       goto out;
-               iph = (struct iphdr *)skb->data;
-               if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
-                       goto out;
-
-               len = ntohs(iph->tot_len);
-               if (skb->len < len || len < iph->ihl*4)
-                       goto out;
-
-               /*
-                * Our transport medium may have padded the buffer out.
-                * Now We trim to the true length of the frame.
-                */
-               if (pskb_trim_rcsum(skb, len))
-                       goto out;
-
-               iph = (struct iphdr *)skb->data;
-               if (iph->protocol != IPPROTO_UDP)
-                       goto out;
-
-               len -= iph->ihl*4;
-               uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
-               offset = (unsigned char *)(uh + 1) - skb->data;
-               ulen = ntohs(uh->len);
-               data_len = skb->len - offset;
-               source = ntohs(uh->source);
-
-               if (ulen != len)
-                       goto out;
-               if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
-                       goto out;
-               list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
-                       if (np->local_ip.ip && np->local_ip.ip != iph->daddr)
-                               continue;
-                       if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr)
-                               continue;
-                       if (np->local_port && np->local_port != ntohs(uh->dest))
-                               continue;
-
-                       np->rx_skb_hook(np, source, skb, offset, data_len);
-                       hits++;
-               }
-       } else {
-#if IS_ENABLED(CONFIG_IPV6)
-               const struct ipv6hdr *ip6h;
-
-               if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-                       goto out;
-               ip6h = (struct ipv6hdr *)skb->data;
-               if (ip6h->version != 6)
-                       goto out;
-               len = ntohs(ip6h->payload_len);
-               if (!len)
-                       goto out;
-               if (len + sizeof(struct ipv6hdr) > skb->len)
-                       goto out;
-               if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr)))
-                       goto out;
-               ip6h = ipv6_hdr(skb);
-               if (!pskb_may_pull(skb, sizeof(struct udphdr)))
-                       goto out;
-               uh = udp_hdr(skb);
-               offset = (unsigned char *)(uh + 1) - skb->data;
-               ulen = ntohs(uh->len);
-               data_len = skb->len - offset;
-               source = ntohs(uh->source);
-               if (ulen != skb->len)
-                       goto out;
-               if (udp6_csum_init(skb, uh, IPPROTO_UDP))
-                       goto out;
-               list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
-                       if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr))
-                               continue;
-                       if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr))
-                               continue;
-                       if (np->local_port && np->local_port != ntohs(uh->dest))
-                               continue;
-
-                       np->rx_skb_hook(np, source, skb, offset, data_len);
-                       hits++;
-               }
-#endif
-       }
-
-       if (!hits)
-               goto out;
-
-       kfree_skb(skb);
-       return 1;
-
-out:
-       if (atomic_read(&trapped)) {
-               kfree_skb(skb);
-               return 1;
-       }
-
-       return 0;
-}
-
 void netpoll_print_options(struct netpoll *np)
 {
        np_info(np, "local port %d\n", np->local_port);
@@ -1030,7 +588,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
 {
        struct netpoll_info *npinfo;
        const struct net_device_ops *ops;
-       unsigned long flags;
        int err;
 
        np->dev = ndev;
@@ -1052,12 +609,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
                        goto out;
                }
 
-               npinfo->rx_flags = 0;
-               INIT_LIST_HEAD(&npinfo->rx_np);
-
-               spin_lock_init(&npinfo->rx_lock);
                sema_init(&npinfo->dev_lock, 1);
-               skb_queue_head_init(&npinfo->neigh_tx);
                skb_queue_head_init(&npinfo->txq);
                INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
 
@@ -1076,13 +628,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
 
        npinfo->netpoll = np;
 
-       if (np->rx_skb_hook) {
-               spin_lock_irqsave(&npinfo->rx_lock, flags);
-               npinfo->rx_flags |= NETPOLL_RX_ENABLED;
-               list_add_tail(&np->rx, &npinfo->rx_np);
-               spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-       }
-
        /* last thing to do is link it to the net device structure */
        rcu_assign_pointer(ndev->npinfo, npinfo);
 
@@ -1231,7 +776,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
        struct netpoll_info *npinfo =
                        container_of(rcu_head, struct netpoll_info, rcu);
 
-       skb_queue_purge(&npinfo->neigh_tx);
        skb_queue_purge(&npinfo->txq);
 
        /* we can't call cancel_delayed_work_sync here, as we are in softirq */
@@ -1247,7 +791,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
 void __netpoll_cleanup(struct netpoll *np)
 {
        struct netpoll_info *npinfo;
-       unsigned long flags;
 
        /* rtnl_dereference would be preferable here but
         * rcu_cleanup_netpoll path can put us in here safely without
@@ -1257,14 +800,6 @@ void __netpoll_cleanup(struct netpoll *np)
        if (!npinfo)
                return;
 
-       if (!list_empty(&npinfo->rx_np)) {
-               spin_lock_irqsave(&npinfo->rx_lock, flags);
-               list_del(&np->rx);
-               if (list_empty(&npinfo->rx_np))
-                       npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
-               spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-       }
-
        synchronize_srcu(&netpoll_srcu);
 
        if (atomic_dec_and_test(&npinfo->refcnt)) {
@@ -1308,18 +843,3 @@ out:
        rtnl_unlock();
 }
 EXPORT_SYMBOL(netpoll_cleanup);
-
-int netpoll_trap(void)
-{
-       return atomic_read(&trapped);
-}
-EXPORT_SYMBOL(netpoll_trap);
-
-void netpoll_set_trap(int trap)
-{
-       if (trap)
-               atomic_inc(&trapped);
-       else
-               atomic_dec(&trapped);
-}
-EXPORT_SYMBOL(netpoll_set_trap);
index c3e0adea9c277585f01dff2f1740787bb3d19a66..7ebd6e37875cc95b08d294ff64306925d05e550e 100644 (file)
@@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
                skb_dst_set(skb, NULL);
                dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
                if (IS_ERR(dst))
-                       return PTR_ERR(dst);;
+                       return PTR_ERR(dst);
                skb_dst_set(skb, dst);
        }
 #endif
index e1a63930a96789b7df67a8b9914cd3bb69fb1cd9..6156f68a1e90b53f7504a1e6f729b60c29d52b3a 100644 (file)
@@ -325,6 +325,7 @@ void __init xfrm4_init(void)
 
        xfrm4_state_init();
        xfrm4_policy_init();
+       xfrm4_protocol_init();
 #ifdef CONFIG_SYSCTL
        register_pernet_subsys(&xfrm4_net_ops);
 #endif
index cdc09efca4420d3b02b15e6b7bd535a3b2dff4cf..7f7b243e8139defccf14165971a92a7261c29b71 100644 (file)
@@ -179,6 +179,12 @@ static const struct net_protocol ipcomp4_protocol = {
        .netns_ok       =       1,
 };
 
+static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+       .family         =       AF_INET,
+       .owner          =       THIS_MODULE,
+       .callback       =       xfrm4_rcv_cb,
+};
+
 static inline const struct net_protocol *netproto(unsigned char protocol)
 {
        switch (protocol) {
@@ -199,7 +205,6 @@ int xfrm4_protocol_register(struct xfrm4_protocol *handler,
        struct xfrm4_protocol __rcu **pprev;
        struct xfrm4_protocol *t;
        bool add_netproto = false;
-
        int ret = -EEXIST;
        int priority = handler->priority;
 
@@ -273,3 +278,9 @@ int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
        return ret;
 }
 EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+void __init xfrm4_protocol_init(void)
+{
+       xfrm_input_register_afinfo(&xfrm4_input_afinfo);
+}
+EXPORT_SYMBOL(xfrm4_protocol_init);
index 17bb830872db21e07080ed9f6a0ef93cbccee9b7..2fe68364bb20610c39f20a21b4d32498ff5741d1 100644 (file)
@@ -16,7 +16,7 @@ ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
 ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
 
 ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
-       xfrm6_output.o
+       xfrm6_output.o xfrm6_protocol.o
 ipv6-$(CONFIG_NETFILTER) += netfilter.o
 ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
 ipv6-$(CONFIG_PROC_FS) += proc.o
index 6c5f0949e0abf7513e923d626b1e0af52b9024c4..72a4930bdc0a0e0d43e1a6ad8670e6a1df1608f4 100644 (file)
@@ -643,8 +643,8 @@ out:
        return err;
 }
 
-static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-                   u8 type, u8 code, int offset, __be32 info)
+static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+                  u8 type, u8 code, int offset, __be32 info)
 {
        struct net *net = dev_net(skb->dev);
        struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
@@ -653,17 +653,19 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
        if (type != ICMPV6_PKT_TOOBIG &&
            type != NDISC_REDIRECT)
-               return;
+               return 0;
 
        x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6);
        if (!x)
-               return;
+               return 0;
 
        if (type == NDISC_REDIRECT)
                ip6_redirect(skb, net, skb->dev->ifindex, 0);
        else
                ip6_update_pmtu(skb, net, info, 0, 0);
        xfrm_state_put(x);
+
+       return 0;
 }
 
 static int ah6_init_state(struct xfrm_state *x)
@@ -748,6 +750,11 @@ static void ah6_destroy(struct xfrm_state *x)
        kfree(ahp);
 }
 
+static int ah6_rcv_cb(struct sk_buff *skb, int err)
+{
+       return 0;
+}
+
 static const struct xfrm_type ah6_type =
 {
        .description    = "AH6",
@@ -761,10 +768,11 @@ static const struct xfrm_type ah6_type =
        .hdr_offset     = xfrm6_find_1stfragopt,
 };
 
-static const struct inet6_protocol ah6_protocol = {
+static struct xfrm6_protocol ah6_protocol = {
        .handler        =       xfrm6_rcv,
+       .cb_handler     =       ah6_rcv_cb,
        .err_handler    =       ah6_err,
-       .flags          =       INET6_PROTO_NOPOLICY,
+       .priority       =       0,
 };
 
 static int __init ah6_init(void)
@@ -774,7 +782,7 @@ static int __init ah6_init(void)
                return -EAGAIN;
        }
 
-       if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) {
+       if (xfrm6_protocol_register(&ah6_protocol, IPPROTO_AH) < 0) {
                pr_info("%s: can't add protocol\n", __func__);
                xfrm_unregister_type(&ah6_type, AF_INET6);
                return -EAGAIN;
@@ -785,7 +793,7 @@ static int __init ah6_init(void)
 
 static void __exit ah6_fini(void)
 {
-       if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0)
+       if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0)
                pr_info("%s: can't remove protocol\n", __func__);
 
        if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0)
index 6eef8a7e35f2c54514e6bbb8871d18ff18ae691e..d15da1377149d3a0fdf846a7a07364e6d1251845 100644 (file)
@@ -421,8 +421,8 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
                 net_adj) & ~(blksize - 1)) + net_adj - 2;
 }
 
-static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-                    u8 type, u8 code, int offset, __be32 info)
+static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+                   u8 type, u8 code, int offset, __be32 info)
 {
        struct net *net = dev_net(skb->dev);
        const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
@@ -431,18 +431,20 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
        if (type != ICMPV6_PKT_TOOBIG &&
            type != NDISC_REDIRECT)
-               return;
+               return 0;
 
        x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
                              esph->spi, IPPROTO_ESP, AF_INET6);
        if (!x)
-               return;
+               return 0;
 
        if (type == NDISC_REDIRECT)
                ip6_redirect(skb, net, skb->dev->ifindex, 0);
        else
                ip6_update_pmtu(skb, net, info, 0, 0);
        xfrm_state_put(x);
+
+       return 0;
 }
 
 static void esp6_destroy(struct xfrm_state *x)
@@ -614,6 +616,11 @@ error:
        return err;
 }
 
+static int esp6_rcv_cb(struct sk_buff *skb, int err)
+{
+       return 0;
+}
+
 static const struct xfrm_type esp6_type =
 {
        .description    = "ESP6",
@@ -628,10 +635,11 @@ static const struct xfrm_type esp6_type =
        .hdr_offset     = xfrm6_find_1stfragopt,
 };
 
-static const struct inet6_protocol esp6_protocol = {
-       .handler        =       xfrm6_rcv,
+static struct xfrm6_protocol esp6_protocol = {
+       .handler        =       xfrm6_rcv,
+       .cb_handler     =       esp6_rcv_cb,
        .err_handler    =       esp6_err,
-       .flags          =       INET6_PROTO_NOPOLICY,
+       .priority       =       0,
 };
 
 static int __init esp6_init(void)
@@ -640,7 +648,7 @@ static int __init esp6_init(void)
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }
-       if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) {
+       if (xfrm6_protocol_register(&esp6_protocol, IPPROTO_ESP) < 0) {
                pr_info("%s: can't add protocol\n", __func__);
                xfrm_unregister_type(&esp6_type, AF_INET6);
                return -EAGAIN;
@@ -651,7 +659,7 @@ static int __init esp6_init(void)
 
 static void __exit esp6_fini(void)
 {
-       if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0)
+       if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0)
                pr_info("%s: can't remove protocol\n", __func__);
        if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0)
                pr_info("%s: can't remove xfrm type\n", __func__);
index 8649143993913a6bddcd8b5e4a92af8bb24bd524..b7c0f827140b402685cc29049cb56646471c2cf2 100644 (file)
@@ -278,7 +278,6 @@ static void vti6_dev_uninit(struct net_device *dev)
                RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
        else
                vti6_tnl_unlink(ip6n, t);
-       ip6_tnl_dst_reset(t);
        dev_put(dev);
 }
 
@@ -288,11 +287,8 @@ static int vti6_rcv(struct sk_buff *skb)
        const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 
        rcu_read_lock();
-
        if ((t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr,
                                 &ipv6h->daddr)) != NULL) {
-               struct pcpu_sw_netstats *tstats;
-
                if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) {
                        rcu_read_unlock();
                        goto discard;
@@ -309,27 +305,58 @@ static int vti6_rcv(struct sk_buff *skb)
                        goto discard;
                }
 
-               tstats = this_cpu_ptr(t->dev->tstats);
-               u64_stats_update_begin(&tstats->syncp);
-               tstats->rx_packets++;
-               tstats->rx_bytes += skb->len;
-               u64_stats_update_end(&tstats->syncp);
-
-               skb->mark = 0;
-               secpath_reset(skb);
-               skb->dev = t->dev;
+               XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
+               skb->mark = be32_to_cpu(t->parms.i_key);
 
                rcu_read_unlock();
-               return 0;
+
+               return xfrm6_rcv(skb);
        }
        rcu_read_unlock();
-       return 1;
-
+       return -EINVAL;
 discard:
        kfree_skb(skb);
        return 0;
 }
 
+static int vti6_rcv_cb(struct sk_buff *skb, int err)
+{
+       unsigned short family;
+       struct net_device *dev;
+       struct pcpu_sw_netstats *tstats;
+       struct xfrm_state *x;
+       struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
+
+       if (!t)
+               return 1;
+
+       dev = t->dev;
+
+       if (err) {
+               dev->stats.rx_errors++;
+               dev->stats.rx_dropped++;
+
+               return 0;
+       }
+
+       x = xfrm_input_state(skb);
+       family = x->inner_mode->afinfo->family;
+
+       if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+               return -EPERM;
+
+       skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev)));
+       skb->dev = dev;
+
+       tstats = this_cpu_ptr(dev->tstats);
+       u64_stats_update_begin(&tstats->syncp);
+       tstats->rx_packets++;
+       tstats->rx_bytes += skb->len;
+       u64_stats_update_end(&tstats->syncp);
+
+       return 0;
+}
+
 /**
  * vti6_addr_conflict - compare packet addresses to tunnel's own
  *   @t: the outgoing tunnel device
@@ -349,44 +376,56 @@ vti6_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr)
        return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
 }
 
+static bool vti6_state_check(const struct xfrm_state *x,
+                            const struct in6_addr *dst,
+                            const struct in6_addr *src)
+{
+       xfrm_address_t *daddr = (xfrm_address_t *)dst;
+       xfrm_address_t *saddr = (xfrm_address_t *)src;
+
+       /* if there is no transform then this tunnel is not functional.
+        * Or if the xfrm is not mode tunnel.
+        */
+       if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+           x->props.family != AF_INET6)
+               return false;
+
+       if (ipv6_addr_any(dst))
+               return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET6);
+
+       if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET6))
+               return false;
+
+       return true;
+}
+
 /**
  * vti6_xmit - send a packet
  *   @skb: the outgoing socket buffer
  *   @dev: the outgoing tunnel device
+ *   @fl: the flow informations for the xfrm_lookup
  **/
-static int vti6_xmit(struct sk_buff *skb, struct net_device *dev)
+static int
+vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 {
-       struct net *net = dev_net(dev);
        struct ip6_tnl *t = netdev_priv(dev);
        struct net_device_stats *stats = &t->dev->stats;
-       struct dst_entry *dst = NULL, *ndst = NULL;
-       struct flowi6 fl6;
-       struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+       struct dst_entry *dst = skb_dst(skb);
        struct net_device *tdev;
        int err = -1;
 
-       if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) ||
-           !ip6_tnl_xmit_ctl(t) || vti6_addr_conflict(t, ipv6h))
-               return err;
-
-       dst = ip6_tnl_dst_check(t);
-       if (!dst) {
-               memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-
-               ndst = ip6_route_output(net, NULL, &fl6);
+       if (!dst)
+               goto tx_err_link_failure;
 
-               if (ndst->error)
-                       goto tx_err_link_failure;
-               ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(&fl6), NULL, 0);
-               if (IS_ERR(ndst)) {
-                       err = PTR_ERR(ndst);
-                       ndst = NULL;
-                       goto tx_err_link_failure;
-               }
-               dst = ndst;
+       dst_hold(dst);
+       dst = xfrm_lookup(t->net, dst, fl, NULL, 0);
+       if (IS_ERR(dst)) {
+               err = PTR_ERR(dst);
+               dst = NULL;
+               goto tx_err_link_failure;
        }
 
-       if (!dst->xfrm || dst->xfrm->props.mode != XFRM_MODE_TUNNEL)
+       if (!vti6_state_check(dst->xfrm, &t->parms.raddr, &t->parms.laddr))
                goto tx_err_link_failure;
 
        tdev = dst->dev;
@@ -398,14 +437,21 @@ static int vti6_xmit(struct sk_buff *skb, struct net_device *dev)
                goto tx_err_dst_release;
        }
 
+       skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
+       skb_dst_set(skb, dst);
+       skb->dev = skb_dst(skb)->dev;
 
-       skb_dst_drop(skb);
-       skb_dst_set_noref(skb, dst);
+       err = dst_output(skb);
+       if (net_xmit_eval(err) == 0) {
+               struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
 
-       ip6tunnel_xmit(skb, dev);
-       if (ndst) {
-               dev->mtu = dst_mtu(ndst);
-               ip6_tnl_dst_store(t, ndst);
+               u64_stats_update_begin(&tstats->syncp);
+               tstats->tx_bytes += skb->len;
+               tstats->tx_packets++;
+               u64_stats_update_end(&tstats->syncp);
+       } else {
+               stats->tx_errors++;
+               stats->tx_aborted_errors++;
        }
 
        return 0;
@@ -413,7 +459,7 @@ tx_err_link_failure:
        stats->tx_carrier_errors++;
        dst_link_failure(skb);
 tx_err_dst_release:
-       dst_release(ndst);
+       dst_release(dst);
        return err;
 }
 
@@ -422,16 +468,33 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        struct ip6_tnl *t = netdev_priv(dev);
        struct net_device_stats *stats = &t->dev->stats;
+       struct ipv6hdr *ipv6h;
+       struct flowi fl;
        int ret;
 
+       memset(&fl, 0, sizeof(fl));
+       skb->mark = be32_to_cpu(t->parms.o_key);
+
        switch (skb->protocol) {
        case htons(ETH_P_IPV6):
-               ret = vti6_xmit(skb, dev);
+               ipv6h = ipv6_hdr(skb);
+
+               if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) ||
+                   !ip6_tnl_xmit_ctl(t) || vti6_addr_conflict(t, ipv6h))
+                       goto tx_err;
+
+               xfrm_decode_session(skb, &fl, AF_INET6);
+               memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+               break;
+       case htons(ETH_P_IP):
+               xfrm_decode_session(skb, &fl, AF_INET);
+               memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                break;
        default:
                goto tx_err;
        }
 
+       ret = vti6_xmit(skb, dev, &fl);
        if (ret < 0)
                goto tx_err;
 
@@ -444,24 +507,66 @@ tx_err:
        return NETDEV_TX_OK;
 }
 
+static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+                   u8 type, u8 code, int offset, __be32 info)
+{
+       __be32 spi;
+       struct xfrm_state *x;
+       struct ip6_tnl *t;
+       struct ip_esp_hdr *esph;
+       struct ip_auth_hdr *ah;
+       struct ip_comp_hdr *ipch;
+       struct net *net = dev_net(skb->dev);
+       const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+       int protocol = iph->nexthdr;
+
+       t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr);
+       if (!t)
+               return -1;
+
+       switch (protocol) {
+       case IPPROTO_ESP:
+               esph = (struct ip_esp_hdr *)(skb->data + offset);
+               spi = esph->spi;
+               break;
+       case IPPROTO_AH:
+               ah = (struct ip_auth_hdr *)(skb->data + offset);
+               spi = ah->spi;
+               break;
+       case IPPROTO_COMP:
+               ipch = (struct ip_comp_hdr *)(skb->data + offset);
+               spi = htonl(ntohs(ipch->cpi));
+               break;
+       default:
+               return 0;
+       }
+
+       if (type != ICMPV6_PKT_TOOBIG &&
+           type != NDISC_REDIRECT)
+               return 0;
+
+       x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+                             spi, protocol, AF_INET6);
+       if (!x)
+               return 0;
+
+       if (type == NDISC_REDIRECT)
+               ip6_redirect(skb, net, skb->dev->ifindex, 0);
+       else
+               ip6_update_pmtu(skb, net, info, 0, 0);
+       xfrm_state_put(x);
+
+       return 0;
+}
+
 static void vti6_link_config(struct ip6_tnl *t)
 {
-       struct dst_entry *dst;
        struct net_device *dev = t->dev;
        struct __ip6_tnl_parm *p = &t->parms;
-       struct flowi6 *fl6 = &t->fl.u.ip6;
 
        memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
        memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
 
-       /* Set up flowi template */
-       fl6->saddr = p->laddr;
-       fl6->daddr = p->raddr;
-       fl6->flowi6_oif = p->link;
-       fl6->flowi6_mark = be32_to_cpu(p->i_key);
-       fl6->flowi6_proto = p->proto;
-       fl6->flowlabel = 0;
-
        p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV |
                      IP6_TNL_F_CAP_PER_PACKET);
        p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr);
@@ -472,28 +577,6 @@ static void vti6_link_config(struct ip6_tnl *t)
                dev->flags &= ~IFF_POINTOPOINT;
 
        dev->iflink = p->link;
-
-       if (p->flags & IP6_TNL_F_CAP_XMIT) {
-
-               dst = ip6_route_output(dev_net(dev), NULL, fl6);
-               if (dst->error)
-                       return;
-
-               dst = xfrm_lookup(dev_net(dev), dst, flowi6_to_flowi(fl6),
-                                 NULL, 0);
-               if (IS_ERR(dst))
-                       return;
-
-               if (dst->dev) {
-                       dev->hard_header_len = dst->dev->hard_header_len;
-
-                       dev->mtu = dst_mtu(dst);
-
-                       if (dev->mtu < IPV6_MIN_MTU)
-                               dev->mtu = IPV6_MIN_MTU;
-               }
-               dst_release(dst);
-       }
 }
 
 /**
@@ -720,7 +803,6 @@ static void vti6_dev_setup(struct net_device *dev)
        t = netdev_priv(dev);
        dev->flags |= IFF_NOARP;
        dev->addr_len = sizeof(struct in6_addr);
-       dev->features |= NETIF_F_NETNS_LOCAL;
        dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
 }
 
@@ -908,11 +990,6 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = {
        .fill_info      = vti6_fill_info,
 };
 
-static struct xfrm_tunnel_notifier vti6_handler __read_mostly = {
-       .handler        = vti6_rcv,
-       .priority       =       1,
-};
-
 static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
 {
        int h;
@@ -984,6 +1061,27 @@ static struct pernet_operations vti6_net_ops = {
        .size = sizeof(struct vti6_net),
 };
 
+static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
+       .handler        =       vti6_rcv,
+       .cb_handler     =       vti6_rcv_cb,
+       .err_handler    =       vti6_err,
+       .priority       =       100,
+};
+
+static struct xfrm6_protocol vti_ah6_protocol __read_mostly = {
+       .handler        =       vti6_rcv,
+       .cb_handler     =       vti6_rcv_cb,
+       .err_handler    =       vti6_err,
+       .priority       =       100,
+};
+
+static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
+       .handler        =       vti6_rcv,
+       .cb_handler     =       vti6_rcv_cb,
+       .err_handler    =       vti6_err,
+       .priority       =       100,
+};
+
 /**
  * vti6_tunnel_init - register protocol and reserve needed resources
  *
@@ -997,11 +1095,33 @@ static int __init vti6_tunnel_init(void)
        if (err < 0)
                goto out_pernet;
 
-       err = xfrm6_mode_tunnel_input_register(&vti6_handler);
+       err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP);
        if (err < 0) {
-               pr_err("%s: can't register vti6\n", __func__);
+               unregister_pernet_device(&vti6_net_ops);
+               pr_err("%s: can't register vti6 protocol\n", __func__);
+
                goto out;
        }
+
+       err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH);
+       if (err < 0) {
+               xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
+               unregister_pernet_device(&vti6_net_ops);
+               pr_err("%s: can't register vti6 protocol\n", __func__);
+
+               goto out;
+       }
+
+       err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP);
+       if (err < 0) {
+               xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
+               xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
+               unregister_pernet_device(&vti6_net_ops);
+               pr_err("%s: can't register vti6 protocol\n", __func__);
+
+               goto out;
+       }
+
        err = rtnl_link_register(&vti6_link_ops);
        if (err < 0)
                goto rtnl_link_failed;
@@ -1009,7 +1129,9 @@ static int __init vti6_tunnel_init(void)
        return 0;
 
 rtnl_link_failed:
-       xfrm6_mode_tunnel_input_deregister(&vti6_handler);
+       xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
+       xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
+       xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
 out:
        unregister_pernet_device(&vti6_net_ops);
 out_pernet:
@@ -1022,8 +1144,12 @@ out_pernet:
 static void __exit vti6_tunnel_cleanup(void)
 {
        rtnl_link_unregister(&vti6_link_ops);
-       if (xfrm6_mode_tunnel_input_deregister(&vti6_handler))
-               pr_info("%s: can't deregister vti6\n", __func__);
+       if (xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP))
+               pr_info("%s: can't deregister protocol\n", __func__);
+       if (xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH))
+               pr_info("%s: can't deregister protocol\n", __func__);
+       if (xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP))
+               pr_info("%s: can't deregister protocol\n", __func__);
 
        unregister_pernet_device(&vti6_net_ops);
 }
index da9becb42e8127283f59c70731dc2890701f769e..d1c793cffcb5f44aba0f922f2ebdd3105d51b49f 100644 (file)
@@ -53,7 +53,7 @@
 #include <linux/icmpv6.h>
 #include <linux/mutex.h>
 
-static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                                u8 type, u8 code, int offset, __be32 info)
 {
        struct net *net = dev_net(skb->dev);
@@ -65,19 +65,21 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
        if (type != ICMPV6_PKT_TOOBIG &&
            type != NDISC_REDIRECT)
-               return;
+               return 0;
 
        spi = htonl(ntohs(ipcomph->cpi));
        x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
                              spi, IPPROTO_COMP, AF_INET6);
        if (!x)
-               return;
+               return 0;
 
        if (type == NDISC_REDIRECT)
                ip6_redirect(skb, net, skb->dev->ifindex, 0);
        else
                ip6_update_pmtu(skb, net, info, 0, 0);
        xfrm_state_put(x);
+
+       return 0;
 }
 
 static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
@@ -174,6 +176,11 @@ out:
        return err;
 }
 
+static int ipcomp6_rcv_cb(struct sk_buff *skb, int err)
+{
+       return 0;
+}
+
 static const struct xfrm_type ipcomp6_type =
 {
        .description    = "IPCOMP6",
@@ -186,11 +193,12 @@ static const struct xfrm_type ipcomp6_type =
        .hdr_offset     = xfrm6_find_1stfragopt,
 };
 
-static const struct inet6_protocol ipcomp6_protocol =
+static struct xfrm6_protocol ipcomp6_protocol =
 {
        .handler        = xfrm6_rcv,
+       .cb_handler     = ipcomp6_rcv_cb,
        .err_handler    = ipcomp6_err,
-       .flags          = INET6_PROTO_NOPOLICY,
+       .priority       = 0,
 };
 
 static int __init ipcomp6_init(void)
@@ -199,7 +207,7 @@ static int __init ipcomp6_init(void)
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }
-       if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) {
+       if (xfrm6_protocol_register(&ipcomp6_protocol, IPPROTO_COMP) < 0) {
                pr_info("%s: can't add protocol\n", __func__);
                xfrm_unregister_type(&ipcomp6_type, AF_INET6);
                return -EAGAIN;
@@ -209,7 +217,7 @@ static int __init ipcomp6_init(void)
 
 static void __exit ipcomp6_fini(void)
 {
-       if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0)
+       if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0)
                pr_info("%s: can't remove protocol\n", __func__);
        if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0)
                pr_info("%s: can't remove xfrm type\n", __func__);
index cb04f7a16b5e102f2944be051d61511d5645d3f7..901ef6f8addc0cf730909d2656513372cd6cf80f 100644 (file)
 #include <net/ipv6.h>
 #include <net/xfrm.h>
 
-/* Informational hook. The decap is still done here. */
-static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly;
-static DEFINE_MUTEX(xfrm6_mode_tunnel_input_mutex);
-
-int xfrm6_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler)
-{
-       struct xfrm_tunnel_notifier __rcu **pprev;
-       struct xfrm_tunnel_notifier *t;
-       int ret = -EEXIST;
-       int priority = handler->priority;
-
-       mutex_lock(&xfrm6_mode_tunnel_input_mutex);
-
-       for (pprev = &rcv_notify_handlers;
-            (t = rcu_dereference_protected(*pprev,
-            lockdep_is_held(&xfrm6_mode_tunnel_input_mutex))) != NULL;
-            pprev = &t->next) {
-               if (t->priority > priority)
-                       break;
-               if (t->priority == priority)
-                       goto err;
-
-       }
-
-       handler->next = *pprev;
-       rcu_assign_pointer(*pprev, handler);
-
-       ret = 0;
-
-err:
-       mutex_unlock(&xfrm6_mode_tunnel_input_mutex);
-       return ret;
-}
-EXPORT_SYMBOL_GPL(xfrm6_mode_tunnel_input_register);
-
-int xfrm6_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler)
-{
-       struct xfrm_tunnel_notifier __rcu **pprev;
-       struct xfrm_tunnel_notifier *t;
-       int ret = -ENOENT;
-
-       mutex_lock(&xfrm6_mode_tunnel_input_mutex);
-       for (pprev = &rcv_notify_handlers;
-            (t = rcu_dereference_protected(*pprev,
-            lockdep_is_held(&xfrm6_mode_tunnel_input_mutex))) != NULL;
-            pprev = &t->next) {
-               if (t == handler) {
-                       *pprev = handler->next;
-                       ret = 0;
-                       break;
-               }
-       }
-       mutex_unlock(&xfrm6_mode_tunnel_input_mutex);
-       synchronize_net();
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(xfrm6_mode_tunnel_input_deregister);
-
 static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
 {
        const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
@@ -130,7 +71,6 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 
 static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-       struct xfrm_tunnel_notifier *handler;
        int err = -EINVAL;
 
        if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6)
@@ -138,9 +78,6 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
        if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
                goto out;
 
-       for_each_input_rcu(rcv_notify_handlers, handler)
-               handler->handler(skb);
-
        err = skb_unclone(skb, GFP_ATOMIC);
        if (err)
                goto out;
index 5f8e128c512d664080251bf6958c89021a1110c1..2a0bbda2c76a99dfa687313d230595c251103b45 100644 (file)
@@ -389,11 +389,17 @@ int __init xfrm6_init(void)
        if (ret)
                goto out_policy;
 
+       ret = xfrm6_protocol_init();
+       if (ret)
+               goto out_state;
+
 #ifdef CONFIG_SYSCTL
        register_pernet_subsys(&xfrm6_net_ops);
 #endif
 out:
        return ret;
+out_state:
+       xfrm6_state_fini();
 out_policy:
        xfrm6_policy_fini();
        goto out;
@@ -404,6 +410,7 @@ void xfrm6_fini(void)
 #ifdef CONFIG_SYSCTL
        unregister_pernet_subsys(&xfrm6_net_ops);
 #endif
+       xfrm6_protocol_fini();
        xfrm6_policy_fini();
        xfrm6_state_fini();
        dst_entries_destroy(&xfrm6_dst_ops);
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
new file mode 100644 (file)
index 0000000..6ab989c
--- /dev/null
@@ -0,0 +1,270 @@
+/* xfrm6_protocol.c - Generic xfrm protocol multiplexer for ipv6.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/xfrm4_protocol.c
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm6_protocol __rcu *esp6_handlers __read_mostly;
+static struct xfrm6_protocol __rcu *ah6_handlers __read_mostly;
+static struct xfrm6_protocol __rcu *ipcomp6_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm6_protocol_mutex);
+
+static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol)
+{
+       switch (protocol) {
+       case IPPROTO_ESP:
+               return &esp6_handlers;
+       case IPPROTO_AH:
+               return &ah6_handlers;
+       case IPPROTO_COMP:
+               return &ipcomp6_handlers;
+       }
+
+       return NULL;
+}
+
+#define for_each_protocol_rcu(head, handler)           \
+       for (handler = rcu_dereference(head);           \
+            handler != NULL;                           \
+            handler = rcu_dereference(handler->next))  \
+
+int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+       int ret;
+       struct xfrm6_protocol *handler;
+
+       for_each_protocol_rcu(*proto_handlers(protocol), handler)
+               if ((ret = handler->cb_handler(skb, err)) <= 0)
+                       return ret;
+
+       return 0;
+}
+EXPORT_SYMBOL(xfrm6_rcv_cb);
+
+static int xfrm6_esp_rcv(struct sk_buff *skb)
+{
+       int ret;
+       struct xfrm6_protocol *handler;
+
+       XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+
+       for_each_protocol_rcu(esp6_handlers, handler)
+               if ((ret = handler->handler(skb)) != -EINVAL)
+                       return ret;
+
+       icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+       kfree_skb(skb);
+       return 0;
+}
+
+static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+                         u8 type, u8 code, int offset, __be32 info)
+{
+       struct xfrm6_protocol *handler;
+
+       for_each_protocol_rcu(esp6_handlers, handler)
+               if (!handler->err_handler(skb, opt, type, code, offset, info))
+                       break;
+}
+
+static int xfrm6_ah_rcv(struct sk_buff *skb)
+{
+       int ret;
+       struct xfrm6_protocol *handler;
+
+       XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+
+       for_each_protocol_rcu(ah6_handlers, handler)
+               if ((ret = handler->handler(skb)) != -EINVAL)
+                       return ret;
+
+       icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+       kfree_skb(skb);
+       return 0;
+}
+
+static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+                        u8 type, u8 code, int offset, __be32 info)
+{
+       struct xfrm6_protocol *handler;
+
+       for_each_protocol_rcu(ah6_handlers, handler)
+               if (!handler->err_handler(skb, opt, type, code, offset, info))
+                       break;
+}
+
+static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
+{
+       int ret;
+       struct xfrm6_protocol *handler;
+
+       XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+
+       for_each_protocol_rcu(ipcomp6_handlers, handler)
+               if ((ret = handler->handler(skb)) != -EINVAL)
+                       return ret;
+
+       icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+       kfree_skb(skb);
+       return 0;
+}
+
+static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+                            u8 type, u8 code, int offset, __be32 info)
+{
+       struct xfrm6_protocol *handler;
+
+       for_each_protocol_rcu(ipcomp6_handlers, handler)
+               if (!handler->err_handler(skb, opt, type, code, offset, info))
+                       break;
+}
+
+static const struct inet6_protocol esp6_protocol = {
+       .handler        =       xfrm6_esp_rcv,
+       .err_handler    =       xfrm6_esp_err,
+       .flags          =       INET6_PROTO_NOPOLICY,
+};
+
+static const struct inet6_protocol ah6_protocol = {
+       .handler        =       xfrm6_ah_rcv,
+       .err_handler    =       xfrm6_ah_err,
+       .flags          =       INET6_PROTO_NOPOLICY,
+};
+
+static const struct inet6_protocol ipcomp6_protocol = {
+       .handler        =       xfrm6_ipcomp_rcv,
+       .err_handler    =       xfrm6_ipcomp_err,
+       .flags          =       INET6_PROTO_NOPOLICY,
+};
+
+static struct xfrm_input_afinfo xfrm6_input_afinfo = {
+       .family         =       AF_INET6,
+       .owner          =       THIS_MODULE,
+       .callback       =       xfrm6_rcv_cb,
+};
+
+static inline const struct inet6_protocol *netproto(unsigned char protocol)
+{
+       switch (protocol) {
+       case IPPROTO_ESP:
+               return &esp6_protocol;
+       case IPPROTO_AH:
+               return &ah6_protocol;
+       case IPPROTO_COMP:
+               return &ipcomp6_protocol;
+       }
+
+       return NULL;
+}
+
+int xfrm6_protocol_register(struct xfrm6_protocol *handler,
+                           unsigned char protocol)
+{
+       struct xfrm6_protocol __rcu **pprev;
+       struct xfrm6_protocol *t;
+       bool add_netproto = false;
+
+       int ret = -EEXIST;
+       int priority = handler->priority;
+
+       mutex_lock(&xfrm6_protocol_mutex);
+
+       if (!rcu_dereference_protected(*proto_handlers(protocol),
+                                      lockdep_is_held(&xfrm6_protocol_mutex)))
+               add_netproto = true;
+
+       for (pprev = proto_handlers(protocol);
+            (t = rcu_dereference_protected(*pprev,
+                       lockdep_is_held(&xfrm6_protocol_mutex))) != NULL;
+            pprev = &t->next) {
+               if (t->priority < priority)
+                       break;
+               if (t->priority == priority)
+                       goto err;
+       }
+
+       handler->next = *pprev;
+       rcu_assign_pointer(*pprev, handler);
+
+       ret = 0;
+
+err:
+       mutex_unlock(&xfrm6_protocol_mutex);
+
+       if (add_netproto) {
+               if (inet6_add_protocol(netproto(protocol), protocol)) {
+                       pr_err("%s: can't add protocol\n", __func__);
+                       ret = -EAGAIN;
+               }
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL(xfrm6_protocol_register);
+
+int xfrm6_protocol_deregister(struct xfrm6_protocol *handler,
+                             unsigned char protocol)
+{
+       struct xfrm6_protocol __rcu **pprev;
+       struct xfrm6_protocol *t;
+       int ret = -ENOENT;
+
+       mutex_lock(&xfrm6_protocol_mutex);
+
+       for (pprev = proto_handlers(protocol);
+            (t = rcu_dereference_protected(*pprev,
+                       lockdep_is_held(&xfrm6_protocol_mutex))) != NULL;
+            pprev = &t->next) {
+               if (t == handler) {
+                       *pprev = handler->next;
+                       ret = 0;
+                       break;
+               }
+       }
+
+       if (!rcu_dereference_protected(*proto_handlers(protocol),
+                                      lockdep_is_held(&xfrm6_protocol_mutex))) {
+               if (inet6_del_protocol(netproto(protocol), protocol) < 0) {
+                       pr_err("%s: can't remove protocol\n", __func__);
+                       ret = -EAGAIN;
+               }
+       }
+
+       mutex_unlock(&xfrm6_protocol_mutex);
+
+       synchronize_net();
+
+       return ret;
+}
+EXPORT_SYMBOL(xfrm6_protocol_deregister);
+
+int __init xfrm6_protocol_init(void)
+{
+       return xfrm_input_register_afinfo(&xfrm6_input_afinfo);
+}
+
+void xfrm6_protocol_fini(void)
+{
+       xfrm_input_unregister_afinfo(&xfrm6_input_afinfo);
+}
index a50d979b592660aacfe711b9c73d93da00c7178a..12651b42aad894c67317cca308bcb89f4c16cd67 100644 (file)
@@ -1799,7 +1799,7 @@ static void pfkey_dump_sa_done(struct pfkey_sock *pfk)
 static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
 {
        u8 proto;
-       struct xfrm_filter *filter = NULL;
+       struct xfrm_address_filter *filter = NULL;
        struct pfkey_sock *pfk = pfkey_sk(sk);
 
        if (pfk->dump.dump != NULL)
index 44cd4f58adf08b914b871d7399942fb58168a623..2f7f5c32c6f90a0eb376d7921aecf167564329ce 100644 (file)
@@ -61,6 +61,15 @@ config IP_SET_HASH_IP
 
          To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_SET_HASH_IPMARK
+       tristate "hash:ip,mark set support"
+       depends on IP_SET
+       help
+         This option adds the hash:ip,mark set type support, by which one
+         can store IPv4/IPv6 address and mark pairs.
+
+         To compile it as a module, choose M here.  If unsure, say N.
+
 config IP_SET_HASH_IPPORT
        tristate "hash:ip,port set support"
        depends on IP_SET
index 44b2d38476faeb75a95c5ac8348330a99aa866fd..231f10196cb906fd4cbbe5f58263c5a3ba97886a 100644 (file)
@@ -14,6 +14,7 @@ obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
 
 # hash types
 obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o
 obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
 obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
 obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o
index de770ec39e5112e4cbf5b4b90629144bad1d957b..117208321f16997af9f24bee1f86519f8f8fa26e 100644 (file)
@@ -54,10 +54,10 @@ MODULE_DESCRIPTION("core IP set support");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 
 /* When the nfnl mutex is held: */
-#define nfnl_dereference(p)            \
+#define ip_set_dereference(p)          \
        rcu_dereference_protected(p, 1)
-#define nfnl_set(inst, id)                     \
-       nfnl_dereference((inst)->ip_set_list)[id]
+#define ip_set(inst, id)               \
+       ip_set_dereference((inst)->ip_set_list)[id]
 
 /*
  * The set types are implemented in modules and registered set types
@@ -368,6 +368,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len)
 
        if (tb[IPSET_ATTR_CADT_FLAGS])
                cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+       if (cadt_flags & IPSET_FLAG_WITH_FORCEADD)
+               set->flags |= IPSET_CREATE_FLAG_FORCEADD;
        for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
                if (!add_extension(id, cadt_flags, tb))
                        continue;
@@ -510,7 +512,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 
        if (opt->dim < set->type->dimension ||
            !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
-               return 0;
+               return -IPSET_ERR_TYPE_MISMATCH;
 
        write_lock_bh(&set->lock);
        ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
@@ -533,7 +535,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 
        if (opt->dim < set->type->dimension ||
            !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
-               return 0;
+               return -IPSET_ERR_TYPE_MISMATCH;
 
        write_lock_bh(&set->lock);
        ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
@@ -640,7 +642,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
                return IPSET_INVALID_ID;
 
        nfnl_lock(NFNL_SUBSYS_IPSET);
-       set = nfnl_set(inst, index);
+       set = ip_set(inst, index);
        if (set)
                __ip_set_get(set);
        else
@@ -666,7 +668,7 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index)
 
        nfnl_lock(NFNL_SUBSYS_IPSET);
        if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
-               set = nfnl_set(inst, index);
+               set = ip_set(inst, index);
                if (set != NULL)
                        __ip_set_put(set);
        }
@@ -734,7 +736,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
 
        *id = IPSET_INVALID_ID;
        for (i = 0; i < inst->ip_set_max; i++) {
-               set = nfnl_set(inst, i);
+               set = ip_set(inst, i);
                if (set != NULL && STREQ(set->name, name)) {
                        *id = i;
                        break;
@@ -760,7 +762,7 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
 
        *index = IPSET_INVALID_ID;
        for (i = 0;  i < inst->ip_set_max; i++) {
-               s = nfnl_set(inst, i);
+               s = ip_set(inst, i);
                if (s == NULL) {
                        if (*index == IPSET_INVALID_ID)
                                *index = i;
@@ -883,7 +885,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
                if (!list)
                        goto cleanup;
                /* nfnl mutex is held, both lists are valid */
-               tmp = nfnl_dereference(inst->ip_set_list);
+               tmp = ip_set_dereference(inst->ip_set_list);
                memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
                rcu_assign_pointer(inst->ip_set_list, list);
                /* Make sure all current packets have passed through */
@@ -900,7 +902,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
         * Finally! Add our shiny new set to the list, and be done.
         */
        pr_debug("create: '%s' created with index %u!\n", set->name, index);
-       nfnl_set(inst, index) = set;
+       ip_set(inst, index) = set;
 
        return ret;
 
@@ -925,10 +927,10 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
 static void
 ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)
 {
-       struct ip_set *set = nfnl_set(inst, index);
+       struct ip_set *set = ip_set(inst, index);
 
        pr_debug("set: %s\n",  set->name);
-       nfnl_set(inst, index) = NULL;
+       ip_set(inst, index) = NULL;
 
        /* Must call it without holding any lock */
        set->variant->destroy(set);
@@ -962,7 +964,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
        read_lock_bh(&ip_set_ref_lock);
        if (!attr[IPSET_ATTR_SETNAME]) {
                for (i = 0; i < inst->ip_set_max; i++) {
-                       s = nfnl_set(inst, i);
+                       s = ip_set(inst, i);
                        if (s != NULL && s->ref) {
                                ret = -IPSET_ERR_BUSY;
                                goto out;
@@ -970,7 +972,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
                }
                read_unlock_bh(&ip_set_ref_lock);
                for (i = 0; i < inst->ip_set_max; i++) {
-                       s = nfnl_set(inst, i);
+                       s = ip_set(inst, i);
                        if (s != NULL)
                                ip_set_destroy_set(inst, i);
                }
@@ -1020,7 +1022,7 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
 
        if (!attr[IPSET_ATTR_SETNAME]) {
                for (i = 0; i < inst->ip_set_max; i++) {
-                       s = nfnl_set(inst, i);
+                       s = ip_set(inst, i);
                        if (s != NULL)
                                ip_set_flush_set(s);
                }
@@ -1074,7 +1076,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 
        name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
        for (i = 0; i < inst->ip_set_max; i++) {
-               s = nfnl_set(inst, i);
+               s = ip_set(inst, i);
                if (s != NULL && STREQ(s->name, name2)) {
                        ret = -IPSET_ERR_EXIST_SETNAME2;
                        goto out;
@@ -1134,8 +1136,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 
        write_lock_bh(&ip_set_ref_lock);
        swap(from->ref, to->ref);
-       nfnl_set(inst, from_id) = to;
-       nfnl_set(inst, to_id) = from;
+       ip_set(inst, from_id) = to;
+       ip_set(inst, to_id) = from;
        write_unlock_bh(&ip_set_ref_lock);
 
        return 0;
@@ -1157,7 +1159,7 @@ ip_set_dump_done(struct netlink_callback *cb)
        struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
        if (cb->args[IPSET_CB_ARG0]) {
                pr_debug("release set %s\n",
-                        nfnl_set(inst, cb->args[IPSET_CB_INDEX])->name);
+                        ip_set(inst, cb->args[IPSET_CB_INDEX])->name);
                __ip_set_put_byindex(inst,
                        (ip_set_id_t) cb->args[IPSET_CB_INDEX]);
        }
@@ -1254,7 +1256,7 @@ dump_last:
                 dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
        for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
                index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
-               set = nfnl_set(inst, index);
+               set = ip_set(inst, index);
                if (set == NULL) {
                        if (dump_type == DUMP_ONE) {
                                ret = -ENOENT;
@@ -1332,7 +1334,7 @@ next_set:
 release_refcount:
        /* If there was an error or set is done, release set */
        if (ret || !cb->args[IPSET_CB_ARG0]) {
-               pr_debug("release set %s\n", nfnl_set(inst, index)->name);
+               pr_debug("release set %s\n", ip_set(inst, index)->name);
                __ip_set_put_byindex(inst, index);
                cb->args[IPSET_CB_ARG0] = 0;
        }
@@ -1887,7 +1889,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
                find_set_and_id(inst, req_get->set.name, &id);
                req_get->set.index = id;
                if (id != IPSET_INVALID_ID)
-                       req_get->family = nfnl_set(inst, id)->family;
+                       req_get->family = ip_set(inst, id)->family;
                nfnl_unlock(NFNL_SUBSYS_IPSET);
                goto copy;
        }
@@ -1901,7 +1903,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
                        goto done;
                }
                nfnl_lock(NFNL_SUBSYS_IPSET);
-               set = nfnl_set(inst, req_get->set.index);
+               set = ip_set(inst, req_get->set.index);
                strncpy(req_get->set.name, set ? set->name : "",
                        IPSET_MAXNAMELEN);
                nfnl_unlock(NFNL_SUBSYS_IPSET);
@@ -1945,7 +1947,6 @@ ip_set_net_init(struct net *net)
                return -ENOMEM;
        inst->is_deleted = 0;
        rcu_assign_pointer(inst->ip_set_list, list);
-       pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL);
        return 0;
 }
 
@@ -1960,7 +1961,7 @@ ip_set_net_exit(struct net *net)
        inst->is_deleted = 1; /* flag for ip_set_nfnl_put */
 
        for (i = 0; i < inst->ip_set_max; i++) {
-               set = nfnl_set(inst, i);
+               set = ip_set(inst, i);
                if (set != NULL)
                        ip_set_destroy_set(inst, i);
        }
@@ -1996,6 +1997,7 @@ ip_set_init(void)
                nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
                return ret;
        }
+       pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
        return 0;
 }
 
index be6932ad3a8626d6c06e99091b32f540b7afd338..61c7fb052802e7c0cb291289ee29c484b4f84179 100644 (file)
@@ -263,6 +263,9 @@ struct htype {
        u32 maxelem;            /* max elements in the hash */
        u32 elements;           /* current element (vs timeout) */
        u32 initval;            /* random jhash init value */
+#ifdef IP_SET_HASH_WITH_MARKMASK
+       u32 markmask;           /* markmask value for mark mask to store */
+#endif
        struct timer_list gc;   /* garbage collection when timeout enabled */
        struct mtype_elem next; /* temporary storage for uadd */
 #ifdef IP_SET_HASH_WITH_MULTI
@@ -453,6 +456,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
               a->timeout == b->timeout &&
 #ifdef IP_SET_HASH_WITH_NETMASK
               x->netmask == y->netmask &&
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+              x->markmask == y->markmask &&
 #endif
               a->extensions == b->extensions;
 }
@@ -627,6 +633,18 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
        bool flag_exist = flags & IPSET_FLAG_EXIST;
        u32 key, multi = 0;
 
+       if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) {
+               rcu_read_lock_bh();
+               t = rcu_dereference_bh(h->table);
+               key = HKEY(value, h->initval, t->htable_bits);
+               n = hbucket(t,key);
+               if (n->pos) {
+                       /* Choosing the first entry in the array to replace */
+                       j = 0;
+                       goto reuse_slot;
+               }
+               rcu_read_unlock_bh();
+       }
        if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem)
                /* FIXME: when set is full, we slow down here */
                mtype_expire(set, h, NLEN(set->family), set->dsize);
@@ -907,6 +925,10 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
        if (h->netmask != HOST_MASK &&
            nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
                goto nla_put_failure;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+       if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
+               goto nla_put_failure;
 #endif
        if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
            nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
@@ -1016,6 +1038,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
                            struct nlattr *tb[], u32 flags)
 {
        u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+#ifdef IP_SET_HASH_WITH_MARKMASK
+       u32 markmask;
+#endif
        u8 hbits;
 #ifdef IP_SET_HASH_WITH_NETMASK
        u8 netmask;
@@ -1026,6 +1051,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 
        if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
                return -IPSET_ERR_INVALID_FAMILY;
+
+#ifdef IP_SET_HASH_WITH_MARKMASK
+       markmask = 0xffffffff;
+#endif
 #ifdef IP_SET_HASH_WITH_NETMASK
        netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
        pr_debug("Create set %s with family %s\n",
@@ -1034,6 +1063,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 
        if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+#ifdef IP_SET_HASH_WITH_MARKMASK
+                    !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) ||
+#endif
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
                return -IPSET_ERR_PROTOCOL;
@@ -1057,6 +1089,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
                        return -IPSET_ERR_INVALID_NETMASK;
        }
 #endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+       if (tb[IPSET_ATTR_MARKMASK]) {
+               markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK]));
+
+               if ((markmask > 4294967295u) || markmask == 0)
+                       return -IPSET_ERR_INVALID_MARKMASK;
+       }
+#endif
 
        hsize = sizeof(*h);
 #ifdef IP_SET_HASH_WITH_NETS
@@ -1070,6 +1110,9 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
        h->maxelem = maxelem;
 #ifdef IP_SET_HASH_WITH_NETMASK
        h->netmask = netmask;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+       h->markmask = markmask;
 #endif
        get_random_bytes(&h->initval, sizeof(h->initval));
        set->timeout = IPSET_NO_TIMEOUT;
index e65fc2423d56dd2b21cee513786eec41ceabefc9..dd40607f878e28e8c5c411a24fb8e36ef7db5d46 100644 (file)
@@ -25,7 +25,8 @@
 
 #define IPSET_TYPE_REV_MIN     0
 /*                             1          Counters support */
-#define IPSET_TYPE_REV_MAX     2       /* Comments support */
+/*                             2          Comments support */
+#define IPSET_TYPE_REV_MAX     3       /* Forceadd support */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
new file mode 100644 (file)
index 0000000..4eff0a2
--- /dev/null
@@ -0,0 +1,321 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,mark type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN     0
+#define IPSET_TYPE_REV_MAX     1       /* Forceadd support */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>");
+IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,mark");
+
+/* Type specific function prefix */
+#define HTYPE          hash_ipmark
+#define IP_SET_HASH_WITH_MARKMASK
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipmark4_elem {
+       __be32 ip;
+       __u32 mark;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1,
+                       const struct hash_ipmark4_elem *ip2,
+                       u32 *multi)
+{
+       return ip1->ip == ip2->ip &&
+              ip1->mark == ip2->mark;
+}
+
+static bool
+hash_ipmark4_data_list(struct sk_buff *skb,
+                      const struct hash_ipmark4_elem *data)
+{
+       if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+           nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
+               goto nla_put_failure;
+       return 0;
+
+nla_put_failure:
+       return 1;
+}
+
+static inline void
+hash_ipmark4_data_next(struct hash_ipmark4_elem *next,
+                      const struct hash_ipmark4_elem *d)
+{
+       next->ip = d->ip;
+}
+
+#define MTYPE           hash_ipmark4
+#define PF              4
+#define HOST_MASK       32
+#define HKEY_DATALEN   sizeof(struct hash_ipmark4_elem)
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb,
+                 const struct xt_action_param *par,
+                 enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+       const struct hash_ipmark *h = set->data;
+       ipset_adtfn adtfn = set->variant->adt[adt];
+       struct hash_ipmark4_elem e = { };
+       struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+       e.mark = skb->mark;
+       e.mark &= h->markmask;
+
+       ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+       return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
+                 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+       const struct hash_ipmark *h = set->data;
+       ipset_adtfn adtfn = set->variant->adt[adt];
+       struct hash_ipmark4_elem e = { };
+       struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+       u32 ip, ip_to = 0;
+       int ret;
+
+       if (unlikely(!tb[IPSET_ATTR_IP] ||
+                    !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+                    !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+                    !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+                    !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
+               return -IPSET_ERR_PROTOCOL;
+
+       if (tb[IPSET_ATTR_LINENO])
+               *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+       ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+             ip_set_get_extensions(set, tb, &ext);
+       if (ret)
+               return ret;
+
+       e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+       e.mark &= h->markmask;
+
+       if (adt == IPSET_TEST ||
+           !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) {
+               ret = adtfn(set, &e, &ext, &ext, flags);
+               return ip_set_eexist(ret, flags) ? 0 : ret;
+       }
+
+       ip_to = ip = ntohl(e.ip);
+       if (tb[IPSET_ATTR_IP_TO]) {
+               ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+               if (ret)
+                       return ret;
+               if (ip > ip_to)
+                       swap(ip, ip_to);
+       } else if (tb[IPSET_ATTR_CIDR]) {
+               u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+               if (!cidr || cidr > 32)
+                       return -IPSET_ERR_INVALID_CIDR;
+               ip_set_mask_from_to(ip, ip_to, cidr);
+       }
+
+       if (retried)
+               ip = ntohl(h->next.ip);
+       for (; !before(ip_to, ip); ip++) {
+               e.ip = htonl(ip);
+               ret = adtfn(set, &e, &ext, &ext, flags);
+
+               if (ret && !ip_set_eexist(ret, flags))
+                       return ret;
+               else
+                       ret = 0;
+       }
+       return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipmark6_elem {
+       union nf_inet_addr ip;
+       __u32 mark;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1,
+                       const struct hash_ipmark6_elem *ip2,
+                       u32 *multi)
+{
+       return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+              ip1->mark == ip2->mark;
+}
+
+static bool
+hash_ipmark6_data_list(struct sk_buff *skb,
+                      const struct hash_ipmark6_elem *data)
+{
+       if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+           nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
+               goto nla_put_failure;
+       return 0;
+
+nla_put_failure:
+       return 1;
+}
+
+static inline void
+hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
+                      const struct hash_ipmark6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE          hash_ipmark6
+#define PF             6
+#define HOST_MASK      128
+#define HKEY_DATALEN   sizeof(struct hash_ipmark6_elem)
+#define        IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+
+static int
+hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
+                 const struct xt_action_param *par,
+                 enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+       const struct hash_ipmark *h = set->data;
+       ipset_adtfn adtfn = set->variant->adt[adt];
+       struct hash_ipmark6_elem e = { };
+       struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+       e.mark = skb->mark;
+       e.mark &= h->markmask;
+
+       ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+       return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
+                 enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+       const struct hash_ipmark *h = set->data;
+       ipset_adtfn adtfn = set->variant->adt[adt];
+       struct hash_ipmark6_elem e = { };
+       struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+       int ret;
+
+       if (unlikely(!tb[IPSET_ATTR_IP] ||
+                    !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+                    !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+                    !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+                    !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+                    tb[IPSET_ATTR_IP_TO] ||
+                    tb[IPSET_ATTR_CIDR]))
+               return -IPSET_ERR_PROTOCOL;
+
+       if (tb[IPSET_ATTR_LINENO])
+               *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+       ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+             ip_set_get_extensions(set, tb, &ext);
+       if (ret)
+               return ret;
+
+       e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+       e.mark &= h->markmask;
+
+       if (adt == IPSET_TEST) {
+               ret = adtfn(set, &e, &ext, &ext, flags);
+               return ip_set_eexist(ret, flags) ? 0 : ret;
+       }
+
+       ret = adtfn(set, &e, &ext, &ext, flags);
+       if (ret && !ip_set_eexist(ret, flags))
+               return ret;
+       else
+               ret = 0;
+
+       return ret;
+}
+
+static struct ip_set_type hash_ipmark_type __read_mostly = {
+       .name           = "hash:ip,mark",
+       .protocol       = IPSET_PROTOCOL,
+       .features       = IPSET_TYPE_IP | IPSET_TYPE_MARK,
+       .dimension      = IPSET_DIM_TWO,
+       .family         = NFPROTO_UNSPEC,
+       .revision_min   = IPSET_TYPE_REV_MIN,
+       .revision_max   = IPSET_TYPE_REV_MAX,
+       .create         = hash_ipmark_create,
+       .create_policy  = {
+               [IPSET_ATTR_MARKMASK]   = { .type = NLA_U32 },
+               [IPSET_ATTR_HASHSIZE]   = { .type = NLA_U32 },
+               [IPSET_ATTR_MAXELEM]    = { .type = NLA_U32 },
+               [IPSET_ATTR_PROBES]     = { .type = NLA_U8 },
+               [IPSET_ATTR_RESIZE]     = { .type = NLA_U8  },
+               [IPSET_ATTR_TIMEOUT]    = { .type = NLA_U32 },
+               [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+       },
+       .adt_policy     = {
+               [IPSET_ATTR_IP]         = { .type = NLA_NESTED },
+               [IPSET_ATTR_IP_TO]      = { .type = NLA_NESTED },
+               [IPSET_ATTR_MARK]       = { .type = NLA_U32 },
+               [IPSET_ATTR_CIDR]       = { .type = NLA_U8 },
+               [IPSET_ATTR_TIMEOUT]    = { .type = NLA_U32 },
+               [IPSET_ATTR_LINENO]     = { .type = NLA_U32 },
+               [IPSET_ATTR_BYTES]      = { .type = NLA_U64 },
+               [IPSET_ATTR_PACKETS]    = { .type = NLA_U64 },
+               [IPSET_ATTR_COMMENT]    = { .type = NLA_NUL_STRING },
+       },
+       .me             = THIS_MODULE,
+};
+
+static int __init
+hash_ipmark_init(void)
+{
+       return ip_set_type_register(&hash_ipmark_type);
+}
+
+static void __exit
+hash_ipmark_fini(void)
+{
+       ip_set_type_unregister(&hash_ipmark_type);
+}
+
+module_init(hash_ipmark_init);
+module_exit(hash_ipmark_fini);
index 525a595dd1fe4bf0efe6db7ca9cc06d995c66430..7597b82a8b033bd37c8a5f64dae02e8aa9137a19 100644 (file)
@@ -27,7 +27,8 @@
 #define IPSET_TYPE_REV_MIN     0
 /*                             1    SCTP and UDPLITE support added */
 /*                             2    Counters support added */
-#define IPSET_TYPE_REV_MAX     3 /* Comments support added */
+/*                             3    Comments support added */
+#define IPSET_TYPE_REV_MAX     4 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
index f5636631466eb3ee98509e8b832e46da4ad9a417..672655ffd573404b2f2f5bf097b7a8d01f0c4c14 100644 (file)
@@ -27,7 +27,8 @@
 #define IPSET_TYPE_REV_MIN     0
 /*                             1    SCTP and UDPLITE support added */
 /*                             2    Counters support added */
-#define IPSET_TYPE_REV_MAX     3 /* Comments support added */
+/*                             3    Comments support added */
+#define IPSET_TYPE_REV_MAX     4 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
index 5d87fe8a41ffa4888a70b6d91897360e30c253bc..7308d84f9277813f16351b692f75218ff1a451b3 100644 (file)
@@ -29,7 +29,8 @@
 /*                             2    Range as input support for IPv4 added */
 /*                             3    nomatch flag support added */
 /*                             4    Counters support added */
-#define IPSET_TYPE_REV_MAX     5 /* Comments support added */
+/*                             5    Comments support added */
+#define IPSET_TYPE_REV_MAX     6 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
index 8295cf4f9fdcfdb3d3da4b72792add9fc83156ad..4c7d495783a3aefa9447d4b37114bd3715928b0a 100644 (file)
@@ -26,7 +26,8 @@
 /*                             1    Range as input support for IPv4 added */
 /*                             2    nomatch flag support added */
 /*                             3    Counters support added */
-#define IPSET_TYPE_REV_MAX     4 /* Comments support added */
+/*                             4    Comments support added */
+#define IPSET_TYPE_REV_MAX     5 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
index b827a0f1f3510bc0fa4e6b0a498738806ac62e81..db2606805b3575b91a87cded299222c571fbf6ba 100644 (file)
@@ -27,7 +27,8 @@
 /*                             1    nomatch flag support added */
 /*                             2    /0 support added */
 /*                             3    Counters support added */
-#define IPSET_TYPE_REV_MAX     4 /* Comments support added */
+/*                             4    Comments support added */
+#define IPSET_TYPE_REV_MAX     5 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
index 6226803fc490ce33c53994c344a1c034b9b0cce9..3e99987e4bf248f6d6085118dba52a4c24ee108a 100644 (file)
@@ -24,7 +24,7 @@
 #include <linux/netfilter/ipset/ip_set_hash.h>
 
 #define IPSET_TYPE_REV_MIN     0
-#define IPSET_TYPE_REV_MAX     0
+#define IPSET_TYPE_REV_MAX     1       /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -112,10 +112,10 @@ hash_netnet4_data_list(struct sk_buff *skb,
            (flags &&
             nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
                goto nla_put_failure;
-       return 0;
+       return false;
 
 nla_put_failure:
-       return 1;
+       return true;
 }
 
 static inline void
@@ -334,10 +334,10 @@ hash_netnet6_data_list(struct sk_buff *skb,
            (flags &&
             nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
                goto nla_put_failure;
-       return 0;
+       return false;
 
 nla_put_failure:
-       return 1;
+       return true;
 }
 
 static inline void
index 7097fb0141bf6e1363ca0b0342451e66c34773b4..1c645fbd09c7d6bcb337b90977ae2536b2ec9ebd 100644 (file)
@@ -28,7 +28,8 @@
 /*                             2    Range as input support for IPv4 added */
 /*                             3    nomatch flag support added */
 /*                             4    Counters support added */
-#define IPSET_TYPE_REV_MAX     5 /* Comments support added */
+/*                             5    Comments support added */
+#define IPSET_TYPE_REV_MAX     6 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
index 703d1192a6a225214f1ffd28c6ab926110942c32..c0d2ba73f8b2394995bba3737a833cc47138d581 100644 (file)
@@ -25,7 +25,8 @@
 #include <linux/netfilter/ipset/ip_set_hash.h>
 
 #define IPSET_TYPE_REV_MIN     0
-#define IPSET_TYPE_REV_MAX     0 /* Comments support added */
+/*                             0    Comments support added */
+#define IPSET_TYPE_REV_MAX     1 /* Forceadd support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
index 4f29fa97044b18523967cdc8b8c6ab272376a2fb..04d15fdc99eec407545c2c598c7a158de0daade9 100644 (file)
@@ -7,8 +7,8 @@
 
 #define E(a, b, c, d) \
        {.ip6 = { \
-               __constant_htonl(a), __constant_htonl(b), \
-               __constant_htonl(c), __constant_htonl(d), \
+               htonl(a), htonl(b), \
+               htonl(c), htonl(d), \
        } }
 
 /*
index d6d75841352a261848e33ecd63a99290eb5e9f9c..c42e83d2751cdc2dd1324a3b9db3e36b615d2fb5 100644 (file)
@@ -3580,7 +3580,7 @@ out:
 }
 
 
-static const struct genl_ops ip_vs_genl_ops[] __read_mostly = {
+static const struct genl_ops ip_vs_genl_ops[] = {
        {
                .cmd    = IPVS_CMD_NEW_SERVICE,
                .flags  = GENL_ADMIN_PERM,
index ca056a331e60b23f1b6e542fd640b2b996878335..547ff33c1efdb0cb92f3f8890640a77bba3a73b5 100644 (file)
@@ -238,7 +238,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)
 
        spin_lock_bh(&svc->sched_lock);
        tbl->dead = 1;
-       for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+       for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
                hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
                        ip_vs_lblc_del(en);
                        atomic_dec(&tbl->entries);
@@ -265,7 +265,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
        unsigned long now = jiffies;
        int i, j;
 
-       for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+       for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
                j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
                spin_lock(&svc->sched_lock);
@@ -321,7 +321,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
        if (goal > tbl->max_size/2)
                goal = tbl->max_size/2;
 
-       for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+       for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
                j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
                spin_lock(&svc->sched_lock);
@@ -340,7 +340,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
        tbl->rover = j;
 
   out:
-       mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+       mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
 }
 
 
@@ -363,7 +363,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
        /*
         *    Initialize the hash buckets
         */
-       for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+       for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
                INIT_HLIST_HEAD(&tbl->bucket[i]);
        }
        tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
@@ -536,8 +536,7 @@ out:
 /*
  *      IPVS LBLC Scheduler structure
  */
-static struct ip_vs_scheduler ip_vs_lblc_scheduler =
-{
+static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
        .name =                 "lblc",
        .refcnt =               ATOMIC_INIT(0),
        .module =               THIS_MODULE,
index 356bef519fe5b781f6225557f64186c0075852a1..6dba48efe01e83bebda511e56410b403a8d4b968 100644 (file)
@@ -60,8 +60,59 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
                                      const struct nlattr *attr) __read_mostly;
 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
 
-DEFINE_SPINLOCK(nf_conntrack_lock);
-EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+EXPORT_SYMBOL_GPL(nf_conntrack_locks);
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
+EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+
+static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
+{
+       h1 %= CONNTRACK_LOCKS;
+       h2 %= CONNTRACK_LOCKS;
+       spin_unlock(&nf_conntrack_locks[h1]);
+       if (h1 != h2)
+               spin_unlock(&nf_conntrack_locks[h2]);
+}
+
+/* return true if we need to recompute hashes (in case hash table was resized) */
+static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
+                                    unsigned int h2, unsigned int sequence)
+{
+       h1 %= CONNTRACK_LOCKS;
+       h2 %= CONNTRACK_LOCKS;
+       if (h1 <= h2) {
+               spin_lock(&nf_conntrack_locks[h1]);
+               if (h1 != h2)
+                       spin_lock_nested(&nf_conntrack_locks[h2],
+                                        SINGLE_DEPTH_NESTING);
+       } else {
+               spin_lock(&nf_conntrack_locks[h2]);
+               spin_lock_nested(&nf_conntrack_locks[h1],
+                                SINGLE_DEPTH_NESTING);
+       }
+       if (read_seqcount_retry(&net->ct.generation, sequence)) {
+               nf_conntrack_double_unlock(h1, h2);
+               return true;
+       }
+       return false;
+}
+
+static void nf_conntrack_all_lock(void)
+{
+       int i;
+
+       for (i = 0; i < CONNTRACK_LOCKS; i++)
+               spin_lock_nested(&nf_conntrack_locks[i], i);
+}
+
+static void nf_conntrack_all_unlock(void)
+{
+       int i;
+
+       for (i = 0; i < CONNTRACK_LOCKS; i++)
+               spin_unlock(&nf_conntrack_locks[i]);
+}
 
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
@@ -192,6 +243,50 @@ clean_from_lists(struct nf_conn *ct)
        nf_ct_remove_expectations(ct);
 }
 
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_dying_list(struct nf_conn *ct)
+{
+       struct ct_pcpu *pcpu;
+
+       /* add this conntrack to the (per cpu) dying list */
+       ct->cpu = smp_processor_id();
+       pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+       spin_lock(&pcpu->lock);
+       hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+                            &pcpu->dying);
+       spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
+{
+       struct ct_pcpu *pcpu;
+
+       /* add this conntrack to the (per cpu) unconfirmed list */
+       ct->cpu = smp_processor_id();
+       pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+       spin_lock(&pcpu->lock);
+       hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+                            &pcpu->unconfirmed);
+       spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
+{
+       struct ct_pcpu *pcpu;
+
+       /* We overload first tuple to link into unconfirmed or dying list.*/
+       pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+       spin_lock(&pcpu->lock);
+       BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
+       hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+       spin_unlock(&pcpu->lock);
+}
+
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
@@ -203,9 +298,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
        NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
        NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
-       /* To make sure we don't get any weird locking issues here:
-        * destroy_conntrack() MUST NOT be called with a write lock
-        * to nf_conntrack_lock!!! -HW */
        rcu_read_lock();
        l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
        if (l4proto && l4proto->destroy)
@@ -213,19 +305,18 @@ destroy_conntrack(struct nf_conntrack *nfct)
 
        rcu_read_unlock();
 
-       spin_lock_bh(&nf_conntrack_lock);
+       local_bh_disable();
        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before connection is in the list, so we need to clean here,
-        * too. */
+        * too.
+        */
        nf_ct_remove_expectations(ct);
 
-       /* We overload first tuple to link into unconfirmed or dying list.*/
-       BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
-       hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+       nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
        NF_CT_STAT_INC(net, delete);
-       spin_unlock_bh(&nf_conntrack_lock);
+       local_bh_enable();
 
        if (ct->master)
                nf_ct_put(ct->master);
@@ -237,17 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct)
 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
        struct net *net = nf_ct_net(ct);
+       unsigned int hash, reply_hash;
+       u16 zone = nf_ct_zone(ct);
+       unsigned int sequence;
 
        nf_ct_helper_destroy(ct);
-       spin_lock_bh(&nf_conntrack_lock);
-       /* Inside lock so preempt is disabled on module removal path.
-        * Otherwise we can get spurious warnings. */
-       NF_CT_STAT_INC(net, delete_list);
+
+       local_bh_disable();
+       do {
+               sequence = read_seqcount_begin(&net->ct.generation);
+               hash = hash_conntrack(net, zone,
+                                     &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+               reply_hash = hash_conntrack(net, zone,
+                                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+       } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
        clean_from_lists(ct);
-       /* add this conntrack to the dying list */
-       hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-                            &net->ct.dying);
-       spin_unlock_bh(&nf_conntrack_lock);
+       nf_conntrack_double_unlock(hash, reply_hash);
+
+       nf_ct_add_to_dying_list(ct);
+
+       NF_CT_STAT_INC(net, delete_list);
+       local_bh_enable();
 }
 
 static void death_by_event(unsigned long ul_conntrack)
@@ -331,8 +433,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
  * Warning :
  * - Caller must take a reference on returned object
  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
- * OR
- * - Caller must lock nf_conntrack_lock before calling this function
  */
 static struct nf_conntrack_tuple_hash *
 ____nf_conntrack_find(struct net *net, u16 zone,
@@ -408,32 +508,36 @@ EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 
 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
                                       unsigned int hash,
-                                      unsigned int repl_hash)
+                                      unsigned int reply_hash)
 {
        struct net *net = nf_ct_net(ct);
 
        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
                           &net->ct.hash[hash]);
        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
-                          &net->ct.hash[repl_hash]);
+                          &net->ct.hash[reply_hash]);
 }
 
 int
 nf_conntrack_hash_check_insert(struct nf_conn *ct)
 {
        struct net *net = nf_ct_net(ct);
-       unsigned int hash, repl_hash;
+       unsigned int hash, reply_hash;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
        u16 zone;
+       unsigned int sequence;
 
        zone = nf_ct_zone(ct);
-       hash = hash_conntrack(net, zone,
-                             &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-       repl_hash = hash_conntrack(net, zone,
-                                  &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
-       spin_lock_bh(&nf_conntrack_lock);
+       local_bh_disable();
+       do {
+               sequence = read_seqcount_begin(&net->ct.generation);
+               hash = hash_conntrack(net, zone,
+                                     &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+               reply_hash = hash_conntrack(net, zone,
+                                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+       } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
        /* See if there's one in the list already, including reverse */
        hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
@@ -441,7 +545,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
                                      &h->tuple) &&
                    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
                        goto out;
-       hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
+       hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                      &h->tuple) &&
                    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
@@ -451,15 +555,16 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
        smp_wmb();
        /* The caller holds a reference to this object */
        atomic_set(&ct->ct_general.use, 2);
-       __nf_conntrack_hash_insert(ct, hash, repl_hash);
+       __nf_conntrack_hash_insert(ct, hash, reply_hash);
+       nf_conntrack_double_unlock(hash, reply_hash);
        NF_CT_STAT_INC(net, insert);
-       spin_unlock_bh(&nf_conntrack_lock);
-
+       local_bh_enable();
        return 0;
 
 out:
+       nf_conntrack_double_unlock(hash, reply_hash);
        NF_CT_STAT_INC(net, insert_failed);
-       spin_unlock_bh(&nf_conntrack_lock);
+       local_bh_enable();
        return -EEXIST;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -467,15 +572,22 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 /* deletion from this larval template list happens via nf_ct_put() */
 void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
 {
+       struct ct_pcpu *pcpu;
+
        __set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
        __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
        nf_conntrack_get(&tmpl->ct_general);
 
-       spin_lock_bh(&nf_conntrack_lock);
+       /* add this conntrack to the (per cpu) tmpl list */
+       local_bh_disable();
+       tmpl->cpu = smp_processor_id();
+       pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
+
+       spin_lock(&pcpu->lock);
        /* Overload tuple linked list to put us in template list. */
        hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-                                &net->ct.tmpl);
-       spin_unlock_bh(&nf_conntrack_lock);
+                                &pcpu->tmpl);
+       spin_unlock_bh(&pcpu->lock);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
 
@@ -483,7 +595,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
 int
 __nf_conntrack_confirm(struct sk_buff *skb)
 {
-       unsigned int hash, repl_hash;
+       unsigned int hash, reply_hash;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct nf_conn_help *help;
@@ -492,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
        enum ip_conntrack_info ctinfo;
        struct net *net;
        u16 zone;
+       unsigned int sequence;
 
        ct = nf_ct_get(skb, &ctinfo);
        net = nf_ct_net(ct);
@@ -504,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)
                return NF_ACCEPT;
 
        zone = nf_ct_zone(ct);
-       /* reuse the hash saved before */
-       hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
-       hash = hash_bucket(hash, net);
-       repl_hash = hash_conntrack(net, zone,
-                                  &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+       local_bh_disable();
+
+       do {
+               sequence = read_seqcount_begin(&net->ct.generation);
+               /* reuse the hash saved before */
+               hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+               hash = hash_bucket(hash, net);
+               reply_hash = hash_conntrack(net, zone,
+                                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+       } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
        /* We're not in hash table, and we refuse to set up related
-          connections for unconfirmed conns.  But packet copies and
-          REJECT will give spurious warnings here. */
+        * connections for unconfirmed conns.  But packet copies and
+        * REJECT will give spurious warnings here.
+        */
        /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 
        /* No external references means no one else could have
-          confirmed us. */
+        * confirmed us.
+        */
        NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
        pr_debug("Confirming conntrack %p\n", ct);
-
-       spin_lock_bh(&nf_conntrack_lock);
-
        /* We have to check the DYING flag inside the lock to prevent
           a race against nf_ct_get_next_corpse() possibly called from
           user context, else we insert an already 'dead' hash, blocking
           further use of that particular connection -JM */
 
        if (unlikely(nf_ct_is_dying(ct))) {
-               spin_unlock_bh(&nf_conntrack_lock);
+               nf_conntrack_double_unlock(hash, reply_hash);
+               local_bh_enable();
                return NF_ACCEPT;
        }
 
@@ -540,14 +659,13 @@ __nf_conntrack_confirm(struct sk_buff *skb)
                                      &h->tuple) &&
                    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
                        goto out;
-       hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
+       hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                      &h->tuple) &&
                    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
                        goto out;
 
-       /* Remove from unconfirmed list */
-       hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+       nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
@@ -570,9 +688,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)
         * guarantee that no other CPU can find the conntrack before the above
         * stores are visible.
         */
-       __nf_conntrack_hash_insert(ct, hash, repl_hash);
+       __nf_conntrack_hash_insert(ct, hash, reply_hash);
+       nf_conntrack_double_unlock(hash, reply_hash);
        NF_CT_STAT_INC(net, insert);
-       spin_unlock_bh(&nf_conntrack_lock);
+       local_bh_enable();
 
        help = nfct_help(ct);
        if (help && help->helper)
@@ -583,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
        return NF_ACCEPT;
 
 out:
+       nf_conntrack_double_unlock(hash, reply_hash);
        NF_CT_STAT_INC(net, insert_failed);
-       spin_unlock_bh(&nf_conntrack_lock);
+       local_bh_enable();
        return NF_DROP;
 }
 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -627,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int hash)
+static noinline int early_drop(struct net *net, unsigned int _hash)
 {
        /* Use oldest entry, which is roughly LRU */
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct = NULL, *tmp;
        struct hlist_nulls_node *n;
-       unsigned int i, cnt = 0;
+       unsigned int i = 0, cnt = 0;
        int dropped = 0;
+       unsigned int hash, sequence;
+       spinlock_t *lockp;
 
-       rcu_read_lock();
-       for (i = 0; i < net->ct.htable_size; i++) {
+       local_bh_disable();
+restart:
+       sequence = read_seqcount_begin(&net->ct.generation);
+       hash = hash_bucket(_hash, net);
+       for (; i < net->ct.htable_size; i++) {
+               lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
+               spin_lock(lockp);
+               if (read_seqcount_retry(&net->ct.generation, sequence)) {
+                       spin_unlock(lockp);
+                       goto restart;
+               }
                hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
                                         hnnode) {
                        tmp = nf_ct_tuplehash_to_ctrack(h);
-                       if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+                       if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
+                           !nf_ct_is_dying(tmp) &&
+                           atomic_inc_not_zero(&tmp->ct_general.use)) {
                                ct = tmp;
+                               break;
+                       }
                        cnt++;
                }
 
-               if (ct != NULL) {
-                       if (likely(!nf_ct_is_dying(ct) &&
-                                  atomic_inc_not_zero(&ct->ct_general.use)))
-                               break;
-                       else
-                               ct = NULL;
-               }
+               hash = (hash + 1) % net->ct.htable_size;
+               spin_unlock(lockp);
 
-               if (cnt >= NF_CT_EVICTION_RANGE)
+               if (ct || cnt >= NF_CT_EVICTION_RANGE)
                        break;
 
-               hash = (hash + 1) % net->ct.htable_size;
        }
-       rcu_read_unlock();
+       local_bh_enable();
 
        if (!ct)
                return dropped;
@@ -708,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
 
        if (nf_conntrack_max &&
            unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-               if (!early_drop(net, hash_bucket(hash, net))) {
+               if (!early_drop(net, hash)) {
                        atomic_dec(&net->ct.count);
                        net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
                        return ERR_PTR(-ENOMEM);
@@ -805,7 +934,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
        struct nf_conn_help *help;
        struct nf_conntrack_tuple repl_tuple;
        struct nf_conntrack_ecache *ecache;
-       struct nf_conntrack_expect *exp;
+       struct nf_conntrack_expect *exp = NULL;
        u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
        struct nf_conn_timeout *timeout_ext;
        unsigned int *timeouts;
@@ -849,42 +978,44 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
                                 ecache ? ecache->expmask : 0,
                             GFP_ATOMIC);
 
-       spin_lock_bh(&nf_conntrack_lock);
-       exp = nf_ct_find_expectation(net, zone, tuple);
-       if (exp) {
-               pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
-                        ct, exp);
-               /* Welcome, Mr. Bond.  We've been expecting you... */
-               __set_bit(IPS_EXPECTED_BIT, &ct->status);
-               ct->master = exp->master;
-               if (exp->helper) {
-                       help = nf_ct_helper_ext_add(ct, exp->helper,
-                                                   GFP_ATOMIC);
-                       if (help)
-                               rcu_assign_pointer(help->helper, exp->helper);
-               }
+       local_bh_disable();
+       if (net->ct.expect_count) {
+               spin_lock(&nf_conntrack_expect_lock);
+               exp = nf_ct_find_expectation(net, zone, tuple);
+               if (exp) {
+                       pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+                                ct, exp);
+                       /* Welcome, Mr. Bond.  We've been expecting you... */
+                       __set_bit(IPS_EXPECTED_BIT, &ct->status);
+                       /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
+                       ct->master = exp->master;
+                       if (exp->helper) {
+                               help = nf_ct_helper_ext_add(ct, exp->helper,
+                                                           GFP_ATOMIC);
+                               if (help)
+                                       rcu_assign_pointer(help->helper, exp->helper);
+                       }
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
-               ct->mark = exp->master->mark;
+                       ct->mark = exp->master->mark;
 #endif
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
-               ct->secmark = exp->master->secmark;
+                       ct->secmark = exp->master->secmark;
 #endif
-               nf_conntrack_get(&ct->master->ct_general);
-               NF_CT_STAT_INC(net, expect_new);
-       } else {
+                       NF_CT_STAT_INC(net, expect_new);
+               }
+               spin_unlock(&nf_conntrack_expect_lock);
+       }
+       if (!exp) {
                __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
                NF_CT_STAT_INC(net, new);
        }
 
        /* Now it is inserted into the unconfirmed list, bump refcount */
        nf_conntrack_get(&ct->ct_general);
+       nf_ct_add_to_unconfirmed_list(ct);
 
-       /* Overload tuple linked list to put us in unconfirmed list. */
-       hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-                      &net->ct.unconfirmed);
-
-       spin_unlock_bh(&nf_conntrack_lock);
+       local_bh_enable();
 
        if (exp) {
                if (exp->expectfn)
@@ -1254,27 +1385,42 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct hlist_nulls_node *n;
+       int cpu;
+       spinlock_t *lockp;
 
-       spin_lock_bh(&nf_conntrack_lock);
        for (; *bucket < net->ct.htable_size; (*bucket)++) {
-               hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
-                       if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
-                               continue;
+               lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
+               local_bh_disable();
+               spin_lock(lockp);
+               if (*bucket < net->ct.htable_size) {
+                       hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+                               if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+                                       continue;
+                               ct = nf_ct_tuplehash_to_ctrack(h);
+                               if (iter(ct, data))
+                                       goto found;
+                       }
+               }
+               spin_unlock(lockp);
+               local_bh_enable();
+       }
+
+       for_each_possible_cpu(cpu) {
+               struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+               spin_lock_bh(&pcpu->lock);
+               hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        if (iter(ct, data))
-                               goto found;
+                               set_bit(IPS_DYING_BIT, &ct->status);
                }
+               spin_unlock_bh(&pcpu->lock);
        }
-       hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
-               ct = nf_ct_tuplehash_to_ctrack(h);
-               if (iter(ct, data))
-                       set_bit(IPS_DYING_BIT, &ct->status);
-       }
-       spin_unlock_bh(&nf_conntrack_lock);
        return NULL;
 found:
        atomic_inc(&ct->ct_general.use);
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock(lockp);
+       local_bh_enable();
        return ct;
 }
 
@@ -1323,14 +1469,19 @@ static void nf_ct_release_dying_list(struct net *net)
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct hlist_nulls_node *n;
+       int cpu;
 
-       spin_lock_bh(&nf_conntrack_lock);
-       hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
-               ct = nf_ct_tuplehash_to_ctrack(h);
-               /* never fails to remove them, no listeners at this point */
-               nf_ct_kill(ct);
+       for_each_possible_cpu(cpu) {
+               struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+               spin_lock_bh(&pcpu->lock);
+               hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+                       ct = nf_ct_tuplehash_to_ctrack(h);
+                       /* never fails to remove them, no listeners at this point */
+                       nf_ct_kill(ct);
+               }
+               spin_unlock_bh(&pcpu->lock);
        }
-       spin_unlock_bh(&nf_conntrack_lock);
 }
 
 static int untrack_refs(void)
@@ -1417,6 +1568,7 @@ i_see_dead_people:
                kmem_cache_destroy(net->ct.nf_conntrack_cachep);
                kfree(net->ct.slabname);
                free_percpu(net->ct.stat);
+               free_percpu(net->ct.pcpu_lists);
        }
 }
 
@@ -1469,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
        if (!hash)
                return -ENOMEM;
 
+       local_bh_disable();
+       nf_conntrack_all_lock();
+       write_seqcount_begin(&init_net.ct.generation);
+
        /* Lookups in the old hash might happen in parallel, which means we
         * might get false negatives during connection lookup. New connections
         * created because of a false negative won't make it into the hash
-        * though since that required taking the lock.
+        * though since that required taking the locks.
         */
-       spin_lock_bh(&nf_conntrack_lock);
+
        for (i = 0; i < init_net.ct.htable_size; i++) {
                while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
                        h = hlist_nulls_entry(init_net.ct.hash[i].first,
@@ -1491,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 
        init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
        init_net.ct.hash = hash;
-       spin_unlock_bh(&nf_conntrack_lock);
+
+       write_seqcount_end(&init_net.ct.generation);
+       nf_conntrack_all_unlock();
+       local_bh_enable();
 
        nf_ct_free_hashtable(old_hash, old_size);
        return 0;
@@ -1513,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
 int nf_conntrack_init_start(void)
 {
        int max_factor = 8;
-       int ret, cpu;
+       int i, ret, cpu;
+
+       for (i = 0; i < CONNTRACK_LOCKS; i++)
+               spin_lock_init(&nf_conntrack_locks[i]);
 
        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
@@ -1629,37 +1791,43 @@ void nf_conntrack_init_end(void)
 
 int nf_conntrack_init_net(struct net *net)
 {
-       int ret;
+       int ret = -ENOMEM;
+       int cpu;
 
        atomic_set(&net->ct.count, 0);
-       INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
-       INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
-       INIT_HLIST_NULLS_HEAD(&net->ct.tmpl, TEMPLATE_NULLS_VAL);
-       net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
-       if (!net->ct.stat) {
-               ret = -ENOMEM;
+
+       net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
+       if (!net->ct.pcpu_lists)
                goto err_stat;
+
+       for_each_possible_cpu(cpu) {
+               struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+               spin_lock_init(&pcpu->lock);
+               INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
+               INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
+               INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
        }
 
+       net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+       if (!net->ct.stat)
+               goto err_pcpu_lists;
+
        net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
-       if (!net->ct.slabname) {
-               ret = -ENOMEM;
+       if (!net->ct.slabname)
                goto err_slabname;
-       }
 
        net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
                                                        sizeof(struct nf_conn), 0,
                                                        SLAB_DESTROY_BY_RCU, NULL);
        if (!net->ct.nf_conntrack_cachep) {
                printk(KERN_ERR "Unable to create nf_conn slab cache\n");
-               ret = -ENOMEM;
                goto err_cache;
        }
 
        net->ct.htable_size = nf_conntrack_htable_size;
        net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
        if (!net->ct.hash) {
-               ret = -ENOMEM;
                printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
                goto err_hash;
        }
@@ -1701,6 +1869,8 @@ err_cache:
        kfree(net->ct.slabname);
 err_slabname:
        free_percpu(net->ct.stat);
+err_pcpu_lists:
+       free_percpu(net->ct.pcpu_lists);
 err_stat:
        return ret;
 }
index 4fd1ca94fd4a140b374385ded878af60b503b529..f87e8f68ad453e9baeec017cc74534e8ce85dfab 100644 (file)
@@ -66,9 +66,9 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect)
 {
        struct nf_conntrack_expect *exp = (void *)ul_expect;
 
-       spin_lock_bh(&nf_conntrack_lock);
+       spin_lock_bh(&nf_conntrack_expect_lock);
        nf_ct_unlink_expect(exp);
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
        nf_ct_expect_put(exp);
 }
 
@@ -155,6 +155,18 @@ nf_ct_find_expectation(struct net *net, u16 zone,
        if (!nf_ct_is_confirmed(exp->master))
                return NULL;
 
+       /* Avoid racing with other CPUs that, for the exp->master ct,
+        * are about to invoke ->destroy() or nf_ct_delete() via the
+        * timeout or early_drop().
+        *
+        * If the atomic_inc_not_zero() check fails, we know the ct is
+        * being destroyed.  If it succeeds, we can be sure the ct
+        * cannot disappear underneath us.
+        */
+       if (unlikely(nf_ct_is_dying(exp->master) ||
+                    !atomic_inc_not_zero(&exp->master->ct_general.use)))
+               return NULL;
+
        if (exp->flags & NF_CT_EXPECT_PERMANENT) {
                atomic_inc(&exp->use);
                return exp;
@@ -162,6 +174,8 @@ nf_ct_find_expectation(struct net *net, u16 zone,
                nf_ct_unlink_expect(exp);
                return exp;
        }
+       /* Undo exp->master refcnt increase, if del_timer() failed */
+       nf_ct_put(exp->master);
 
        return NULL;
 }
@@ -177,12 +191,14 @@ void nf_ct_remove_expectations(struct nf_conn *ct)
        if (!help)
                return;
 
+       spin_lock_bh(&nf_conntrack_expect_lock);
        hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
                if (del_timer(&exp->timeout)) {
                        nf_ct_unlink_expect(exp);
                        nf_ct_expect_put(exp);
                }
        }
+       spin_unlock_bh(&nf_conntrack_expect_lock);
 }
 EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);
 
@@ -217,12 +233,12 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,
 /* Generally a bad idea to call this: could have matched already. */
 void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
 {
-       spin_lock_bh(&nf_conntrack_lock);
+       spin_lock_bh(&nf_conntrack_expect_lock);
        if (del_timer(&exp->timeout)) {
                nf_ct_unlink_expect(exp);
                nf_ct_expect_put(exp);
        }
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
 }
 EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);
 
@@ -335,7 +351,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
        setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
                    (unsigned long)exp);
        helper = rcu_dereference_protected(master_help->helper,
-                                          lockdep_is_held(&nf_conntrack_lock));
+                                          lockdep_is_held(&nf_conntrack_expect_lock));
        if (helper) {
                exp->timeout.expires = jiffies +
                        helper->expect_policy[exp->class].timeout * HZ;
@@ -395,7 +411,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
        }
        /* Will be over limit? */
        helper = rcu_dereference_protected(master_help->helper,
-                                          lockdep_is_held(&nf_conntrack_lock));
+                                          lockdep_is_held(&nf_conntrack_expect_lock));
        if (helper) {
                p = &helper->expect_policy[expect->class];
                if (p->max_expected &&
@@ -417,12 +433,12 @@ out:
        return ret;
 }
 
-int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, 
+int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
                                u32 portid, int report)
 {
        int ret;
 
-       spin_lock_bh(&nf_conntrack_lock);
+       spin_lock_bh(&nf_conntrack_expect_lock);
        ret = __nf_ct_expect_check(expect);
        if (ret <= 0)
                goto out;
@@ -430,11 +446,11 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
        ret = nf_ct_expect_insert(expect);
        if (ret < 0)
                goto out;
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
        nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
        return ret;
 out:
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
        return ret;
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
index 70866d192efc9351f2fcafe3ef23ce0c131fc0ba..3a3a60b126e0097f309badc40ebf8016b3773c98 100644 (file)
@@ -1476,7 +1476,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
                nf_ct_refresh(ct, skb, info->timeout * HZ);
 
                /* Set expect timeout */
-               spin_lock_bh(&nf_conntrack_lock);
+               spin_lock_bh(&nf_conntrack_expect_lock);
                exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3,
                                  info->sig_port[!dir]);
                if (exp) {
@@ -1486,7 +1486,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
                        nf_ct_dump_tuple(&exp->tuple);
                        set_expect_timeout(exp, info->timeout);
                }
-               spin_unlock_bh(&nf_conntrack_lock);
+               spin_unlock_bh(&nf_conntrack_expect_lock);
        }
 
        return 0;
index 974a2a4adefa739c729692e0df3ad3142f83d0d4..5b3eae7d4c9a51dd1bdd559fa5943125035d8490 100644 (file)
@@ -250,16 +250,14 @@ out:
 }
 EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
 
+/* appropriate ct lock protection must be taken by caller */
 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
                         const struct nf_conntrack_helper *me)
 {
        struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
        struct nf_conn_help *help = nfct_help(ct);
 
-       if (help && rcu_dereference_protected(
-                       help->helper,
-                       lockdep_is_held(&nf_conntrack_lock)
-                       ) == me) {
+       if (help && rcu_dereference_raw(help->helper) == me) {
                nf_conntrack_event(IPCT_HELPER, ct);
                RCU_INIT_POINTER(help->helper, NULL);
        }
@@ -284,17 +282,17 @@ static LIST_HEAD(nf_ct_helper_expectfn_list);
 
 void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n)
 {
-       spin_lock_bh(&nf_conntrack_lock);
+       spin_lock_bh(&nf_conntrack_expect_lock);
        list_add_rcu(&n->head, &nf_ct_helper_expectfn_list);
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
 }
 EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register);
 
 void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)
 {
-       spin_lock_bh(&nf_conntrack_lock);
+       spin_lock_bh(&nf_conntrack_expect_lock);
        list_del_rcu(&n->head);
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
 }
 EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister);
 
@@ -396,15 +394,17 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
        const struct hlist_node *next;
        const struct hlist_nulls_node *nn;
        unsigned int i;
+       int cpu;
 
        /* Get rid of expectations */
+       spin_lock_bh(&nf_conntrack_expect_lock);
        for (i = 0; i < nf_ct_expect_hsize; i++) {
                hlist_for_each_entry_safe(exp, next,
                                          &net->ct.expect_hash[i], hnode) {
                        struct nf_conn_help *help = nfct_help(exp->master);
                        if ((rcu_dereference_protected(
                                        help->helper,
-                                       lockdep_is_held(&nf_conntrack_lock)
+                                       lockdep_is_held(&nf_conntrack_expect_lock)
                                        ) == me || exp->helper == me) &&
                            del_timer(&exp->timeout)) {
                                nf_ct_unlink_expect(exp);
@@ -412,14 +412,27 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
                        }
                }
        }
+       spin_unlock_bh(&nf_conntrack_expect_lock);
 
        /* Get rid of expecteds, set helpers to NULL. */
-       hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode)
-               unhelp(h, me);
-       for (i = 0; i < net->ct.htable_size; i++) {
-               hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+       for_each_possible_cpu(cpu) {
+               struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+               spin_lock_bh(&pcpu->lock);
+               hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
                        unhelp(h, me);
+               spin_unlock_bh(&pcpu->lock);
+       }
+       local_bh_disable();
+       for (i = 0; i < net->ct.htable_size; i++) {
+               spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+               if (i < net->ct.htable_size) {
+                       hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+                               unhelp(h, me);
+               }
+               spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
        }
+       local_bh_enable();
 }
 
 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
@@ -437,10 +450,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
        synchronize_rcu();
 
        rtnl_lock();
-       spin_lock_bh(&nf_conntrack_lock);
        for_each_net(net)
                __nf_conntrack_helper_unregister(me, net);
-       spin_unlock_bh(&nf_conntrack_lock);
        rtnl_unlock();
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
index b9f0e03743228ec852eee97eb6bb9ef1e0ab73e9..ccc46fa5edbce5e52710a22ae502e49a0f59e0a5 100644 (file)
@@ -764,14 +764,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
        struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        u_int8_t l3proto = nfmsg->nfgen_family;
        int res;
+       spinlock_t *lockp;
+
 #ifdef CONFIG_NF_CONNTRACK_MARK
        const struct ctnetlink_dump_filter *filter = cb->data;
 #endif
 
-       spin_lock_bh(&nf_conntrack_lock);
        last = (struct nf_conn *)cb->args[1];
+
+       local_bh_disable();
        for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
 restart:
+               lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
+               spin_lock(lockp);
+               if (cb->args[0] >= net->ct.htable_size) {
+                       spin_unlock(lockp);
+                       goto out;
+               }
                hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
                                         hnnode) {
                        if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -803,16 +812,18 @@ restart:
                        if (res < 0) {
                                nf_conntrack_get(&ct->ct_general);
                                cb->args[1] = (unsigned long)ct;
+                               spin_unlock(lockp);
                                goto out;
                        }
                }
+               spin_unlock(lockp);
                if (cb->args[1]) {
                        cb->args[1] = 0;
                        goto restart;
                }
        }
 out:
-       spin_unlock_bh(&nf_conntrack_lock);
+       local_bh_enable();
        if (last)
                nf_ct_put(last);
 
@@ -966,7 +977,6 @@ ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
        return 0;
 }
 
-#define __CTA_LABELS_MAX_LENGTH ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)
 static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
        [CTA_TUPLE_ORIG]        = { .type = NLA_NESTED },
        [CTA_TUPLE_REPLY]       = { .type = NLA_NESTED },
@@ -984,9 +994,9 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
        [CTA_ZONE]              = { .type = NLA_U16 },
        [CTA_MARK_MASK]         = { .type = NLA_U32 },
        [CTA_LABELS]            = { .type = NLA_BINARY,
-                                   .len = __CTA_LABELS_MAX_LENGTH },
+                                   .len = NF_CT_LABELS_MAX_SIZE },
        [CTA_LABELS_MASK]       = { .type = NLA_BINARY,
-                                   .len = __CTA_LABELS_MAX_LENGTH },
+                                   .len = NF_CT_LABELS_MAX_SIZE },
 };
 
 static int
@@ -1138,50 +1148,65 @@ static int ctnetlink_done_list(struct netlink_callback *cb)
 }
 
 static int
-ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,
-                   struct hlist_nulls_head *list)
+ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
 {
-       struct nf_conn *ct, *last;
+       struct nf_conn *ct, *last = NULL;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
        struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        u_int8_t l3proto = nfmsg->nfgen_family;
        int res;
+       int cpu;
+       struct hlist_nulls_head *list;
+       struct net *net = sock_net(skb->sk);
 
        if (cb->args[2])
                return 0;
 
-       spin_lock_bh(&nf_conntrack_lock);
-       last = (struct nf_conn *)cb->args[1];
-restart:
-       hlist_nulls_for_each_entry(h, n, list, hnnode) {
-               ct = nf_ct_tuplehash_to_ctrack(h);
-               if (l3proto && nf_ct_l3num(ct) != l3proto)
+       if (cb->args[0] == nr_cpu_ids)
+               return 0;
+
+       for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+               struct ct_pcpu *pcpu;
+
+               if (!cpu_possible(cpu))
                        continue;
-               if (cb->args[1]) {
-                       if (ct != last)
+
+               pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+               spin_lock_bh(&pcpu->lock);
+               last = (struct nf_conn *)cb->args[1];
+               list = dying ? &pcpu->dying : &pcpu->unconfirmed;
+restart:
+               hlist_nulls_for_each_entry(h, n, list, hnnode) {
+                       ct = nf_ct_tuplehash_to_ctrack(h);
+                       if (l3proto && nf_ct_l3num(ct) != l3proto)
                                continue;
-                       cb->args[1] = 0;
-               }
-               rcu_read_lock();
-               res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
-                                         cb->nlh->nlmsg_seq,
-                                         NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
-                                         ct);
-               rcu_read_unlock();
-               if (res < 0) {
-                       nf_conntrack_get(&ct->ct_general);
-                       cb->args[1] = (unsigned long)ct;
-                       goto out;
+                       if (cb->args[1]) {
+                               if (ct != last)
+                                       continue;
+                               cb->args[1] = 0;
+                       }
+                       rcu_read_lock();
+                       res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+                                                 cb->nlh->nlmsg_seq,
+                                                 NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+                                                 ct);
+                       rcu_read_unlock();
+                       if (res < 0) {
+                               nf_conntrack_get(&ct->ct_general);
+                               cb->args[1] = (unsigned long)ct;
+                               spin_unlock_bh(&pcpu->lock);
+                               goto out;
+                       }
                }
+               if (cb->args[1]) {
+                       cb->args[1] = 0;
+                       goto restart;
+               } else
+                       cb->args[2] = 1;
+               spin_unlock_bh(&pcpu->lock);
        }
-       if (cb->args[1]) {
-               cb->args[1] = 0;
-               goto restart;
-       } else
-               cb->args[2] = 1;
 out:
-       spin_unlock_bh(&nf_conntrack_lock);
        if (last)
                nf_ct_put(last);
 
@@ -1191,9 +1216,7 @@ out:
 static int
 ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
 {
-       struct net *net = sock_net(skb->sk);
-
-       return ctnetlink_dump_list(skb, cb, &net->ct.dying);
+       return ctnetlink_dump_list(skb, cb, true);
 }
 
 static int
@@ -1215,9 +1238,7 @@ ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
 static int
 ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
 {
-       struct net *net = sock_net(skb->sk);
-
-       return ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed);
+       return ctnetlink_dump_list(skb, cb, false);
 }
 
 static int
@@ -1361,14 +1382,14 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
                                            nf_ct_protonum(ct));
        if (helper == NULL) {
 #ifdef CONFIG_MODULES
-               spin_unlock_bh(&nf_conntrack_lock);
+               spin_unlock_bh(&nf_conntrack_expect_lock);
 
                if (request_module("nfct-helper-%s", helpname) < 0) {
-                       spin_lock_bh(&nf_conntrack_lock);
+                       spin_lock_bh(&nf_conntrack_expect_lock);
                        return -EOPNOTSUPP;
                }
 
-               spin_lock_bh(&nf_conntrack_lock);
+               spin_lock_bh(&nf_conntrack_expect_lock);
                helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
                                                    nf_ct_protonum(ct));
                if (helper)
@@ -1804,9 +1825,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
        err = -EEXIST;
        ct = nf_ct_tuplehash_to_ctrack(h);
        if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
-               spin_lock_bh(&nf_conntrack_lock);
+               spin_lock_bh(&nf_conntrack_expect_lock);
                err = ctnetlink_change_conntrack(ct, cda);
-               spin_unlock_bh(&nf_conntrack_lock);
+               spin_unlock_bh(&nf_conntrack_expect_lock);
                if (err == 0) {
                        nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
                                                      (1 << IPCT_ASSURED) |
@@ -2135,9 +2156,9 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
        if (ret < 0)
                return ret;
 
-       spin_lock_bh(&nf_conntrack_lock);
+       spin_lock_bh(&nf_conntrack_expect_lock);
        ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
 
        return ret;
 }
@@ -2692,13 +2713,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
                }
 
                /* after list removal, usage count == 1 */
-               spin_lock_bh(&nf_conntrack_lock);
+               spin_lock_bh(&nf_conntrack_expect_lock);
                if (del_timer(&exp->timeout)) {
                        nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
                                                   nlmsg_report(nlh));
                        nf_ct_expect_put(exp);
                }
-               spin_unlock_bh(&nf_conntrack_lock);
+               spin_unlock_bh(&nf_conntrack_expect_lock);
                /* have to put what we 'get' above.
                 * after this line usage count == 0 */
                nf_ct_expect_put(exp);
@@ -2707,7 +2728,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
                struct nf_conn_help *m_help;
 
                /* delete all expectations for this helper */
-               spin_lock_bh(&nf_conntrack_lock);
+               spin_lock_bh(&nf_conntrack_expect_lock);
                for (i = 0; i < nf_ct_expect_hsize; i++) {
                        hlist_for_each_entry_safe(exp, next,
                                                  &net->ct.expect_hash[i],
@@ -2722,10 +2743,10 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
                                }
                        }
                }
-               spin_unlock_bh(&nf_conntrack_lock);
+               spin_unlock_bh(&nf_conntrack_expect_lock);
        } else {
                /* This basically means we have to flush everything*/
-               spin_lock_bh(&nf_conntrack_lock);
+               spin_lock_bh(&nf_conntrack_expect_lock);
                for (i = 0; i < nf_ct_expect_hsize; i++) {
                        hlist_for_each_entry_safe(exp, next,
                                                  &net->ct.expect_hash[i],
@@ -2738,7 +2759,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
                                }
                        }
                }
-               spin_unlock_bh(&nf_conntrack_lock);
+               spin_unlock_bh(&nf_conntrack_expect_lock);
        }
 
        return 0;
@@ -2964,11 +2985,11 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
        if (err < 0)
                return err;
 
-       spin_lock_bh(&nf_conntrack_lock);
+       spin_lock_bh(&nf_conntrack_expect_lock);
        exp = __nf_ct_expect_find(net, zone, &tuple);
 
        if (!exp) {
-               spin_unlock_bh(&nf_conntrack_lock);
+               spin_unlock_bh(&nf_conntrack_expect_lock);
                err = -ENOENT;
                if (nlh->nlmsg_flags & NLM_F_CREATE) {
                        err = ctnetlink_create_expect(net, zone, cda,
@@ -2982,7 +3003,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
        err = -EEXIST;
        if (!(nlh->nlmsg_flags & NLM_F_EXCL))
                err = ctnetlink_change_expect(exp, cda);
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
 
        return err;
 }
index 466410eaa482c2a3940d3768351e92be17a644ef..4c3ba1c8d682d16abe0912f80525a84d184130db 100644 (file)
@@ -800,7 +800,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
        struct hlist_node *next;
        int found = 0;
 
-       spin_lock_bh(&nf_conntrack_lock);
+       spin_lock_bh(&nf_conntrack_expect_lock);
        hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
                if (exp->class != SIP_EXPECT_SIGNALLING ||
                    !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) ||
@@ -815,7 +815,7 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
                found = 1;
                break;
        }
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
        return found;
 }
 
@@ -825,7 +825,7 @@ static void flush_expectations(struct nf_conn *ct, bool media)
        struct nf_conntrack_expect *exp;
        struct hlist_node *next;
 
-       spin_lock_bh(&nf_conntrack_lock);
+       spin_lock_bh(&nf_conntrack_expect_lock);
        hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
                if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media)
                        continue;
@@ -836,7 +836,7 @@ static void flush_expectations(struct nf_conn *ct, bool media)
                if (!media)
                        break;
        }
-       spin_unlock_bh(&nf_conntrack_lock);
+       spin_unlock_bh(&nf_conntrack_expect_lock);
 }
 
 static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
index adce01e8bb57e7fb9794eebceae274707d96899e..33045a56229769502532b38e7b4a1bc5d89c80d8 100644 (file)
@@ -794,9 +794,8 @@ nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr)
        stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
 
        if (chain->stats) {
-               /* nfnl_lock is held, add some nfnl function for this, later */
                struct nft_stats __percpu *oldstats =
-                       rcu_dereference_protected(chain->stats, 1);
+                               nft_dereference(chain->stats);
 
                rcu_assign_pointer(chain->stats, newstats);
                synchronize_rcu();
@@ -1254,10 +1253,11 @@ err1:
        return err;
 }
 
-static void nf_tables_expr_destroy(struct nft_expr *expr)
+static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
+                                  struct nft_expr *expr)
 {
        if (expr->ops->destroy)
-               expr->ops->destroy(expr);
+               expr->ops->destroy(ctx, expr);
        module_put(expr->ops->type->owner);
 }
 
@@ -1296,6 +1296,8 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
        [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED },
        [NFTA_RULE_COMPAT]      = { .type = NLA_NESTED },
        [NFTA_RULE_POSITION]    = { .type = NLA_U64 },
+       [NFTA_RULE_USERDATA]    = { .type = NLA_BINARY,
+                                   .len = NFT_USERDATA_MAXLEN },
 };
 
 static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
@@ -1348,6 +1350,10 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
        }
        nla_nest_end(skb, list);
 
+       if (rule->ulen &&
+           nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule)))
+               goto nla_put_failure;
+
        return nlmsg_end(skb, nlh);
 
 nla_put_failure:
@@ -1531,7 +1537,8 @@ err:
        return err;
 }
 
-static void nf_tables_rule_destroy(struct nft_rule *rule)
+static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
+                                  struct nft_rule *rule)
 {
        struct nft_expr *expr;
 
@@ -1541,7 +1548,7 @@ static void nf_tables_rule_destroy(struct nft_rule *rule)
         */
        expr = nft_expr_first(rule);
        while (expr->ops && expr != nft_expr_last(rule)) {
-               nf_tables_expr_destroy(expr);
+               nf_tables_expr_destroy(ctx, expr);
                expr = nft_expr_next(expr);
        }
        kfree(rule);
@@ -1552,7 +1559,7 @@ static void nf_tables_rule_destroy(struct nft_rule *rule)
 static struct nft_expr_info *info;
 
 static struct nft_rule_trans *
-nf_tables_trans_add(struct nft_rule *rule, const struct nft_ctx *ctx)
+nf_tables_trans_add(struct nft_ctx *ctx, struct nft_rule *rule)
 {
        struct nft_rule_trans *rupd;
 
@@ -1560,11 +1567,8 @@ nf_tables_trans_add(struct nft_rule *rule, const struct nft_ctx *ctx)
        if (rupd == NULL)
               return NULL;
 
-       rupd->chain = ctx->chain;
-       rupd->table = ctx->table;
+       rupd->ctx = *ctx;
        rupd->rule = rule;
-       rupd->family = ctx->afi->family;
-       rupd->nlh = ctx->nlh;
        list_add_tail(&rupd->list, &ctx->net->nft.commit_list);
 
        return rupd;
@@ -1584,7 +1588,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
        struct nft_expr *expr;
        struct nft_ctx ctx;
        struct nlattr *tmp;
-       unsigned int size, i, n;
+       unsigned int size, i, n, ulen = 0;
        int err, rem;
        bool create;
        u64 handle, pos_handle;
@@ -1650,8 +1654,11 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
                }
        }
 
+       if (nla[NFTA_RULE_USERDATA])
+               ulen = nla_len(nla[NFTA_RULE_USERDATA]);
+
        err = -ENOMEM;
-       rule = kzalloc(sizeof(*rule) + size, GFP_KERNEL);
+       rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL);
        if (rule == NULL)
                goto err1;
 
@@ -1659,6 +1666,10 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 
        rule->handle = handle;
        rule->dlen   = size;
+       rule->ulen   = ulen;
+
+       if (ulen)
+               nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen);
 
        expr = nft_expr_first(rule);
        for (i = 0; i < n; i++) {
@@ -1671,7 +1682,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 
        if (nlh->nlmsg_flags & NLM_F_REPLACE) {
                if (nft_rule_is_active_next(net, old_rule)) {
-                       repl = nf_tables_trans_add(old_rule, &ctx);
+                       repl = nf_tables_trans_add(&ctx, old_rule);
                        if (repl == NULL) {
                                err = -ENOMEM;
                                goto err2;
@@ -1694,7 +1705,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
                        list_add_rcu(&rule->list, &chain->rules);
        }
 
-       if (nf_tables_trans_add(rule, &ctx) == NULL) {
+       if (nf_tables_trans_add(&ctx, rule) == NULL) {
                err = -ENOMEM;
                goto err3;
        }
@@ -1709,7 +1720,7 @@ err3:
                kfree(repl);
        }
 err2:
-       nf_tables_rule_destroy(rule);
+       nf_tables_rule_destroy(&ctx, rule);
 err1:
        for (i = 0; i < n; i++) {
                if (info[i].ops != NULL)
@@ -1723,7 +1734,7 @@ nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule)
 {
        /* You cannot delete the same rule twice */
        if (nft_rule_is_active_next(ctx->net, rule)) {
-               if (nf_tables_trans_add(rule, ctx) == NULL)
+               if (nf_tables_trans_add(ctx, rule) == NULL)
                        return -ENOMEM;
                nft_rule_disactivate_next(ctx->net, rule);
                return 0;
@@ -1819,10 +1830,10 @@ static int nf_tables_commit(struct sk_buff *skb)
                 */
                if (nft_rule_is_active(net, rupd->rule)) {
                        nft_rule_clear(net, rupd->rule);
-                       nf_tables_rule_notify(skb, rupd->nlh, rupd->table,
-                                             rupd->chain, rupd->rule,
-                                             NFT_MSG_NEWRULE, 0,
-                                             rupd->family);
+                       nf_tables_rule_notify(skb, rupd->ctx.nlh,
+                                             rupd->ctx.table, rupd->ctx.chain,
+                                             rupd->rule, NFT_MSG_NEWRULE, 0,
+                                             rupd->ctx.afi->family);
                        list_del(&rupd->list);
                        kfree(rupd);
                        continue;
@@ -1830,9 +1841,10 @@ static int nf_tables_commit(struct sk_buff *skb)
 
                /* This rule is in the past, get rid of it */
                list_del_rcu(&rupd->rule->list);
-               nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain,
+               nf_tables_rule_notify(skb, rupd->ctx.nlh,
+                                     rupd->ctx.table, rupd->ctx.chain,
                                      rupd->rule, NFT_MSG_DELRULE, 0,
-                                     rupd->family);
+                                     rupd->ctx.afi->family);
        }
 
        /* Make sure we don't see any packet traversing old rules */
@@ -1840,7 +1852,7 @@ static int nf_tables_commit(struct sk_buff *skb)
 
        /* Now we can safely release unused old rules */
        list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
-               nf_tables_rule_destroy(rupd->rule);
+               nf_tables_rule_destroy(&rupd->ctx, rupd->rule);
                list_del(&rupd->list);
                kfree(rupd);
        }
@@ -1869,7 +1881,7 @@ static int nf_tables_abort(struct sk_buff *skb)
        synchronize_rcu();
 
        list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) {
-               nf_tables_rule_destroy(rupd->rule);
+               nf_tables_rule_destroy(&rupd->ctx, rupd->rule);
                list_del(&rupd->list);
                kfree(rupd);
        }
@@ -2430,8 +2442,7 @@ err1:
 static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
 {
        list_del(&set->list);
-       if (!(set->flags & NFT_SET_ANONYMOUS))
-               nf_tables_set_notify(ctx, set, NFT_MSG_DELSET);
+       nf_tables_set_notify(ctx, set, NFT_MSG_DELSET);
 
        set->ops->destroy(set);
        module_put(set->ops->owner);
@@ -3175,9 +3186,16 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
        data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
 
        switch (data->verdict) {
-       case NF_ACCEPT:
-       case NF_DROP:
-       case NF_QUEUE:
+       default:
+               switch (data->verdict & NF_VERDICT_MASK) {
+               case NF_ACCEPT:
+               case NF_DROP:
+               case NF_QUEUE:
+                       break;
+               default:
+                       return -EINVAL;
+               }
+               /* fall through */
        case NFT_CONTINUE:
        case NFT_BREAK:
        case NFT_RETURN:
@@ -3198,8 +3216,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
                data->chain = chain;
                desc->len = sizeof(data);
                break;
-       default:
-               return -EINVAL;
        }
 
        desc->type = NFT_DATA_VERDICT;
index 046aa13b4fea6a570b990444b7ca4c6419824f8f..e8138da4c14f70f40449c72ec4dc4d31f2960b8e 100644 (file)
@@ -61,6 +61,14 @@ void nfnl_unlock(__u8 subsys_id)
 }
 EXPORT_SYMBOL_GPL(nfnl_unlock);
 
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_nfnl_is_held(u8 subsys_id)
+{
+       return lockdep_is_held(&table[subsys_id].mutex);
+}
+EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
+#endif
+
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
 {
        nfnl_lock(n->subsys_id);
index a155d19a225edcfb4b1a550ebc898f40d51c564d..d292c8d286ebeac22f688d47df8b793e9c90cc6b 100644 (file)
@@ -28,8 +28,6 @@
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/list.h>
-#include <linux/jhash.h>
-#include <linux/random.h>
 #include <linux/slab.h>
 #include <net/sock.h>
 #include <net/netfilter/nf_log.h>
@@ -75,7 +73,6 @@ struct nfulnl_instance {
 };
 
 #define INSTANCE_BUCKETS       16
-static unsigned int hash_init;
 
 static int nfnl_log_net_id __read_mostly;
 
@@ -1067,11 +1064,6 @@ static int __init nfnetlink_log_init(void)
 {
        int status = -ENOMEM;
 
-       /* it's not really all that important to have a random value, so
-        * we can do this from the init function, even if there hasn't
-        * been that much entropy yet */
-       get_random_bytes(&hash_init, sizeof(hash_init));
-
        netlink_register_notifier(&nfulnl_rtnl_notifier);
        status = nfnetlink_subsys_register(&nfulnl_subsys);
        if (status < 0) {
index 82cb8236f8a101d260a9f3cc6eb1c218e34163f3..8a779be832fba2d8a86181f8f8f68a445299d626 100644 (file)
@@ -192,7 +192,7 @@ err:
 }
 
 static void
-nft_target_destroy(const struct nft_expr *expr)
+nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
        struct xt_target *target = expr->ops->data;
 
@@ -379,7 +379,7 @@ err:
 }
 
 static void
-nft_match_destroy(const struct nft_expr *expr)
+nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
        struct xt_match *match = expr->ops->data;
 
index 46e2754038387cf978679e2fd4f1032b30e40bd6..bd0d41e693416167b4f149f64117e440a5134496 100644 (file)
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_labels.h>
 
 struct nft_ct {
        enum nft_ct_keys        key:8;
        enum ip_conntrack_dir   dir:8;
-       union{
+       union {
                enum nft_registers      dreg:8;
                enum nft_registers      sreg:8;
        };
-       uint8_t                 family;
 };
 
 static void nft_ct_get_eval(const struct nft_expr *expr,
@@ -97,6 +97,26 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
                        goto err;
                strncpy((char *)dest->data, helper->name, sizeof(dest->data));
                return;
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+       case NFT_CT_LABELS: {
+               struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+               unsigned int size;
+
+               if (!labels) {
+                       memset(dest->data, 0, sizeof(dest->data));
+                       return;
+               }
+
+               BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data));
+               size = labels->words * sizeof(long);
+
+               memcpy(dest->data, labels->bits, size);
+               if (size < sizeof(dest->data))
+                       memset(((char *) dest->data) + size, 0,
+                              sizeof(dest->data) - size);
+               return;
+       }
+#endif
        }
 
        tuple = &ct->tuplehash[priv->dir].tuple;
@@ -220,6 +240,9 @@ static int nft_ct_init_validate_get(const struct nft_expr *expr,
 #endif
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
        case NFT_CT_SECMARK:
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+       case NFT_CT_LABELS:
 #endif
        case NFT_CT_EXPIRATION:
        case NFT_CT_HELPER:
@@ -292,16 +315,13 @@ static int nft_ct_init(const struct nft_ctx *ctx,
        if (err < 0)
                return err;
 
-       priv->family = ctx->afi->family;
-
        return 0;
 }
 
-static void nft_ct_destroy(const struct nft_expr *expr)
+static void nft_ct_destroy(const struct nft_ctx *ctx,
+                          const struct nft_expr *expr)
 {
-       struct nft_ct *priv = nft_expr_priv(expr);
-
-       nft_ct_l3proto_module_put(priv->family);
+       nft_ct_l3proto_module_put(ctx->afi->family);
 }
 
 static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
index 3d3f8fce10a5136391e53bd75b13498a45e82e96..6a1acde16c607547f547cc493a2ac7a7f5feb022 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
 
+#define NFT_HASH_MIN_SIZE      4
+
 struct nft_hash {
-       struct hlist_head       *hash;
-       unsigned int            hsize;
+       struct nft_hash_table __rcu     *tbl;
+};
+
+struct nft_hash_table {
+       unsigned int                    size;
+       unsigned int                    elements;
+       struct nft_hash_elem __rcu      *buckets[];
 };
 
 struct nft_hash_elem {
-       struct hlist_node       hnode;
-       struct nft_data         key;
-       struct nft_data         data[];
+       struct nft_hash_elem __rcu      *next;
+       struct nft_data                 key;
+       struct nft_data                 data[];
 };
 
+#define nft_hash_for_each_entry(i, head) \
+       for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next))
+#define nft_hash_for_each_entry_rcu(i, head) \
+       for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next))
+
 static u32 nft_hash_rnd __read_mostly;
 static bool nft_hash_rnd_initted __read_mostly;
 
@@ -38,7 +50,7 @@ static unsigned int nft_hash_data(const struct nft_data *data,
        unsigned int h;
 
        h = jhash(data->data, len, nft_hash_rnd);
-       return ((u64)h * hsize) >> 32;
+       return h & (hsize - 1);
 }
 
 static bool nft_hash_lookup(const struct nft_set *set,
@@ -46,11 +58,12 @@ static bool nft_hash_lookup(const struct nft_set *set,
                            struct nft_data *data)
 {
        const struct nft_hash *priv = nft_set_priv(set);
+       const struct nft_hash_table *tbl = rcu_dereference(priv->tbl);
        const struct nft_hash_elem *he;
        unsigned int h;
 
-       h = nft_hash_data(key, priv->hsize, set->klen);
-       hlist_for_each_entry(he, &priv->hash[h], hnode) {
+       h = nft_hash_data(key, tbl->size, set->klen);
+       nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) {
                if (nft_data_cmp(&he->key, key, set->klen))
                        continue;
                if (set->flags & NFT_SET_MAP)
@@ -60,19 +73,148 @@ static bool nft_hash_lookup(const struct nft_set *set,
        return false;
 }
 
-static void nft_hash_elem_destroy(const struct nft_set *set,
-                                 struct nft_hash_elem *he)
+static void nft_hash_tbl_free(const struct nft_hash_table *tbl)
 {
-       nft_data_uninit(&he->key, NFT_DATA_VALUE);
-       if (set->flags & NFT_SET_MAP)
-               nft_data_uninit(he->data, set->dtype);
-       kfree(he);
+       if (is_vmalloc_addr(tbl))
+               vfree(tbl);
+       else
+               kfree(tbl);
+}
+
+static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets)
+{
+       struct nft_hash_table *tbl;
+       size_t size;
+
+       size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
+       tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN);
+       if (tbl == NULL)
+               tbl = vzalloc(size);
+       if (tbl == NULL)
+               return NULL;
+       tbl->size = nbuckets;
+
+       return tbl;
+}
+
+static void nft_hash_chain_unzip(const struct nft_set *set,
+                                const struct nft_hash_table *ntbl,
+                                struct nft_hash_table *tbl, unsigned int n)
+{
+       struct nft_hash_elem *he, *last, *next;
+       unsigned int h;
+
+       he = nft_dereference(tbl->buckets[n]);
+       if (he == NULL)
+               return;
+       h = nft_hash_data(&he->key, ntbl->size, set->klen);
+
+       /* Find last element of first chain hashing to bucket h */
+       last = he;
+       nft_hash_for_each_entry(he, he->next) {
+               if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
+                       break;
+               last = he;
+       }
+
+       /* Unlink first chain from the old table */
+       RCU_INIT_POINTER(tbl->buckets[n], last->next);
+
+       /* If end of chain reached, done */
+       if (he == NULL)
+               return;
+
+       /* Find first element of second chain hashing to bucket h */
+       next = NULL;
+       nft_hash_for_each_entry(he, he->next) {
+               if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
+                       continue;
+               next = he;
+               break;
+       }
+
+       /* Link the two chains */
+       RCU_INIT_POINTER(last->next, next);
+}
+
+static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv)
+{
+       struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
+       struct nft_hash_elem *he;
+       unsigned int i, h;
+       bool complete;
+
+       ntbl = nft_hash_tbl_alloc(tbl->size * 2);
+       if (ntbl == NULL)
+               return -ENOMEM;
+
+       /* Link new table's buckets to first element in the old table
+        * hashing to the new bucket.
+        */
+       for (i = 0; i < ntbl->size; i++) {
+               h = i < tbl->size ? i : i - tbl->size;
+               nft_hash_for_each_entry(he, tbl->buckets[h]) {
+                       if (nft_hash_data(&he->key, ntbl->size, set->klen) != i)
+                               continue;
+                       RCU_INIT_POINTER(ntbl->buckets[i], he);
+                       break;
+               }
+       }
+       ntbl->elements = tbl->elements;
+
+       /* Publish new table */
+       rcu_assign_pointer(priv->tbl, ntbl);
+
+       /* Unzip interleaved hash chains */
+       do {
+               /* Wait for readers to use new table/unzipped chains */
+               synchronize_rcu();
+
+               complete = true;
+               for (i = 0; i < tbl->size; i++) {
+                       nft_hash_chain_unzip(set, ntbl, tbl, i);
+                       if (tbl->buckets[i] != NULL)
+                               complete = false;
+               }
+       } while (!complete);
+
+       nft_hash_tbl_free(tbl);
+       return 0;
+}
+
+static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv)
+{
+       struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
+       struct nft_hash_elem __rcu **pprev;
+       unsigned int i;
+
+       ntbl = nft_hash_tbl_alloc(tbl->size / 2);
+       if (ntbl == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < ntbl->size; i++) {
+               ntbl->buckets[i] = tbl->buckets[i];
+
+               for (pprev = &ntbl->buckets[i]; *pprev != NULL;
+                    pprev = &nft_dereference(*pprev)->next)
+                       ;
+               RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]);
+       }
+       ntbl->elements = tbl->elements;
+
+       /* Publish new table */
+       rcu_assign_pointer(priv->tbl, ntbl);
+       synchronize_rcu();
+
+       nft_hash_tbl_free(tbl);
+       return 0;
 }
 
 static int nft_hash_insert(const struct nft_set *set,
                           const struct nft_set_elem *elem)
 {
        struct nft_hash *priv = nft_set_priv(set);
+       struct nft_hash_table *tbl = nft_dereference(priv->tbl);
        struct nft_hash_elem *he;
        unsigned int size, h;
 
@@ -91,33 +233,66 @@ static int nft_hash_insert(const struct nft_set *set,
        if (set->flags & NFT_SET_MAP)
                nft_data_copy(he->data, &elem->data);
 
-       h = nft_hash_data(&he->key, priv->hsize, set->klen);
-       hlist_add_head_rcu(&he->hnode, &priv->hash[h]);
+       h = nft_hash_data(&he->key, tbl->size, set->klen);
+       RCU_INIT_POINTER(he->next, tbl->buckets[h]);
+       rcu_assign_pointer(tbl->buckets[h], he);
+       tbl->elements++;
+
+       /* Expand table when exceeding 75% load */
+       if (tbl->elements > tbl->size / 4 * 3)
+               nft_hash_tbl_expand(set, priv);
+
        return 0;
 }
 
+static void nft_hash_elem_destroy(const struct nft_set *set,
+                                 struct nft_hash_elem *he)
+{
+       nft_data_uninit(&he->key, NFT_DATA_VALUE);
+       if (set->flags & NFT_SET_MAP)
+               nft_data_uninit(he->data, set->dtype);
+       kfree(he);
+}
+
 static void nft_hash_remove(const struct nft_set *set,
                            const struct nft_set_elem *elem)
 {
-       struct nft_hash_elem *he = elem->cookie;
+       struct nft_hash *priv = nft_set_priv(set);
+       struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+       struct nft_hash_elem *he, __rcu **pprev;
 
-       hlist_del_rcu(&he->hnode);
+       pprev = elem->cookie;
+       he = nft_dereference((*pprev));
+
+       RCU_INIT_POINTER(*pprev, he->next);
+       synchronize_rcu();
        kfree(he);
+       tbl->elements--;
+
+       /* Shrink table beneath 30% load */
+       if (tbl->elements < tbl->size * 3 / 10 &&
+           tbl->size > NFT_HASH_MIN_SIZE)
+               nft_hash_tbl_shrink(set, priv);
 }
 
 static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
 {
        const struct nft_hash *priv = nft_set_priv(set);
+       const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+       struct nft_hash_elem __rcu * const *pprev;
        struct nft_hash_elem *he;
        unsigned int h;
 
-       h = nft_hash_data(&elem->key, priv->hsize, set->klen);
-       hlist_for_each_entry(he, &priv->hash[h], hnode) {
-               if (nft_data_cmp(&he->key, &elem->key, set->klen))
+       h = nft_hash_data(&elem->key, tbl->size, set->klen);
+       pprev = &tbl->buckets[h];
+       nft_hash_for_each_entry(he, tbl->buckets[h]) {
+               if (nft_data_cmp(&he->key, &elem->key, set->klen)) {
+                       pprev = &he->next;
                        continue;
+               }
 
-               elem->cookie = he;
-               elem->flags  = 0;
+               elem->cookie = (void *)pprev;
+               elem->flags = 0;
                if (set->flags & NFT_SET_MAP)
                        nft_data_copy(&elem->data, he->data);
                return 0;
@@ -129,12 +304,13 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
                          struct nft_set_iter *iter)
 {
        const struct nft_hash *priv = nft_set_priv(set);
+       const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
        const struct nft_hash_elem *he;
        struct nft_set_elem elem;
        unsigned int i;
 
-       for (i = 0; i < priv->hsize; i++) {
-               hlist_for_each_entry(he, &priv->hash[i], hnode) {
+       for (i = 0; i < tbl->size; i++) {
+               nft_hash_for_each_entry(he, tbl->buckets[i]) {
                        if (iter->count < iter->skip)
                                goto cont;
 
@@ -161,43 +337,35 @@ static int nft_hash_init(const struct nft_set *set,
                         const struct nlattr * const tb[])
 {
        struct nft_hash *priv = nft_set_priv(set);
-       unsigned int cnt, i;
+       struct nft_hash_table *tbl;
 
        if (unlikely(!nft_hash_rnd_initted)) {
                get_random_bytes(&nft_hash_rnd, 4);
                nft_hash_rnd_initted = true;
        }
 
-       /* Aim for a load factor of 0.75 */
-       // FIXME: temporarily broken until we have set descriptions
-       cnt = 100;
-       cnt = cnt * 4 / 3;
-
-       priv->hash = kcalloc(cnt, sizeof(struct hlist_head), GFP_KERNEL);
-       if (priv->hash == NULL)
+       tbl = nft_hash_tbl_alloc(NFT_HASH_MIN_SIZE);
+       if (tbl == NULL)
                return -ENOMEM;
-       priv->hsize = cnt;
-
-       for (i = 0; i < cnt; i++)
-               INIT_HLIST_HEAD(&priv->hash[i]);
-
+       RCU_INIT_POINTER(priv->tbl, tbl);
        return 0;
 }
 
 static void nft_hash_destroy(const struct nft_set *set)
 {
        const struct nft_hash *priv = nft_set_priv(set);
-       const struct hlist_node *next;
-       struct nft_hash_elem *elem;
+       const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+       struct nft_hash_elem *he, *next;
        unsigned int i;
 
-       for (i = 0; i < priv->hsize; i++) {
-               hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) {
-                       hlist_del(&elem->hnode);
-                       nft_hash_elem_destroy(set, elem);
+       for (i = 0; i < tbl->size; i++) {
+               for (he = nft_dereference(tbl->buckets[i]); he != NULL;
+                    he = next) {
+                       next = nft_dereference(he->next);
+                       nft_hash_elem_destroy(set, he);
                }
        }
-       kfree(priv->hash);
+       kfree(tbl);
 }
 
 static struct nft_set_ops nft_hash_ops __read_mostly = {
index f169501f1ad4389970613d1e44fb2b60759406cb..810385eb7249c7be39bcbc0e6c34abc1a6b1708d 100644 (file)
@@ -70,7 +70,8 @@ err1:
        return err;
 }
 
-static void nft_immediate_destroy(const struct nft_expr *expr)
+static void nft_immediate_destroy(const struct nft_ctx *ctx,
+                                 const struct nft_expr *expr)
 {
        const struct nft_immediate_expr *priv = nft_expr_priv(expr);
        return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg));
index 26c5154e05f3fc09aae2ebdea968ce564a3d5eb9..10cfb156cdf4449dee49e53667549e5c2fccdf82 100644 (file)
@@ -74,7 +74,8 @@ static int nft_log_init(const struct nft_ctx *ctx,
        return 0;
 }
 
-static void nft_log_destroy(const struct nft_expr *expr)
+static void nft_log_destroy(const struct nft_ctx *ctx,
+                           const struct nft_expr *expr)
 {
        struct nft_log *priv = nft_expr_priv(expr);
 
index bb4ef4cccb6efcdf937bc3417eb8dbd6ccf0e22f..7fd2bea8aa239f347dc461c7bc45869dac405573 100644 (file)
@@ -89,11 +89,12 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
        return 0;
 }
 
-static void nft_lookup_destroy(const struct nft_expr *expr)
+static void nft_lookup_destroy(const struct nft_ctx *ctx,
+                              const struct nft_expr *expr)
 {
        struct nft_lookup *priv = nft_expr_priv(expr);
 
-       nf_tables_unbind_set(NULL, priv->set, &priv->binding);
+       nf_tables_unbind_set(ctx, priv->set, &priv->binding);
 }
 
 static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
index d3b1ffe26181b22538ab29fefadd7a41d96b76f1..a0195d28bcfc2e5b90025ac9ceb14499d1747934 100644 (file)
@@ -31,8 +31,8 @@ struct nft_nat {
        enum nft_registers      sreg_addr_max:8;
        enum nft_registers      sreg_proto_min:8;
        enum nft_registers      sreg_proto_max:8;
-       int                     family;
-       enum nf_nat_manip_type  type;
+       enum nf_nat_manip_type  type:8;
+       u8                      family;
 };
 
 static void nft_nat_eval(const struct nft_expr *expr,
@@ -88,6 +88,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
                        const struct nlattr * const tb[])
 {
        struct nft_nat *priv = nft_expr_priv(expr);
+       u32 family;
        int err;
 
        if (tb[NFTA_NAT_TYPE] == NULL)
@@ -107,9 +108,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
        if (tb[NFTA_NAT_FAMILY] == NULL)
                return -EINVAL;
 
-       priv->family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
-       if (priv->family != AF_INET && priv->family != AF_INET6)
-               return -EINVAL;
+       family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
+       if (family != AF_INET && family != AF_INET6)
+               return -EAFNOSUPPORT;
+       if (family != ctx->afi->family)
+               return -EOPNOTSUPP;
+       priv->family = family;
 
        if (tb[NFTA_NAT_REG_ADDR_MIN]) {
                priv->sreg_addr_min = ntohl(nla_get_be32(
@@ -202,13 +206,7 @@ static struct nft_expr_type nft_nat_type __read_mostly = {
 
 static int __init nft_nat_module_init(void)
 {
-       int err;
-
-       err = nft_register_expr(&nft_nat_type);
-       if (err < 0)
-               return err;
-
-       return 0;
+       return nft_register_expr(&nft_nat_type);
 }
 
 static void __exit nft_nat_module_exit(void)
index 3228d7f24eb4107ff717af4323e834af3e680797..4973cbddc446bd50a377765bec23128716ff630e 100644 (file)
@@ -146,11 +146,11 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
                if (par->family == NFPROTO_BRIDGE) {
                        switch (eth_hdr(skb)->h_proto) {
-                       case __constant_htons(ETH_P_IP):
+                       case htons(ETH_P_IP):
                                audit_ip4(ab, skb);
                                break;
 
-                       case __constant_htons(ETH_P_IPV6):
+                       case htons(ETH_P_IPV6):
                                audit_ip6(ab, skb);
                                break;
                        }
index c40b2695633b15960ac65eb84ebc2405b4aa2c1c..458464e7bd7a841915aeebdd448cb17bb49c01e4 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/jhash.h>
 #include <linux/slab.h>
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/skbuff.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 
+#define CONNLIMIT_SLOTS                32
+#define CONNLIMIT_LOCK_SLOTS   32
+#define CONNLIMIT_GC_MAX_NODES 8
+
 /* we will save the tuples of all connections we care about */
 struct xt_connlimit_conn {
        struct hlist_node               node;
@@ -38,16 +43,26 @@ struct xt_connlimit_conn {
        union nf_inet_addr              addr;
 };
 
+struct xt_connlimit_rb {
+       struct rb_node node;
+       struct hlist_head hhead; /* connections/hosts in same subnet */
+       union nf_inet_addr addr; /* search key */
+};
+
 struct xt_connlimit_data {
-       struct hlist_head       iphash[256];
-       spinlock_t              lock;
+       struct rb_root climit_root4[CONNLIMIT_SLOTS];
+       struct rb_root climit_root6[CONNLIMIT_SLOTS];
+       spinlock_t              locks[CONNLIMIT_LOCK_SLOTS];
 };
 
 static u_int32_t connlimit_rnd __read_mostly;
+static struct kmem_cache *connlimit_rb_cachep __read_mostly;
+static struct kmem_cache *connlimit_conn_cachep __read_mostly;
 
 static inline unsigned int connlimit_iphash(__be32 addr)
 {
-       return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF;
+       return jhash_1word((__force __u32)addr,
+                           connlimit_rnd) % CONNLIMIT_SLOTS;
 }
 
 static inline unsigned int
@@ -60,7 +75,8 @@ connlimit_iphash6(const union nf_inet_addr *addr,
        for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i)
                res.ip6[i] = addr->ip6[i] & mask->ip6[i];
 
-       return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF;
+       return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6),
+                      connlimit_rnd) % CONNLIMIT_SLOTS;
 }
 
 static inline bool already_closed(const struct nf_conn *conn)
@@ -72,13 +88,14 @@ static inline bool already_closed(const struct nf_conn *conn)
                return 0;
 }
 
-static inline unsigned int
+static int
 same_source_net(const union nf_inet_addr *addr,
                const union nf_inet_addr *mask,
                const union nf_inet_addr *u3, u_int8_t family)
 {
        if (family == NFPROTO_IPV4) {
-               return (addr->ip & mask->ip) == (u3->ip & mask->ip);
+               return ntohl(addr->ip & mask->ip) -
+                      ntohl(u3->ip & mask->ip);
        } else {
                union nf_inet_addr lh, rh;
                unsigned int i;
@@ -88,89 +105,205 @@ same_source_net(const union nf_inet_addr *addr,
                        rh.ip6[i] = u3->ip6[i] & mask->ip6[i];
                }
 
-               return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0;
+               return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6));
        }
 }
 
-static int count_them(struct net *net,
-                     struct xt_connlimit_data *data,
+static bool add_hlist(struct hlist_head *head,
                      const struct nf_conntrack_tuple *tuple,
-                     const union nf_inet_addr *addr,
-                     const union nf_inet_addr *mask,
-                     u_int8_t family)
+                     const union nf_inet_addr *addr)
+{
+       struct xt_connlimit_conn *conn;
+
+       conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
+       if (conn == NULL)
+               return false;
+       conn->tuple = *tuple;
+       conn->addr = *addr;
+       hlist_add_head(&conn->node, head);
+       return true;
+}
+
+static unsigned int check_hlist(struct net *net,
+                               struct hlist_head *head,
+                               const struct nf_conntrack_tuple *tuple,
+                               bool *addit)
 {
        const struct nf_conntrack_tuple_hash *found;
        struct xt_connlimit_conn *conn;
        struct hlist_node *n;
        struct nf_conn *found_ct;
-       struct hlist_head *hash;
-       bool addit = true;
-       int matches = 0;
-
-       if (family == NFPROTO_IPV6)
-               hash = &data->iphash[connlimit_iphash6(addr, mask)];
-       else
-               hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)];
+       unsigned int length = 0;
 
+       *addit = true;
        rcu_read_lock();
 
        /* check the saved connections */
-       hlist_for_each_entry_safe(conn, n, hash, node) {
+       hlist_for_each_entry_safe(conn, n, head, node) {
                found    = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE,
                                                 &conn->tuple);
-               found_ct = NULL;
+               if (found == NULL) {
+                       hlist_del(&conn->node);
+                       kmem_cache_free(connlimit_conn_cachep, conn);
+                       continue;
+               }
 
-               if (found != NULL)
-                       found_ct = nf_ct_tuplehash_to_ctrack(found);
+               found_ct = nf_ct_tuplehash_to_ctrack(found);
 
-               if (found_ct != NULL &&
-                   nf_ct_tuple_equal(&conn->tuple, tuple) &&
-                   !already_closed(found_ct))
+               if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
                        /*
                         * Just to be sure we have it only once in the list.
                         * We should not see tuples twice unless someone hooks
                         * this into a table without "-p tcp --syn".
                         */
-                       addit = false;
-
-               if (found == NULL) {
-                       /* this one is gone */
-                       hlist_del(&conn->node);
-                       kfree(conn);
-                       continue;
-               }
-
-               if (already_closed(found_ct)) {
+                       *addit = false;
+               } else if (already_closed(found_ct)) {
                        /*
                         * we do not care about connections which are
                         * closed already -> ditch it
                         */
                        nf_ct_put(found_ct);
                        hlist_del(&conn->node);
-                       kfree(conn);
+                       kmem_cache_free(connlimit_conn_cachep, conn);
                        continue;
                }
 
-               if (same_source_net(addr, mask, &conn->addr, family))
-                       /* same source network -> be counted! */
-                       ++matches;
                nf_ct_put(found_ct);
+               length++;
        }
 
        rcu_read_unlock();
 
-       if (addit) {
-               /* save the new connection in our list */
-               conn = kmalloc(sizeof(*conn), GFP_ATOMIC);
-               if (conn == NULL)
-                       return -ENOMEM;
-               conn->tuple = *tuple;
-               conn->addr = *addr;
-               hlist_add_head(&conn->node, hash);
-               ++matches;
+       return length;
+}
+
+static void tree_nodes_free(struct rb_root *root,
+                           struct xt_connlimit_rb *gc_nodes[],
+                           unsigned int gc_count)
+{
+       struct xt_connlimit_rb *rbconn;
+
+       while (gc_count) {
+               rbconn = gc_nodes[--gc_count];
+               rb_erase(&rbconn->node, root);
+               kmem_cache_free(connlimit_rb_cachep, rbconn);
+       }
+}
+
+static unsigned int
+count_tree(struct net *net, struct rb_root *root,
+          const struct nf_conntrack_tuple *tuple,
+          const union nf_inet_addr *addr, const union nf_inet_addr *mask,
+          u8 family)
+{
+       struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
+       struct rb_node **rbnode, *parent;
+       struct xt_connlimit_rb *rbconn;
+       struct xt_connlimit_conn *conn;
+       unsigned int gc_count;
+       bool no_gc = false;
+
+ restart:
+       gc_count = 0;
+       parent = NULL;
+       rbnode = &(root->rb_node);
+       while (*rbnode) {
+               int diff;
+               bool addit;
+
+               rbconn = container_of(*rbnode, struct xt_connlimit_rb, node);
+
+               parent = *rbnode;
+               diff = same_source_net(addr, mask, &rbconn->addr, family);
+               if (diff < 0) {
+                       rbnode = &((*rbnode)->rb_left);
+               } else if (diff > 0) {
+                       rbnode = &((*rbnode)->rb_right);
+               } else {
+                       /* same source network -> be counted! */
+                       unsigned int count;
+                       count = check_hlist(net, &rbconn->hhead, tuple, &addit);
+
+                       tree_nodes_free(root, gc_nodes, gc_count);
+                       if (!addit)
+                               return count;
+
+                       if (!add_hlist(&rbconn->hhead, tuple, addr))
+                               return 0; /* hotdrop */
+
+                       return count + 1;
+               }
+
+               if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
+                       continue;
+
+               /* only used for GC on hhead, retval and 'addit' ignored */
+               check_hlist(net, &rbconn->hhead, tuple, &addit);
+               if (hlist_empty(&rbconn->hhead))
+                       gc_nodes[gc_count++] = rbconn;
+       }
+
+       if (gc_count) {
+               no_gc = true;
+               tree_nodes_free(root, gc_nodes, gc_count);
+               /* tree_nodes_free before new allocation permits
+                * allocator to re-use newly free'd object.
+                *
+                * This is a rare event; in most cases we will find
+                * existing node to re-use. (or gc_count is 0).
+                */
+               goto restart;
+       }
+
+       /* no match, need to insert new node */
+       rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC);
+       if (rbconn == NULL)
+               return 0;
+
+       conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
+       if (conn == NULL) {
+               kmem_cache_free(connlimit_rb_cachep, rbconn);
+               return 0;
+       }
+
+       conn->tuple = *tuple;
+       conn->addr = *addr;
+       rbconn->addr = *addr;
+
+       INIT_HLIST_HEAD(&rbconn->hhead);
+       hlist_add_head(&conn->node, &rbconn->hhead);
+
+       rb_link_node(&rbconn->node, parent, rbnode);
+       rb_insert_color(&rbconn->node, root);
+       return 1;
+}
+
+static int count_them(struct net *net,
+                     struct xt_connlimit_data *data,
+                     const struct nf_conntrack_tuple *tuple,
+                     const union nf_inet_addr *addr,
+                     const union nf_inet_addr *mask,
+                     u_int8_t family)
+{
+       struct rb_root *root;
+       int count;
+       u32 hash;
+
+       if (family == NFPROTO_IPV6) {
+               hash = connlimit_iphash6(addr, mask);
+               root = &data->climit_root6[hash];
+       } else {
+               hash = connlimit_iphash(addr->ip & mask->ip);
+               root = &data->climit_root4[hash];
        }
 
-       return matches;
+       spin_lock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]);
+
+       count = count_tree(net, root, tuple, addr, mask, family);
+
+       spin_unlock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]);
+
+       return count;
 }
 
 static bool
@@ -183,7 +316,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
        const struct nf_conntrack_tuple *tuple_ptr = &tuple;
        enum ip_conntrack_info ctinfo;
        const struct nf_conn *ct;
-       int connections;
+       unsigned int connections;
 
        ct = nf_ct_get(skb, &ctinfo);
        if (ct != NULL)
@@ -202,12 +335,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
                          iph->daddr : iph->saddr;
        }
 
-       spin_lock_bh(&info->data->lock);
        connections = count_them(net, info->data, tuple_ptr, &addr,
                                 &info->mask, par->family);
-       spin_unlock_bh(&info->data->lock);
-
-       if (connections < 0)
+       if (connections == 0)
                /* kmalloc failed, drop it entirely */
                goto hotdrop;
 
@@ -247,29 +377,47 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
                return -ENOMEM;
        }
 
-       spin_lock_init(&info->data->lock);
-       for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i)
-               INIT_HLIST_HEAD(&info->data->iphash[i]);
+       for (i = 0; i < ARRAY_SIZE(info->data->locks); ++i)
+               spin_lock_init(&info->data->locks[i]);
+
+       for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+               info->data->climit_root4[i] = RB_ROOT;
+       for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+               info->data->climit_root6[i] = RB_ROOT;
 
        return 0;
 }
 
-static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
+static void destroy_tree(struct rb_root *r)
 {
-       const struct xt_connlimit_info *info = par->matchinfo;
        struct xt_connlimit_conn *conn;
+       struct xt_connlimit_rb *rbconn;
        struct hlist_node *n;
-       struct hlist_head *hash = info->data->iphash;
+       struct rb_node *node;
+
+       while ((node = rb_first(r)) != NULL) {
+               rbconn = container_of(node, struct xt_connlimit_rb, node);
+
+               rb_erase(node, r);
+
+               hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
+                       kmem_cache_free(connlimit_conn_cachep, conn);
+
+               kmem_cache_free(connlimit_rb_cachep, rbconn);
+       }
+}
+
+static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+       const struct xt_connlimit_info *info = par->matchinfo;
        unsigned int i;
 
        nf_ct_l3proto_module_put(par->family);
 
-       for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) {
-               hlist_for_each_entry_safe(conn, n, &hash[i], node) {
-                       hlist_del(&conn->node);
-                       kfree(conn);
-               }
-       }
+       for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+               destroy_tree(&info->data->climit_root4[i]);
+       for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+               destroy_tree(&info->data->climit_root6[i]);
 
        kfree(info->data);
 }
@@ -287,12 +435,37 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
 
 static int __init connlimit_mt_init(void)
 {
-       return xt_register_match(&connlimit_mt_reg);
+       int ret;
+
+       BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS);
+       BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0);
+
+       connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn",
+                                          sizeof(struct xt_connlimit_conn),
+                                          0, 0, NULL);
+       if (!connlimit_conn_cachep)
+               return -ENOMEM;
+
+       connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb",
+                                          sizeof(struct xt_connlimit_rb),
+                                          0, 0, NULL);
+       if (!connlimit_rb_cachep) {
+               kmem_cache_destroy(connlimit_conn_cachep);
+               return -ENOMEM;
+       }
+       ret = xt_register_match(&connlimit_mt_reg);
+       if (ret != 0) {
+               kmem_cache_destroy(connlimit_conn_cachep);
+               kmem_cache_destroy(connlimit_rb_cachep);
+       }
+       return ret;
 }
 
 static void __exit connlimit_mt_exit(void)
 {
        xt_unregister_match(&connlimit_mt_reg);
+       kmem_cache_destroy(connlimit_conn_cachep);
+       kmem_cache_destroy(connlimit_rb_cachep);
 }
 
 module_init(connlimit_mt_init);
index a4c7561698c5c2fa5dd9b79717e0cc7116347e54..89d53104c6b365b12c76ff684064bc5d032656c3 100644 (file)
@@ -60,7 +60,7 @@ static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par)
        }
 
        return spi_match(compinfo->spis[0], compinfo->spis[1],
-                        ntohl(chdr->cpi << 16),
+                        ntohs(chdr->cpi),
                         !!(compinfo->invflags & XT_IPCOMP_INV_SPI));
 }
 
index a366537f82c6b7981472cdcc51e977a139f3b540..63a3ce75c02ee959fe67d6b203e01420d86b03ad 100644 (file)
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 
-#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
+#define HTSIZE 256
 
 struct fw_head {
-       struct fw_filter *ht[HTSIZE];
-       u32 mask;
+       u32                     mask;
+       struct fw_filter        *ht[HTSIZE];
 };
 
 struct fw_filter {
@@ -46,30 +46,11 @@ struct fw_filter {
        struct tcf_exts         exts;
 };
 
-static inline int fw_hash(u32 handle)
+static u32 fw_hash(u32 handle)
 {
-       if (HTSIZE == 4096)
-               return ((handle >> 24) & 0xFFF) ^
-                      ((handle >> 12) & 0xFFF) ^
-                      (handle & 0xFFF);
-       else if (HTSIZE == 2048)
-               return ((handle >> 22) & 0x7FF) ^
-                      ((handle >> 11) & 0x7FF) ^
-                      (handle & 0x7FF);
-       else if (HTSIZE == 1024)
-               return ((handle >> 20) & 0x3FF) ^
-                      ((handle >> 10) & 0x3FF) ^
-                      (handle & 0x3FF);
-       else if (HTSIZE == 512)
-               return (handle >> 27) ^
-                      ((handle >> 18) & 0x1FF) ^
-                      ((handle >> 9) & 0x1FF) ^
-                      (handle & 0x1FF);
-       else if (HTSIZE == 256) {
-               u8 *t = (u8 *) &handle;
-               return t[0] ^ t[1] ^ t[2] ^ t[3];
-       } else
-               return handle & (HTSIZE - 1);
+       handle ^= (handle >> 16);
+       handle ^= (handle >> 8);
+       return handle % HTSIZE;
 }
 
 static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
index 4218164f4f5e488eca0242210e5bdc270c21a433..85d1d476461257b3e248bd870cd9f7217fa21657 100644 (file)
 
 static struct kmem_cache *secpath_cachep __read_mostly;
 
+static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
+static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO];
+
+int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
+{
+       int err = 0;
+
+       if (unlikely(afinfo == NULL))
+               return -EINVAL;
+       if (unlikely(afinfo->family >= NPROTO))
+               return -EAFNOSUPPORT;
+       spin_lock_bh(&xfrm_input_afinfo_lock);
+       if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL))
+               err = -ENOBUFS;
+       else
+               rcu_assign_pointer(xfrm_input_afinfo[afinfo->family], afinfo);
+       spin_unlock_bh(&xfrm_input_afinfo_lock);
+       return err;
+}
+EXPORT_SYMBOL(xfrm_input_register_afinfo);
+
+int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
+{
+       int err = 0;
+
+       if (unlikely(afinfo == NULL))
+               return -EINVAL;
+       if (unlikely(afinfo->family >= NPROTO))
+               return -EAFNOSUPPORT;
+       spin_lock_bh(&xfrm_input_afinfo_lock);
+       if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) {
+               if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo))
+                       err = -EINVAL;
+               else
+                       RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->family], NULL);
+       }
+       spin_unlock_bh(&xfrm_input_afinfo_lock);
+       synchronize_rcu();
+       return err;
+}
+EXPORT_SYMBOL(xfrm_input_unregister_afinfo);
+
+static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
+{
+       struct xfrm_input_afinfo *afinfo;
+
+       if (unlikely(family >= NPROTO))
+               return NULL;
+       rcu_read_lock();
+       afinfo = rcu_dereference(xfrm_input_afinfo[family]);
+       if (unlikely(!afinfo))
+               rcu_read_unlock();
+       return afinfo;
+}
+
+static void xfrm_input_put_afinfo(struct xfrm_input_afinfo *afinfo)
+{
+       rcu_read_unlock();
+}
+
+static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
+                      int err)
+{
+       int ret;
+       struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
+
+       if (!afinfo)
+               return -EAFNOSUPPORT;
+
+       ret = afinfo->callback(skb, protocol, err);
+       xfrm_input_put_afinfo(afinfo);
+
+       return ret;
+}
+
 void __secpath_destroy(struct sec_path *sp)
 {
        int i;
index 06970fee91552ecfd04f502588cd5b62e2b10be9..8e9c781a6bbaaba83e4af4a31ac7a07a70dd6c71 100644 (file)
@@ -1609,7 +1609,7 @@ unlock:
 EXPORT_SYMBOL(xfrm_alloc_spi);
 
 static bool __xfrm_state_filter_match(struct xfrm_state *x,
-                                     struct xfrm_filter *filter)
+                                     struct xfrm_address_filter *filter)
 {
        if (filter) {
                if ((filter->family == AF_INET ||
@@ -1668,7 +1668,7 @@ out:
 EXPORT_SYMBOL(xfrm_state_walk);
 
 void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto,
-                         struct xfrm_filter *filter)
+                         struct xfrm_address_filter *filter)
 {
        INIT_LIST_HEAD(&walk->all);
        walk->proto = proto;
index 195dbe230b982483e900dd3950c6fb9c3c4d530d..cdd9e9c7ff0e10a9669a86ef308bead8bfbc525e 100644 (file)
@@ -899,7 +899,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
 
        if (!cb->args[0]) {
                struct nlattr *attrs[XFRMA_MAX+1];
-               struct xfrm_filter *filter = NULL;
+               struct xfrm_address_filter *filter = NULL;
                u8 proto = 0;
                int err;
 
@@ -910,12 +910,12 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
                if (err < 0)
                        return err;
 
-               if (attrs[XFRMA_FILTER]) {
+               if (attrs[XFRMA_ADDRESS_FILTER]) {
                        filter = kmalloc(sizeof(*filter), GFP_KERNEL);
                        if (filter == NULL)
                                return -ENOMEM;
 
-                       memcpy(filter, nla_data(attrs[XFRMA_FILTER]),
+                       memcpy(filter, nla_data(attrs[XFRMA_ADDRESS_FILTER]),
                               sizeof(*filter));
                }
 
@@ -2329,7 +2329,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
        [XFRMA_REPLAY_ESN_VAL]  = { .len = sizeof(struct xfrm_replay_state_esn) },
        [XFRMA_SA_EXTRA_FLAGS]  = { .type = NLA_U32 },
        [XFRMA_PROTO]           = { .type = NLA_U8 },
-       [XFRMA_FILTER]          = { .len = sizeof(struct xfrm_filter) },
+       [XFRMA_ADDRESS_FILTER]  = { .len = sizeof(struct xfrm_address_filter) },
 };
 
 static const struct xfrm_link {