packages: kernel/kernel-imq.patch, kernel/kernel.spec - updated imq patch
arekm
arekm at pld-linux.org
Wed Aug 25 19:02:52 CEST 2010
Author: arekm Date: Wed Aug 25 17:02:52 2010 GMT
Module: packages Tag: HEAD
---- Log message:
- updated imq patch
---- Files affected:
packages/kernel:
kernel-imq.patch (1.10 -> 1.11) , kernel.spec (1.810 -> 1.811)
---- Diffs:
================================================================
Index: packages/kernel/kernel-imq.patch
diff -u packages/kernel/kernel-imq.patch:1.10 packages/kernel/kernel-imq.patch:1.11
--- packages/kernel/kernel-imq.patch:1.10 Thu Aug 5 21:52:26 2010
+++ packages/kernel/kernel-imq.patch Wed Aug 25 19:02:44 2010
@@ -1,7 +1,7 @@
-diff -uNr linux-2.6.34/drivers/net/imq.c linux-2.6.34-imq/drivers/net/imq.c
---- linux-2.6.34/drivers/net/imq.c 1970-01-01 02:00:00.000000000 +0200
-+++ linux-2.6.34-imq/drivers/net/imq.c 2010-06-02 10:05:45.752109073 +0300
-@@ -0,0 +1,635 @@
+diff -uNr linux-2.6.35/drivers/net/imq.c linux-2.6.35-imq-multiqueue-test1/drivers/net/imq.c
+--- linux-2.6.35/drivers/net/imq.c 1970-01-01 02:00:00.000000000 +0200
++++ linux-2.6.35-imq-multiqueue-test1/drivers/net/imq.c 2010-08-15 13:54:30.070063067 +0300
+@@ -0,0 +1,774 @@
+/*
+ * Pseudo-driver for the intermediate queue device.
+ *
@@ -51,7 +51,7 @@
+ * I didn't forget anybody). I apologize again for my lack of time.
+ *
+ *
-+ * 2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead
++ * 2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead
+ * of qdisc_restart() and moved qdisc_run() to tasklet to avoid
+ * recursive locking. New initialization routines to fix 'rmmod' not
+ * working anymore. Used code from ifb.c. (Jussi Kivilinna)
@@ -86,6 +86,22 @@
+ * 2010/02/25 - (Jussi Kivilinna)
+ * - Port to 2.6.33
+ *
++ * 2010/08/15 - (Jussi Kivilinna)
++ * - Port to 2.6.35
++ * - Simplify hook registration by using nf_register_hooks.
++ * - nf_reinject doesn't need a spinlock around it, so the
++ * imq_nf_reinject function is removed. Other nf_reinject users
++ * protect their own data with spinlocks; with IMQ, however, all
++ * the data that is needed is stored per skbuff, so no locking
++ * is needed.
++ * - Changed IMQ to use a separate NF_IMQ_QUEUE verdict instead of
++ * NF_QUEUE; this allows working coexistence of IMQ and other
++ * NF_QUEUE users.
++ * - Make IMQ multi-queue. The number of IMQ device queues can be
++ * increased with the 'numqueues' module parameter. The default
++ * number of queues is 1; in other words, by default IMQ works as
++ * a single-queue device. Multi-queue selection is based on the
++ * IFB multi-queue patch by Changli Gao <xiaosuo at gmail.com>.
++ *
+ * Also, many thanks to Pablo Sebastian Greco for making the initial
+ * patch and to those who helped the testing.
+ *
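The NF_IMQ_QUEUE item above is the key compatibility change: rather than overloading NF_QUEUE (whose queue numbers belong to ip_queue/nfnetlink_queue), IMQ now returns a verdict of its own. A minimal sketch of the idea, assuming the NF_IMQ_QUEUE constant and its nf_queue dispatch from the parts of this patch set not shown here:

	/* Sketch only: defer IMQ-marked packets to the IMQ queue handler
	 * and let everything else pass, so regular NF_QUEUE users keep
	 * working alongside IMQ. NF_IMQ_QUEUE is defined elsewhere in
	 * this patch set. */
	static unsigned int sketch_hook(unsigned int hooknum, struct sk_buff *skb,
					const struct net_device *in,
					const struct net_device *out,
					int (*okfn)(struct sk_buff *))
	{
		if (skb->imq_flags & IMQ_F_ENQUEUE)	/* set by the iptables IMQ target */
			return NF_IMQ_QUEUE;		/* dispatched to imq_nf_queue() */
		return NF_ACCEPT;
	}

This is exactly the shape of the rewritten imq_nf_hook further down in the diff.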
@@ -109,66 +125,81 @@
+#include <linux/imq.h>
+#include <net/pkt_sched.h>
+#include <net/netfilter/nf_queue.h>
++#include <net/sock.h>
++#include <linux/ip.h>
++#include <linux/ipv6.h>
++#include <linux/if_vlan.h>
++#include <linux/if_pppox.h>
++#include <net/ip.h>
++#include <net/ipv6.h>
++
++static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num);
+
+static nf_hookfn imq_nf_hook;
+
-+static struct nf_hook_ops imq_ingress_ipv4 = {
-+ .hook = imq_nf_hook,
-+ .owner = THIS_MODULE,
-+ .pf = PF_INET,
-+ .hooknum = NF_INET_PRE_ROUTING,
++static struct nf_hook_ops imq_ops[] = {
++ {
++ /* imq_ingress_ipv4 */
++ .hook = imq_nf_hook,
++ .owner = THIS_MODULE,
++ .pf = PF_INET,
++ .hooknum = NF_INET_PRE_ROUTING,
+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
-+ .priority = NF_IP_PRI_MANGLE + 1
++ .priority = NF_IP_PRI_MANGLE + 1,
+#else
-+ .priority = NF_IP_PRI_NAT_DST + 1
++ .priority = NF_IP_PRI_NAT_DST + 1,
+#endif
-+};
-+
-+static struct nf_hook_ops imq_egress_ipv4 = {
-+ .hook = imq_nf_hook,
-+ .owner = THIS_MODULE,
-+ .pf = PF_INET,
-+ .hooknum = NF_INET_POST_ROUTING,
++ },
++ {
++ /* imq_egress_ipv4 */
++ .hook = imq_nf_hook,
++ .owner = THIS_MODULE,
++ .pf = PF_INET,
++ .hooknum = NF_INET_POST_ROUTING,
+#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
-+ .priority = NF_IP_PRI_LAST
++ .priority = NF_IP_PRI_LAST,
+#else
-+ .priority = NF_IP_PRI_NAT_SRC - 1
++ .priority = NF_IP_PRI_NAT_SRC - 1,
+#endif
-+};
-+
++ },
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-+static struct nf_hook_ops imq_ingress_ipv6 = {
-+ .hook = imq_nf_hook,
-+ .owner = THIS_MODULE,
-+ .pf = PF_INET6,
-+ .hooknum = NF_INET_PRE_ROUTING,
++ {
++ /* imq_ingress_ipv6 */
++ .hook = imq_nf_hook,
++ .owner = THIS_MODULE,
++ .pf = PF_INET6,
++ .hooknum = NF_INET_PRE_ROUTING,
+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
-+ .priority = NF_IP6_PRI_MANGLE + 1
++ .priority = NF_IP6_PRI_MANGLE + 1,
+#else
-+ .priority = NF_IP6_PRI_NAT_DST + 1
++ .priority = NF_IP6_PRI_NAT_DST + 1,
+#endif
-+};
-+
-+static struct nf_hook_ops imq_egress_ipv6 = {
-+ .hook = imq_nf_hook,
-+ .owner = THIS_MODULE,
-+ .pf = PF_INET6,
-+ .hooknum = NF_INET_POST_ROUTING,
++ },
++ {
++ /* imq_egress_ipv6 */
++ .hook = imq_nf_hook,
++ .owner = THIS_MODULE,
++ .pf = PF_INET6,
++ .hooknum = NF_INET_POST_ROUTING,
+#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
-+ .priority = NF_IP6_PRI_LAST
++ .priority = NF_IP6_PRI_LAST,
+#else
-+ .priority = NF_IP6_PRI_NAT_SRC - 1
++ .priority = NF_IP6_PRI_NAT_SRC - 1,
+#endif
-+};
++ },
+#endif
++};
+
+#if defined(CONFIG_IMQ_NUM_DEVS)
-+static unsigned int numdevs = CONFIG_IMQ_NUM_DEVS;
++static int numdevs = CONFIG_IMQ_NUM_DEVS;
+#else
-+static unsigned int numdevs = IMQ_MAX_DEVS;
++static int numdevs = IMQ_MAX_DEVS;
+#endif
+
-+static DEFINE_SPINLOCK(imq_nf_queue_lock);
++#define IMQ_MAX_QUEUES 32
++static int numqueues = 1;
++
++/*static DEFINE_SPINLOCK(imq_nf_queue_lock);*/
+
+static struct net_device *imq_devs_cache[IMQ_MAX_DEVS];
+
@@ -193,49 +224,6 @@
+ skb_restore_cb(skb); /* kfree backup */
+}
+
-+/* locking not needed when called from imq_nf_queue */
-+static void imq_nf_reinject_lockless(struct nf_queue_entry *entry,
-+ unsigned int verdict)
-+{
-+ int status;
-+
-+ if (!entry->next_outfn) {
-+ nf_reinject(entry, verdict);
-+ return;
-+ }
-+
-+ status = entry->next_outfn(entry, entry->next_queuenum);
-+ if (status < 0) {
-+ nf_queue_entry_release_refs(entry);
-+ kfree_skb(entry->skb);
-+ kfree(entry);
-+ }
-+}
-+
-+static void imq_nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
-+{
-+ int status;
-+
-+ if (!entry->next_outfn) {
-+ spin_lock_bh(&imq_nf_queue_lock);
-+ nf_reinject(entry, verdict);
-+ spin_unlock_bh(&imq_nf_queue_lock);
-+ return;
-+ }
-+
-+ rcu_read_lock();
-+ local_bh_disable();
-+ status = entry->next_outfn(entry, entry->next_queuenum);
-+ local_bh_enable();
-+ if (status < 0) {
-+ nf_queue_entry_release_refs(entry);
-+ kfree_skb(entry->skb);
-+ kfree(entry);
-+ }
-+
-+ rcu_read_unlock();
-+}
-+
+static netdev_tx_t imq_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nf_queue_entry *entry = skb->nf_queue_entry;
@@ -275,17 +263,184 @@
+ skb->imq_flags = 0;
+ skb->destructor = NULL;
+
-+ imq_nf_reinject(entry, NF_ACCEPT);
++ nf_reinject(entry, NF_ACCEPT);
+
+ return NETDEV_TX_OK;
+}
+
++static u32 imq_hashrnd;
++
++static inline __be16 pppoe_proto(const struct sk_buff *skb)
++{
++ return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
++ sizeof(struct pppoe_hdr)));
++}
++
++static u16 imq_hash(struct net_device *dev, struct sk_buff *skb)
++{
++ unsigned int pull_len;
++ u16 protocol = skb->protocol;
++ u32 addr1, addr2;
++ u32 hash, ihl = 0;
++ union {
++ u16 in16[2];
++ u32 in32;
++ } ports;
++ u8 ip_proto;
++
++ pull_len = 0;
++
++recheck:
++ switch (protocol) {
++ case htons(ETH_P_8021Q): {
++ if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL))
++ goto other;
++
++ pull_len += VLAN_HLEN;
++ skb->network_header += VLAN_HLEN;
++
++ protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
++ goto recheck;
++ }
++
++ case htons(ETH_P_PPP_SES): {
++ if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL))
++ goto other;
++
++ pull_len += PPPOE_SES_HLEN;
++ skb->network_header += PPPOE_SES_HLEN;
++
++ protocol = pppoe_proto(skb);
++ goto recheck;
++ }
++
++ case htons(ETH_P_IP): {
++ const struct iphdr *iph;
++
++ if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
++ goto other;
++
++ /* read header fields only after pskb_may_pull(), which can
++ * reallocate the skb data and invalidate earlier pointers */
++ iph = ip_hdr(skb);
++ addr1 = iph->daddr;
++ addr2 = iph->saddr;
++
++ ip_proto = !(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) ?
++ iph->protocol : 0;
++ ihl = ip_hdrlen(skb);
++
++ break;
++ }
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case htons(ETH_P_IPV6): {
++ const struct ipv6hdr *iph;
++
++ if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr))))
++ goto other;
++
++ /* as above, dereference the header only after the pull */
++ iph = ipv6_hdr(skb);
++ addr1 = iph->daddr.s6_addr32[3];
++ addr2 = iph->saddr.s6_addr32[3];
++ ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto);
++ if (unlikely((int)ihl < 0)) /* ihl is u32; cast so the -1 error return is seen */
++ goto other;
++
++ break;
++ }
++#endif
++ default:
++other:
++ if (pull_len != 0) {
++ skb_push(skb, pull_len);
++ skb->network_header -= pull_len;
++ }
++
++ return (u16)(ntohs(protocol) % dev->real_num_tx_queues);
++ }
++
++ if (addr1 > addr2)
++ swap(addr1, addr2);
++
++ switch (ip_proto) {
++ case IPPROTO_TCP:
++ case IPPROTO_UDP:
++ case IPPROTO_DCCP:
++ case IPPROTO_ESP:
++ case IPPROTO_AH:
++ case IPPROTO_SCTP:
++ case IPPROTO_UDPLITE: {
++ if (likely(skb_copy_bits(skb, ihl, &ports.in32, 4) >= 0)) {
++ if (ports.in16[0] > ports.in16[1])
++ swap(ports.in16[0], ports.in16[1]);
++ break;
++ }
++ /* fall-through */
++ }
++ default:
++ ports.in32 = 0;
++ break;
++ }
++
++ if (pull_len != 0) {
++ skb_push(skb, pull_len);
++ skb->network_header -= pull_len;
++ }
++
++ hash = jhash_3words(addr1, addr2, ports.in32, imq_hashrnd ^ ip_proto);
++
++ return (u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
++}
++
++static inline bool sk_tx_queue_recorded(struct sock *sk)
++{
++ return (sk_tx_queue_get(sk) >= 0);
++}
++
++static struct netdev_queue *imq_select_queue(struct net_device *dev,
++ struct sk_buff *skb)
++{
++ u16 queue_index = 0;
++ u32 hash;
++
++ if (likely(dev->real_num_tx_queues == 1))
++ goto out;
++
++ /* IMQ can be receiving ingress or egress packets. */
++
++ /* Check first if rx_queue is recorded */
++ if (skb_rx_queue_recorded(skb)) {
++ queue_index = skb_get_rx_queue(skb);
++ goto out;
++ }
++
++ /* Check if socket has tx_queue set */
++ if (sk_tx_queue_recorded(skb->sk)) {
++ queue_index = sk_tx_queue_get(skb->sk);
++ goto out;
++ }
++
++ /* Try to use the socket hash */
++ if (skb->sk && skb->sk->sk_hash) {
++ hash = skb->sk->sk_hash;
++ queue_index =
++ (u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
++ goto out;
++ }
++
++ /* Generate hash from packet data */
++ queue_index = imq_hash(dev, skb);
++
++out:
++ if (unlikely(queue_index >= dev->real_num_tx_queues))
++ queue_index = (u16)((u32)queue_index % dev->real_num_tx_queues);
++
++ return netdev_get_tx_queue(dev, queue_index);
++}
++
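Both imq_hash() and imq_select_queue() map a 32-bit hash onto a queue with ((u64)hash * real_num_tx_queues) >> 32 instead of a modulo: the hash is treated as a fraction of 2^32, so scaling it by the queue count and keeping the high word gives a uniform index in [0, n) without a division. A self-contained userspace illustration of the same trick:

	#include <stdint.h>
	#include <stdio.h>

	/* hash / 2^32 is a fraction in [0, 1); multiplying by n and
	 * truncating picks a bucket in [0, n). */
	static uint16_t hash_to_queue(uint32_t hash, uint32_t n)
	{
		return (uint16_t)(((uint64_t)hash * n) >> 32);
	}

	int main(void)
	{
		/* 0x80000000 is half of 2^32, so with 4 queues it lands in queue 2 */
		printf("%u\n", hash_to_queue(0x80000000u, 4));
		return 0;
	}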
+static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num)
+{
+ struct net_device *dev;
+ struct sk_buff *skb_orig, *skb, *skb_shared;
+ struct Qdisc *q;
+ struct netdev_queue *txq;
++ spinlock_t *root_lock;
+ int users, index;
+ int retval = -EINVAL;
+
@@ -307,7 +462,7 @@
+ /* get device by name and cache result */
+ snprintf(buf, sizeof(buf), "imq%d", index);
+ dev = dev_get_by_name(&init_net, buf);
-+ if (!dev) {
++ if (unlikely(!dev)) {
+ /* not found?! */
+ BUG();
+ retval = -ENODEV;
@@ -320,7 +475,7 @@
+
+ if (unlikely(!(dev->flags & IFF_UP))) {
+ entry->skb->imq_flags = 0;
-+ imq_nf_reinject_lockless(entry, NF_ACCEPT);
++ nf_reinject(entry, NF_ACCEPT);
+ retval = 0;
+ goto out;
+ }
@@ -333,7 +488,7 @@
+ if (unlikely(skb->destructor)) {
+ skb_orig = skb;
+ skb = skb_clone(skb, GFP_ATOMIC);
-+ if (!skb) {
++ if (unlikely(!skb)) {
+ retval = -ENOMEM;
+ goto out;
+ }
@@ -345,13 +500,18 @@
+ dev->stats.rx_bytes += skb->len;
+ dev->stats.rx_packets++;
+
-+ txq = dev_pick_tx(dev, skb);
++ /* Disable softirqs for the lock below */
++ rcu_read_lock_bh();
++
++ /* Multi-queue selection */
++ txq = imq_select_queue(dev, skb);
+
+ q = rcu_dereference(txq->qdisc);
+ if (unlikely(!q->enqueue))
+ goto packet_not_eaten_by_imq_dev;
+
-+ spin_lock_bh(qdisc_lock(q));
++ root_lock = qdisc_lock(q);
++ spin_lock(root_lock);
+
+ users = atomic_read(&skb->users);
+
@@ -366,10 +526,11 @@
+ skb->destructor = &imq_skb_destructor;
+
+ /* cloned? */
-+ if (skb_orig)
++ if (unlikely(skb_orig))
+ kfree_skb(skb_orig); /* free original */
+
-+ spin_unlock_bh(qdisc_lock(q));
++ spin_unlock(root_lock);
++ rcu_read_unlock_bh();
+
+ /* schedule qdisc dequeue */
+ __netif_schedule(q);
@@ -382,13 +543,15 @@
+ /* qdisc dropped the packet and decreased the skb reference count,
+ * so we don't want to free it again here, as that would actually
+ * destroy the skb. */
-+ spin_unlock_bh(qdisc_lock(q));
++ spin_unlock(root_lock);
+ goto packet_not_eaten_by_imq_dev;
+ }
+
+packet_not_eaten_by_imq_dev:
++ rcu_read_unlock_bh();
++
+ /* cloned? restore original */
-+ if (skb_orig) {
++ if (unlikely(skb_orig)) {
+ kfree_skb(skb);
+ entry->skb = skb_orig;
+ }
@@ -397,20 +560,12 @@
+ return retval;
+}
+
-+static struct nf_queue_handler nfqh = {
-+ .name = "imq",
-+ .outfn = imq_nf_queue,
-+};
-+
+static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff *pskb,
+ const struct net_device *indev,
+ const struct net_device *outdev,
+ int (*okfn)(struct sk_buff *))
+{
-+ if (pskb->imq_flags & IMQ_F_ENQUEUE)
-+ return NF_QUEUE;
-+
-+ return NF_ACCEPT;
++ return (pskb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT;
+}
+
+static int imq_close(struct net_device *dev)
@@ -472,43 +627,22 @@
+ .validate = imq_validate,
+};
+
++static const struct nf_queue_handler imq_nfqh = {
++ .name = "imq",
++ .outfn = imq_nf_queue,
++};
++
+static int __init imq_init_hooks(void)
+{
-+ int err;
-+
-+ nf_register_queue_imq_handler(&nfqh);
-+
-+ err = nf_register_hook(&imq_ingress_ipv4);
-+ if (err)
-+ goto err1;
-+
-+ err = nf_register_hook(&imq_egress_ipv4);
-+ if (err)
-+ goto err2;
++ int ret;
+
-+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-+ err = nf_register_hook(&imq_ingress_ipv6);
-+ if (err)
-+ goto err3;
-+
-+ err = nf_register_hook(&imq_egress_ipv6);
-+ if (err)
-+ goto err4;
-+#endif
++ nf_register_queue_imq_handler(&imq_nfqh);
+
-+ return 0;
++ ret = nf_register_hooks(imq_ops, ARRAY_SIZE(imq_ops));
++ if (ret < 0)
++ nf_unregister_queue_imq_handler();
+
-+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-+err4:
-+ nf_unregister_hook(&imq_ingress_ipv6);
-+err3:
-+ nf_unregister_hook(&imq_egress_ipv4);
-+#endif
-+err2:
-+ nf_unregister_hook(&imq_ingress_ipv4);
-+err1:
-+ nf_unregister_queue_imq_handler();
-+ return err;
++ return ret;
+}
+
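nf_register_hooks() is what lets the old four-label error unwinding collapse into a single check: it registers the whole imq_ops array in order and rolls back on the first failure. Its contract is roughly the following (a sketch, not the kernel's exact code):

	static int register_all(struct nf_hook_ops *ops, unsigned int n)
	{
		unsigned int i;
		int err = 0;

		for (i = 0; i < n; i++) {
			err = nf_register_hook(&ops[i]);
			if (err < 0)
				break;
		}
		if (err < 0)			/* roll back what was registered */
			while (i-- > 0)
				nf_unregister_hook(&ops[i]);
		return err;
	}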
+static int __init imq_init_one(int index)
@@ -516,7 +650,7 @@
+ struct net_device *dev;
+ int ret;
+
-+ dev = alloc_netdev(0, "imq%d", imq_setup);
++ dev = alloc_netdev_mq(0, "imq%d", imq_setup, numqueues);
+ if (!dev)
+ return -ENOMEM;
+
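The switch to alloc_netdev_mq() is the heart of the multi-queue support: in this kernel generation alloc_netdev() is simply the single-queue special case, roughly

	#define alloc_netdev(sizeof_priv, name, setup) \
		alloc_netdev_mq(sizeof_priv, name, setup, 1)

so the change makes each IMQ device carry numqueues TX queues, each with its own qdisc and qdisc lock for imq_select_queue() to spread load across.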
@@ -545,6 +679,14 @@
+ return -EINVAL;
+ }
+
++ if (numqueues < 1 || numqueues > IMQ_MAX_QUEUES) {
++ printk(KERN_ERR "IMQ: numqueues has to be between 1 and %u\n",
++ IMQ_MAX_QUEUES);
++ return -EINVAL;
++ }
++
++ get_random_bytes(&imq_hashrnd, sizeof(imq_hashrnd));
++
+ rtnl_lock();
+ err = __rtnl_link_register(&imq_link_ops);
+
@@ -584,7 +726,8 @@
+ return err;
+ }
+
-+ printk(KERN_INFO "IMQ driver loaded successfully.\n");
++ printk(KERN_INFO "IMQ driver loaded successfully. "
++ "(numdevs = %d, numqueues = %d)\n", numdevs, numqueues);
+
+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
+ printk(KERN_INFO "\tHooking IMQ before NAT on PREROUTING.\n");
@@ -602,13 +745,7 @@
<<Diff was trimmed, longer than 597 lines>>
---- CVS-web:
http://cvs.pld-linux.org/cgi-bin/cvsweb.cgi/packages/kernel/kernel-imq.patch?r1=1.10&r2=1.11&f=u
http://cvs.pld-linux.org/cgi-bin/cvsweb.cgi/packages/kernel/kernel.spec?r1=1.810&r2=1.811&f=u