packages: kernel/kernel-imq.patch, kernel/kernel.spec - updated imq patch

arekm arekm at pld-linux.org
Wed Aug 25 19:02:52 CEST 2010


Author: arekm                        Date: Wed Aug 25 17:02:52 2010 GMT
Module: packages                      Tag: HEAD
---- Log message:
- updated imq patch

---- Files affected:
packages/kernel:
   kernel-imq.patch (1.10 -> 1.11) , kernel.spec (1.810 -> 1.811) 

---- Diffs:

================================================================
Index: packages/kernel/kernel-imq.patch
diff -u packages/kernel/kernel-imq.patch:1.10 packages/kernel/kernel-imq.patch:1.11
--- packages/kernel/kernel-imq.patch:1.10	Thu Aug  5 21:52:26 2010
+++ packages/kernel/kernel-imq.patch	Wed Aug 25 19:02:44 2010
@@ -1,7 +1,7 @@
-diff -uNr linux-2.6.34/drivers/net/imq.c linux-2.6.34-imq/drivers/net/imq.c
---- linux-2.6.34/drivers/net/imq.c	1970-01-01 02:00:00.000000000 +0200
-+++ linux-2.6.34-imq/drivers/net/imq.c	2010-06-02 10:05:45.752109073 +0300
-@@ -0,0 +1,635 @@
+diff -uNr linux-2.6.35/drivers/net/imq.c linux-2.6.35-imq-multiqueue-test1/drivers/net/imq.c
+--- linux-2.6.35/drivers/net/imq.c	1970-01-01 02:00:00.000000000 +0200
++++ linux-2.6.35-imq-multiqueue-test1/drivers/net/imq.c	2010-08-15 13:54:30.070063067 +0300
+@@ -0,0 +1,774 @@
 +/*
 + *             Pseudo-driver for the intermediate queue device.
 + *
@@ -51,7 +51,7 @@
 + *             I didn't forget anybody). I apologize again for my lack of time.
 + *
 + *
-+ *             2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead 
++ *             2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead
 + *             of qdisc_restart() and moved qdisc_run() to tasklet to avoid
 + *             recursive locking. New initialization routines to fix 'rmmod' not
 + *             working anymore. Used code from ifb.c. (Jussi Kivilinna)
@@ -86,6 +86,22 @@
 + *             2010/02/25 - (Jussi Kivilinna)
 + *              - Port to 2.6.33
 + *
++ *             2010/08/15 - (Jussi Kivilinna)
++ *              - Port to 2.6.35
++ *              - Simplify hook registration by using nf_register_hooks.
++ *              - nf_reinject doesn't need spinlock around it, therefore remove
++ *                imq_nf_reinject function. Other nf_reinject users protect
++ *                their own data with a spinlock. With IMQ, however, all
++ *                data needed is stored per skbuff, so no locking is needed.
++ *              - Changed IMQ to use a separate NF_IMQ_QUEUE verdict instead
++ *                of NF_QUEUE; this allows working coexistence of IMQ and
++ *                other NF_QUEUE users.
++ *              - Make IMQ multi-queue. The number of IMQ device queues can
++ *                be increased with the 'numqueues' module parameter. The
++ *                default is 1, in other words by default IMQ works as a
++ *                single-queue device. Multi-queue selection is based on the
++ *                IFB multi-queue patch by Changli Gao <xiaosuo at gmail.com>.
++ *
 + *	       Also, many thanks to Pablo Sebastian Greco for making the initial
 + *	       patch and to those who helped the testing.
 + *
@@ -109,66 +125,81 @@
 +#include <linux/imq.h>
 +#include <net/pkt_sched.h>
 +#include <net/netfilter/nf_queue.h>
++#include <net/sock.h>
++#include <linux/ip.h>
++#include <linux/ipv6.h>
++#include <linux/if_vlan.h>
++#include <linux/if_pppox.h>
++#include <net/ip.h>
++#include <net/ipv6.h>
++
++static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num);
 +
 +static nf_hookfn imq_nf_hook;
 +
-+static struct nf_hook_ops imq_ingress_ipv4 = {
-+	.hook		= imq_nf_hook,
-+	.owner		= THIS_MODULE,
-+	.pf		= PF_INET,
-+	.hooknum	= NF_INET_PRE_ROUTING,
++static struct nf_hook_ops imq_ops[] = {
++	{
++	/* imq_ingress_ipv4 */
++		.hook		= imq_nf_hook,
++		.owner		= THIS_MODULE,
++		.pf		= PF_INET,
++		.hooknum	= NF_INET_PRE_ROUTING,
 +#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
-+	.priority	= NF_IP_PRI_MANGLE + 1
++		.priority	= NF_IP_PRI_MANGLE + 1,
 +#else
-+	.priority	= NF_IP_PRI_NAT_DST + 1
++		.priority	= NF_IP_PRI_NAT_DST + 1,
 +#endif
-+};
-+
-+static struct nf_hook_ops imq_egress_ipv4 = {
-+	.hook		= imq_nf_hook,
-+	.owner		= THIS_MODULE,
-+	.pf		= PF_INET,
-+	.hooknum	= NF_INET_POST_ROUTING,
++	},
++	{
++	/* imq_egress_ipv4 */
++		.hook		= imq_nf_hook,
++		.owner		= THIS_MODULE,
++		.pf		= PF_INET,
++		.hooknum	= NF_INET_POST_ROUTING,
 +#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
-+	.priority	= NF_IP_PRI_LAST
++		.priority	= NF_IP_PRI_LAST,
 +#else
-+	.priority	= NF_IP_PRI_NAT_SRC - 1
++		.priority	= NF_IP_PRI_NAT_SRC - 1,
 +#endif
-+};
-+
++	},
 +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-+static struct nf_hook_ops imq_ingress_ipv6 = {
-+	.hook		= imq_nf_hook,
-+	.owner		= THIS_MODULE,
-+	.pf		= PF_INET6,
-+	.hooknum	= NF_INET_PRE_ROUTING,
++	{
++	/* imq_ingress_ipv6 */
++		.hook		= imq_nf_hook,
++		.owner		= THIS_MODULE,
++		.pf		= PF_INET6,
++		.hooknum	= NF_INET_PRE_ROUTING,
 +#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
-+	.priority	= NF_IP6_PRI_MANGLE + 1
++		.priority	= NF_IP6_PRI_MANGLE + 1,
 +#else
-+	.priority	= NF_IP6_PRI_NAT_DST + 1
++		.priority	= NF_IP6_PRI_NAT_DST + 1,
 +#endif
-+};
-+
-+static struct nf_hook_ops imq_egress_ipv6 = {
-+	.hook		= imq_nf_hook,
-+	.owner		= THIS_MODULE,
-+	.pf		= PF_INET6,
-+	.hooknum	= NF_INET_POST_ROUTING,
++	},
++	{
++	/* imq_egress_ipv6 */
++		.hook		= imq_nf_hook,
++		.owner		= THIS_MODULE,
++		.pf		= PF_INET6,
++		.hooknum	= NF_INET_POST_ROUTING,
 +#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
-+	.priority	= NF_IP6_PRI_LAST
++		.priority	= NF_IP6_PRI_LAST,
 +#else
-+	.priority	= NF_IP6_PRI_NAT_SRC - 1
++		.priority	= NF_IP6_PRI_NAT_SRC - 1,
 +#endif
-+};
++	},
 +#endif
++};
 +
 +#if defined(CONFIG_IMQ_NUM_DEVS)
-+static unsigned int numdevs = CONFIG_IMQ_NUM_DEVS;
++static int numdevs = CONFIG_IMQ_NUM_DEVS;
 +#else
-+static unsigned int numdevs = IMQ_MAX_DEVS;
++static int numdevs = IMQ_MAX_DEVS;
 +#endif
 +
-+static DEFINE_SPINLOCK(imq_nf_queue_lock);
++#define IMQ_MAX_QUEUES 32
++static int numqueues = 1;
++
++/*static DEFINE_SPINLOCK(imq_nf_queue_lock);*/
 +
 +static struct net_device *imq_devs_cache[IMQ_MAX_DEVS];
 +
@@ -193,49 +224,6 @@
 +	skb_restore_cb(skb); /* kfree backup */
 +}
 +
-+/* locking not needed when called from imq_nf_queue */
-+static void imq_nf_reinject_lockless(struct nf_queue_entry *entry,
-+						unsigned int verdict)
-+{
-+	int status;
-+
-+	if (!entry->next_outfn) {
-+		nf_reinject(entry, verdict);
-+		return;
-+	}
-+
-+	status = entry->next_outfn(entry, entry->next_queuenum);
-+	if (status < 0) {
-+		nf_queue_entry_release_refs(entry);
-+		kfree_skb(entry->skb);
-+		kfree(entry);
-+	}
-+}
-+
-+static void imq_nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
-+{
-+	int status;
-+
-+	if (!entry->next_outfn) {
-+		spin_lock_bh(&imq_nf_queue_lock);
-+		nf_reinject(entry, verdict);
-+		spin_unlock_bh(&imq_nf_queue_lock);
-+		return;
-+	}
-+
-+	rcu_read_lock();
-+	local_bh_disable();
-+	status = entry->next_outfn(entry, entry->next_queuenum);
-+	local_bh_enable();
-+	if (status < 0) {
-+		nf_queue_entry_release_refs(entry);
-+		kfree_skb(entry->skb);
-+		kfree(entry);
-+	}
-+
-+	rcu_read_unlock();
-+}
-+
 +static netdev_tx_t imq_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 +{
 +	struct nf_queue_entry *entry = skb->nf_queue_entry;
@@ -275,17 +263,184 @@
 +	skb->imq_flags = 0;
 +	skb->destructor = NULL;
 +
-+	imq_nf_reinject(entry, NF_ACCEPT);
++	nf_reinject(entry, NF_ACCEPT);
 +
 +	return NETDEV_TX_OK;
 +}
 +
++static u32 imq_hashrnd;
++
++static inline __be16 pppoe_proto(const struct sk_buff *skb)
++{
++	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
++			sizeof(struct pppoe_hdr)));
++}
++
++static u16 imq_hash(struct net_device *dev, struct sk_buff *skb)
++{
++	unsigned int pull_len;
++	u16 protocol = skb->protocol;
++	u32 addr1, addr2;
++	u32 hash, ihl = 0;
++	union {
++		u16 in16[2];
++		u32 in32;
++	} ports;
++	u8 ip_proto;
++
++	pull_len = 0;
++
++recheck:
++	switch (protocol) {
++	case htons(ETH_P_8021Q): {
++		if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL))
++			goto other;
++
++		pull_len += VLAN_HLEN;
++		skb->network_header += VLAN_HLEN;
++
++		protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
++		goto recheck;
++	}
++
++	case htons(ETH_P_PPP_SES): {
++		if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL))
++			goto other;
++
++		pull_len += PPPOE_SES_HLEN;
++		skb->network_header += PPPOE_SES_HLEN;
++
++		protocol = pppoe_proto(skb);
++		goto recheck;
++	}
++
++	case htons(ETH_P_IP): {
++		const struct iphdr *iph = ip_hdr(skb);
++
++		if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
++			goto other;
++
++		addr1 = iph->daddr;
++		addr2 = iph->saddr;
++
++		ip_proto = !(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) ?
++				 iph->protocol : 0;
++		ihl = ip_hdrlen(skb);
++
++		break;
++	}
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++	case htons(ETH_P_IPV6): {
++		const struct ipv6hdr *iph = ipv6_hdr(skb);
++
++		if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr))))
++			goto other;
++
++		addr1 = iph->daddr.s6_addr32[3];
++		addr2 = iph->saddr.s6_addr32[3];
++		ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto);
++		if (unlikely((int)ihl < 0))
++			goto other;
++
++		break;
++	}
++#endif
++	default:
++other:
++		if (pull_len != 0) {
++			skb_push(skb, pull_len);
++			skb->network_header -= pull_len;
++		}
++
++		return (u16)(ntohs(protocol) % dev->real_num_tx_queues);
++	}
++
++	if (addr1 > addr2)
++		swap(addr1, addr2);
++
++	switch (ip_proto) {
++	case IPPROTO_TCP:
++	case IPPROTO_UDP:
++	case IPPROTO_DCCP:
++	case IPPROTO_ESP:
++	case IPPROTO_AH:
++	case IPPROTO_SCTP:
++	case IPPROTO_UDPLITE: {
++		if (likely(skb_copy_bits(skb, ihl, &ports.in32, 4) >= 0)) {
++			if (ports.in16[0] > ports.in16[1])
++				swap(ports.in16[0], ports.in16[1]);
++			break;
++		}
++		/* fall-through */
++	}
++	default:
++		ports.in32 = 0;
++		break;
++	}
++
++	if (pull_len != 0) {
++		skb_push(skb, pull_len);
++		skb->network_header -= pull_len;
++	}
++
++	hash = jhash_3words(addr1, addr2, ports.in32, imq_hashrnd ^ ip_proto);
++
++	return (u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
++}
++
++static inline bool sk_tx_queue_recorded(struct sock *sk)
++{
++	return (sk_tx_queue_get(sk) >= 0);
++}
++
++static struct netdev_queue *imq_select_queue(struct net_device *dev,
++						struct sk_buff *skb)
++{
++	u16 queue_index = 0;
++	u32 hash;
++
++	if (likely(dev->real_num_tx_queues == 1))
++		goto out;
++
++	/* IMQ can be receiving ingress or egress packets. */
++
++	/* Check first if rx_queue is set */
++	if (skb_rx_queue_recorded(skb)) {
++		queue_index = skb_get_rx_queue(skb);
++		goto out;
++	}
++
++	/* Check if socket has tx_queue set */
++	if (sk_tx_queue_recorded(skb->sk)) {
++		queue_index = sk_tx_queue_get(skb->sk);
++		goto out;
++	}
++
++	/* Try to use the socket hash */
++	if (skb->sk && skb->sk->sk_hash) {
++		hash = skb->sk->sk_hash;
++		queue_index =
++			(u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
++		goto out;
++	}
++
++	/* Generate hash from packet data */
++	queue_index = imq_hash(dev, skb);
++
++out:
++	if (unlikely(queue_index >= dev->real_num_tx_queues))
++		queue_index = (u16)((u32)queue_index % dev->real_num_tx_queues);
++
++	return netdev_get_tx_queue(dev, queue_index);
++}
++
 +static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num)
 +{
 +	struct net_device *dev;
 +	struct sk_buff *skb_orig, *skb, *skb_shared;
 +	struct Qdisc *q;
 +	struct netdev_queue *txq;
++	spinlock_t *root_lock;
 +	int users, index;
 +	int retval = -EINVAL;
 +
@@ -307,7 +462,7 @@
 +		/* get device by name and cache result */
 +		snprintf(buf, sizeof(buf), "imq%d", index);
 +		dev = dev_get_by_name(&init_net, buf);
-+		if (!dev) {
++		if (unlikely(!dev)) {
 +			/* not found ?!*/
 +			BUG();
 +			retval = -ENODEV;
@@ -320,7 +475,7 @@
 +
 +	if (unlikely(!(dev->flags & IFF_UP))) {
 +		entry->skb->imq_flags = 0;
-+		imq_nf_reinject_lockless(entry, NF_ACCEPT);
++		nf_reinject(entry, NF_ACCEPT);
 +		retval = 0;
 +		goto out;
 +	}
@@ -333,7 +488,7 @@
 +	if (unlikely(skb->destructor)) {
 +		skb_orig = skb;
 +		skb = skb_clone(skb, GFP_ATOMIC);
-+		if (!skb) {
++		if (unlikely(!skb)) {
 +			retval = -ENOMEM;
 +			goto out;
 +		}
@@ -345,13 +500,18 @@
 +	dev->stats.rx_bytes += skb->len;
 +	dev->stats.rx_packets++;
 +
-+	txq = dev_pick_tx(dev, skb);
++	/* Disable softirqs for the lock below */
++	rcu_read_lock_bh();
++
++	/* Multi-queue selection */
++	txq = imq_select_queue(dev, skb);
 +
 +	q = rcu_dereference(txq->qdisc);
 +	if (unlikely(!q->enqueue))
 +		goto packet_not_eaten_by_imq_dev;
 +
-+	spin_lock_bh(qdisc_lock(q));
++	root_lock = qdisc_lock(q);
++	spin_lock(root_lock);
 +
 +	users = atomic_read(&skb->users);
 +
@@ -366,10 +526,11 @@
 +		skb->destructor = &imq_skb_destructor;
 +
 +		/* cloned? */
-+		if (skb_orig)
++		if (unlikely(skb_orig))
 +			kfree_skb(skb_orig); /* free original */
 +
-+		spin_unlock_bh(qdisc_lock(q));
++		spin_unlock(root_lock);
++		rcu_read_unlock_bh();
 +
 +		/* schedule qdisc dequeue */
 +		__netif_schedule(q);
@@ -382,13 +543,15 @@
 +		/* qdisc dropped packet and decreased skb reference count of
 +	 * skb, so we don't really want to try to free it again, as that
 +	 * would actually destroy the skb. */
-+		spin_unlock_bh(qdisc_lock(q));
++		spin_unlock(root_lock);
 +		goto packet_not_eaten_by_imq_dev;
 +	}
 +
 +packet_not_eaten_by_imq_dev:
++	rcu_read_unlock_bh();
++
 +	/* cloned? restore original */
-+	if (skb_orig) {
++	if (unlikely(skb_orig)) {
 +		kfree_skb(skb);
 +		entry->skb = skb_orig;
 +	}
@@ -397,20 +560,12 @@
 +	return retval;
 +}
 +
-+static struct nf_queue_handler nfqh = {
-+	.name  = "imq",
-+	.outfn = imq_nf_queue,
-+};
-+
 +static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff *pskb,
 +				const struct net_device *indev,
 +				const struct net_device *outdev,
 +				int (*okfn)(struct sk_buff *))
 +{
-+	if (pskb->imq_flags & IMQ_F_ENQUEUE)
-+		return NF_QUEUE;
-+
-+	return NF_ACCEPT;
++	return (pskb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT;
 +}
 +
 +static int imq_close(struct net_device *dev)
@@ -472,43 +627,22 @@
 +	.validate	= imq_validate,
 +};
 +
++static const struct nf_queue_handler imq_nfqh = {
++	.name  = "imq",
++	.outfn = imq_nf_queue,
++};
++
 +static int __init imq_init_hooks(void)
 +{
-+	int err;
-+
-+	nf_register_queue_imq_handler(&nfqh);
-+
-+	err = nf_register_hook(&imq_ingress_ipv4);
-+	if (err)
-+		goto err1;
-+
-+	err = nf_register_hook(&imq_egress_ipv4);
-+	if (err)
-+		goto err2;
++	int ret;
 +
-+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-+	err = nf_register_hook(&imq_ingress_ipv6);
-+	if (err)
-+		goto err3;
-+
-+	err = nf_register_hook(&imq_egress_ipv6);
-+	if (err)
-+		goto err4;
-+#endif
++	nf_register_queue_imq_handler(&imq_nfqh);
 +
-+	return 0;
++	ret = nf_register_hooks(imq_ops, ARRAY_SIZE(imq_ops));
++	if (ret < 0)
++		nf_unregister_queue_imq_handler();
 +
-+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-+err4:
-+	nf_unregister_hook(&imq_ingress_ipv6);
-+err3:
-+	nf_unregister_hook(&imq_egress_ipv4);
-+#endif
-+err2:
-+	nf_unregister_hook(&imq_ingress_ipv4);
-+err1:
-+	nf_unregister_queue_imq_handler();
-+	return err;
++	return ret;
 +}
 +
 +static int __init imq_init_one(int index)
@@ -516,7 +650,7 @@
 +	struct net_device *dev;
 +	int ret;
 +
-+	dev = alloc_netdev(0, "imq%d", imq_setup);
++	dev = alloc_netdev_mq(0, "imq%d", imq_setup, numqueues);
 +	if (!dev)
 +		return -ENOMEM;
 +
@@ -545,6 +679,14 @@
 +		return -EINVAL;
 +	}
 +
++	if (numqueues < 1 || numqueues > IMQ_MAX_QUEUES) {
++		printk(KERN_ERR "IMQ: numqueues has to be between 1 and %u\n",
++		       IMQ_MAX_QUEUES);
++		return -EINVAL;
++	}
++
++	get_random_bytes(&imq_hashrnd, sizeof(imq_hashrnd));
++
 +	rtnl_lock();
 +	err = __rtnl_link_register(&imq_link_ops);
 +
@@ -584,7 +726,8 @@
 +		return err;
 +	}
 +
-+	printk(KERN_INFO "IMQ driver loaded successfully.\n");
++	printk(KERN_INFO "IMQ driver loaded successfully. "
++		"(numdevs = %d, numqueues = %d)\n", numdevs, numqueues);
 +
 +#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
 +	printk(KERN_INFO "\tHooking IMQ before NAT on PREROUTING.\n");
@@ -602,13 +745,7 @@
<<Diff was trimmed, longer than 597 lines>>
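
For illustration, the multi-queue dispatch added above (imq_hash() /
imq_select_queue()) maps a 32-bit flow hash onto [0, real_num_tx_queues)
with a multiply-shift rather than a modulo. A minimal userspace sketch of
just that scaling step, using FNV-1a as a stand-in for the jhash_3words()
call in the patch (the flow strings and queue count are made up for the
example):

#include <stdint.h>
#include <stdio.h>

/* Stand-in 32-bit hash (FNV-1a); the patch uses jhash_3words() seeded
 * with a random imq_hashrnd, but any well-mixed hash shows the idea. */
static uint32_t fnv1a(const char *s)
{
	uint32_t h = 2166136261u;
	while (*s) {
		h ^= (uint8_t)*s++;
		h *= 16777619u;
	}
	return h;
}

/* Multiply-shift scaling as in the patch: maps a 32-bit hash into
 * [0, nqueues) without a division in the hot path. */
static uint16_t queue_index(uint32_t hash, unsigned int nqueues)
{
	return (uint16_t)(((uint64_t)hash * nqueues) >> 32);
}

int main(void)
{
	const unsigned int nqueues = 4;	/* e.g. modprobe imq numqueues=4 */
	const char *flows[] = {
		"10.0.0.1:80-10.0.0.2:5000",
		"10.0.0.1:80-10.0.0.3:5001",
	};
	unsigned int i;

	for (i = 0; i < 2; i++)
		printf("%s -> txq %u\n", flows[i],
		       queue_index(fnv1a(flows[i]), nqueues));
	return 0;
}

The multiply-shift spreads hash values across the queues in proportion to
the hash range; the patch falls back to a plain modulo only for the non-IP
default case and as a final bounds guard in imq_select_queue().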

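The flow hash itself is made direction-independent: imq_hash() orders the
address pair and the port pair with swap() before hashing, so both
directions of a connection land on the same queue. A small self-contained
sketch of that canonicalization (the struct and field names here are
illustrative, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

struct flow_key {
	uint32_t addr1, addr2;	/* IPv4 addresses, host order for brevity */
	uint16_t port1, port2;
};

/* Order the endpoints so (a, b) and (b, a) yield the same key; the
 * same trick imq_hash() applies before calling jhash_3words(). */
static void canonicalize(struct flow_key *k)
{
	if (k->addr1 > k->addr2) {
		uint32_t t = k->addr1;
		k->addr1 = k->addr2;
		k->addr2 = t;
	}
	if (k->port1 > k->port2) {
		uint16_t t = k->port1;
		k->port1 = k->port2;
		k->port2 = t;
	}
}

int main(void)
{
	struct flow_key fwd = { 0x0a000001, 0x0a000002, 80, 40000 };
	struct flow_key rev = { 0x0a000002, 0x0a000001, 40000, 80 };

	canonicalize(&fwd);
	canonicalize(&rev);

	printf("same queue key: %s\n",
	       (fwd.addr1 == rev.addr1 && fwd.port1 == rev.port1) ?
	       "yes" : "no");
	return 0;
}

With this ordering, jhash_3words(addr1, addr2, ports.in32, ...) sees
identical inputs for forward and reply traffic of the same flow.
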
---- CVS-web:
    http://cvs.pld-linux.org/cgi-bin/cvsweb.cgi/packages/kernel/kernel-imq.patch?r1=1.10&r2=1.11&f=u
    http://cvs.pld-linux.org/cgi-bin/cvsweb.cgi/packages/kernel/kernel.spec?r1=1.810&r2=1.811&f=u


