@@ -81,6 +81,15 @@ fib_multipath_hash_policy - INTEGER
0 - Layer 3
1 - Layer 4
+ip_forward_update_priority - INTEGER
+ Whether to update SKB priority from the "TOS" field of the IPv4 header
+ after the packet is forwarded. The new priority is mapped from the TOS
+ value according to the rt_tos2priority table (see e.g. man tc-prio).
+ Default: 1 (Update priority.)
+ Possible values:
+ 0 - Do not update priority.
+ 1 - Update priority.
+
route/max_size - INTEGER
Maximum number of routes allowed in the kernel. Increase
this when using large numbers of interfaces and/or routes.
@@ -28,6 +28,7 @@ enum netevent_notif_type {
NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */
NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */
NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */
+ NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, /* arg is struct net ptr */
};
int register_netevent_notifier(struct notifier_block *nb);
@@ -98,6 +98,7 @@ struct netns_ipv4 {
int sysctl_ip_default_ttl;
int sysctl_ip_no_pmtu_disc;
int sysctl_ip_fwd_use_pmtu;
+ int sysctl_ip_fwd_update_priority;
int sysctl_ip_nonlocal_bind;
/* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr;
@@ -1802,6 +1802,7 @@ static __net_init int inet_init_net(struct net *net)
* We set them here, in case sysctl is not compiled.
*/
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+ net->ipv4.sysctl_ip_fwd_update_priority = true;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
net->ipv4.sysctl_udp_early_demux = 1;
@@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb)
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
- skb->priority = rt_tos2priority(iph->tos);
+ if (net->ipv4.sysctl_ip_fwd_update_priority)
+ skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev,
@@ -201,6 +201,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
+/*
+ * proc handler for the "ip_forward_update_priority" sysctl.
+ *
+ * The access itself is delegated to proc_dointvec_minmax(). When a write
+ * completes with a zero return, NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE is
+ * raised with the struct net that embeds the sysctl field as its argument.
+ *
+ * Returns whatever proc_dointvec_minmax() returned.
+ */
+static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
+				    void __user *buffer,
+				    size_t *lenp, loff_t *ppos)
+{
+	/* table->data points at the per-netns field; step back to its owner. */
+	struct net *ns = container_of(table->data, struct net,
+				      ipv4.sysctl_ip_fwd_update_priority);
+	int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	/* Reads and failed writes have nothing to announce. */
+	if (!write || err)
+		return err;
+
+	call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, ns);
+	return 0;
+}
+
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -664,6 +681,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "ip_forward_update_priority",
+ .data = &init_net.ipv4.sysctl_ip_fwd_update_priority,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipv4_fwd_update_priority,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "ip_nonlocal_bind",
.data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
.maxlen = sizeof(int),
After IPv4 packets are forwarded, the priority of the corresponding SKB is updated according to the TOS field of IPv4 header. This overrides any prioritization done earlier by e.g. an skbedit action or ingress-qos-map defined at a vlan device. Such overriding may not always be desirable. Even if the packet ends up being routed, which implies this is an L3 network node, an administrator may wish to preserve whatever prioritization was done earlier on in the pipeline. Therefore introduce a sysctl that controls this behavior. Keep the default value at 1 to maintain backward-compatible behavior. Signed-off-by: Petr Machata <petrm@mellanox.com> --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/net/netevent.h | 1 + include/net/netns/ipv4.h | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/ip_forward.c | 3 ++- net/ipv4/sysctl_net_ipv4.c | 26 ++++++++++++++++++++++++++ 6 files changed, 40 insertions(+), 1 deletion(-)