// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <[email protected]>
 *		Donald Becker, <[email protected]>
 *		Alan Cox, <[email protected]>
 *		Richard Underwood
 *		Stefan Becker, <[email protected]>
 *		Jorge Cwik, <[email protected]>
 *		Arnt Gulbrandsen, <[email protected]>
 *
 * Fixes:
 *		Alan Cox	:	Commented a couple of minor bits of surplus code
 *		Alan Cox	:	Undefining IP_FORWARD doesn't include the code
 *					(just stops a compiler warning).
 *		Alan Cox	:	Frames with >=MAX_ROUTE record routes, strict routes or loose routes
 *					are junked rather than corrupting things.
 *		Alan Cox	:	Frames to bad broadcast subnets are dumped
 *					We used to process them non broadcast and
 *					boy could that cause havoc.
 *		Alan Cox	:	ip_forward sets the free flag on the
 *					new frame it queues. Still crap because
 *					it copies the frame but at least it
 *					doesn't eat memory too.
 *		Alan Cox	:	Generic queue code and memory fixes.
 *		Fred Van Kempen :	IP fragment support (borrowed from NET2E)
 *		Gerhard Koerting:	Forward fragmented frames correctly.
 *		Gerhard Koerting:	Fixes to my fix of the above 8-).
 *		Gerhard Koerting:	IP interface addressing fix.
 *		Linus Torvalds	:	More robustness checks
 *		Alan Cox	:	Even more checks: Still not as robust as it ought to be
 *		Alan Cox	:	Save IP header pointer for later
 *		Alan Cox	:	ip option setting
 *		Alan Cox	:	Use ip_tos/ip_ttl settings
 *		Alan Cox	:	Fragmentation bogosity removed
 *					(Thanks to [email protected])
 *		Dmitry Gorodchanin :	Send of a raw packet crash fix.
 *		Alan Cox	:	Silly ip bug when an overlength
 *					fragment turns up. Now frees the
 *					queue.
 *		Linus Torvalds/ :	Memory leakage on fragmentation
 *		Alan Cox	:	handling.
 *		Gerhard Koerting:	Forwarding uses IP priority hints
 *		Teemu Rantanen	:	Fragment problems.
 *		Alan Cox	:	General cleanup, comments and reformat
 *		Alan Cox	:	SNMP statistics
 *		Alan Cox	:	BSD address rule semantics. Also see
 *					UDP as there is a nasty checksum issue
 *					if you do things the wrong way.
 *		Alan Cox	:	Always defrag, moved IP_FORWARD to the config.in file
 *		Alan Cox	:	IP options adjust sk->priority.
 *		Pedro Roque	:	Fix mtu/length error in ip_forward.
 *		Alan Cox	:	Avoid ip_chk_addr when possible.
 *		Richard Underwood :	IP multicasting.
 *		Alan Cox	:	Cleaned up multicast handlers.
 *		Alan Cox	:	RAW sockets demultiplex in the BSD style.
 *		Gunther Mayer	:	Fix the SNMP reporting typo
 *		Alan Cox	:	Always in group 224.0.0.1
 *		Pauline Middelink :	Fast ip_checksum update when forwarding
 *					Masquerading support.
 *		Alan Cox	:	Multicast loopback error for 224.0.0.1
 *		Alan Cox	:	IP_MULTICAST_LOOP option.
 *		Alan Cox	:	Use notifiers.
 *		Bjorn Ekwall	:	Removed ip_csum (from slhc.c too)
 *		Bjorn Ekwall	:	Moved ip_fast_csum to ip.h (inline!)
 *		Stefan Becker	:	Send out ICMP HOST REDIRECT
 *		Arnt Gulbrandsen :	ip_build_xmit
 *		Alan Cox	:	Per socket routing cache
 *		Alan Cox	:	Fixed routing cache, added header cache.
 *		Alan Cox	:	Loopback didn't work right in original ip_build_xmit - fixed it.
 *		Alan Cox	:	Only send ICMP_REDIRECT if src/dest are the same net.
 *		Alan Cox	:	Incoming IP option handling.
 *		Alan Cox	:	Set saddr on raw output frames as per BSD.
 *		Alan Cox	:	Stopped broadcast source route explosions.
 *		Alan Cox	:	Can disable source routing
 *		Takeshi Sone	:	Masquerading didn't work.
 *		Dave Bonn,Alan Cox :	Faster IP forwarding whenever possible.
 *		Alan Cox	:	Memory leaks, tramples, misc debugging.
 *		Alan Cox	:	Fixed multicast (by popular demand 8))
 *		Alan Cox	:	Fixed forwarding (by even more popular demand 8))
 *		Alan Cox	:	Fixed SNMP statistics [I think]
 *		Gerhard Koerting :	IP fragmentation forwarding fix
 *		Alan Cox	:	Device lock against page fault.
 *		Alan Cox	:	IP_HDRINCL facility.
 *		Werner Almesberger :	Zero fragment bug
 *		Alan Cox	:	RAW IP frame length bug
 *		Alan Cox	:	Outgoing firewall on build_xmit
 *		A.N.Kuznetsov	:	IP_OPTIONS support throughout the kernel
 *		Alan Cox	:	Multicast routing hooks
 *		Jos Vos		:	Do accounting *before* call_in_firewall
 *		Willy Konynenberg :	Transparent proxying support
 *
 * To Fix:
 *		IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
 *		and could be made very efficient with the addition of some virtual memory hacks to permit
 *		the allocation of a buffer that can then be 'grown' by twiddling page tables.
 *		Output fragmentation wants updating along with the buffer management to use a single
 *		interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
 *		output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
 *		fragmentation anyway.
 */
1081da177e4SLinus Torvalds
109afd46503SJoe Perches #define pr_fmt(fmt) "IPv4: " fmt
110afd46503SJoe Perches
1111da177e4SLinus Torvalds #include <linux/module.h>
1121da177e4SLinus Torvalds #include <linux/types.h>
1131da177e4SLinus Torvalds #include <linux/kernel.h>
1141da177e4SLinus Torvalds #include <linux/string.h>
1151da177e4SLinus Torvalds #include <linux/errno.h>
1165a0e3ad6STejun Heo #include <linux/slab.h>
1171da177e4SLinus Torvalds
1181da177e4SLinus Torvalds #include <linux/net.h>
1191da177e4SLinus Torvalds #include <linux/socket.h>
1201da177e4SLinus Torvalds #include <linux/sockios.h>
1211da177e4SLinus Torvalds #include <linux/in.h>
1221da177e4SLinus Torvalds #include <linux/inet.h>
12314c85021SArnaldo Carvalho de Melo #include <linux/inetdevice.h>
1241da177e4SLinus Torvalds #include <linux/netdevice.h>
1251da177e4SLinus Torvalds #include <linux/etherdevice.h>
1260e219ae4SPaolo Abeni #include <linux/indirect_call_wrapper.h>
1271da177e4SLinus Torvalds
1281da177e4SLinus Torvalds #include <net/snmp.h>
1291da177e4SLinus Torvalds #include <net/ip.h>
1301da177e4SLinus Torvalds #include <net/protocol.h>
1311da177e4SLinus Torvalds #include <net/route.h>
1321da177e4SLinus Torvalds #include <linux/skbuff.h>
1331da177e4SLinus Torvalds #include <net/sock.h>
1341da177e4SLinus Torvalds #include <net/arp.h>
1351da177e4SLinus Torvalds #include <net/icmp.h>
1361da177e4SLinus Torvalds #include <net/raw.h>
1371da177e4SLinus Torvalds #include <net/checksum.h>
1381f07d03eSEric Dumazet #include <net/inet_ecn.h>
1391da177e4SLinus Torvalds #include <linux/netfilter_ipv4.h>
1401da177e4SLinus Torvalds #include <net/xfrm.h>
1411da177e4SLinus Torvalds #include <linux/mroute.h>
1421da177e4SLinus Torvalds #include <linux/netlink.h>
143f38a9eb1SThomas Graf #include <net/dst_metadata.h>
1441da177e4SLinus Torvalds
1451da177e4SLinus Torvalds /*
14666018506SEric Dumazet * Process Router Attention IP option (RFC 2113)
1471da177e4SLinus Torvalds */
ip_call_ra_chain(struct sk_buff * skb)148ba57b4dbSDavid S. Miller bool ip_call_ra_chain(struct sk_buff *skb)
1491da177e4SLinus Torvalds {
1501da177e4SLinus Torvalds struct ip_ra_chain *ra;
151eddc9ec5SArnaldo Carvalho de Melo u8 protocol = ip_hdr(skb)->protocol;
1521da177e4SLinus Torvalds struct sock *last = NULL;
153cb84663eSDenis V. Lunev struct net_device *dev = skb->dev;
15437fcbab6SEric W. Biederman struct net *net = dev_net(dev);
1551da177e4SLinus Torvalds
1565796ef75SKirill Tkhai for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
1571da177e4SLinus Torvalds struct sock *sk = ra->sk;
1581da177e4SLinus Torvalds
1591da177e4SLinus Torvalds /* If socket is bound to an interface, only report
1601da177e4SLinus Torvalds * the packet if it came from that interface.
1611da177e4SLinus Torvalds */
162c720c7e8SEric Dumazet if (sk && inet_sk(sk)->inet_num == protocol &&
1631da177e4SLinus Torvalds (!sk->sk_bound_dev_if ||
1645796ef75SKirill Tkhai sk->sk_bound_dev_if == dev->ifindex)) {
16556f8a75cSPaul Gortmaker if (ip_is_fragment(ip_hdr(skb))) {
16619bcf9f2SEric W. Biederman if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
167ba57b4dbSDavid S. Miller return true;
1681da177e4SLinus Torvalds }
1691da177e4SLinus Torvalds if (last) {
1701da177e4SLinus Torvalds struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1711da177e4SLinus Torvalds if (skb2)
1721da177e4SLinus Torvalds raw_rcv(last, skb2);
1731da177e4SLinus Torvalds }
1741da177e4SLinus Torvalds last = sk;
1751da177e4SLinus Torvalds }
1761da177e4SLinus Torvalds }
1771da177e4SLinus Torvalds
1781da177e4SLinus Torvalds if (last) {
1791da177e4SLinus Torvalds raw_rcv(last, skb);
180ba57b4dbSDavid S. Miller return true;
1811da177e4SLinus Torvalds }
182ba57b4dbSDavid S. Miller return false;
1831da177e4SLinus Torvalds }
1841da177e4SLinus Torvalds
1850e219ae4SPaolo Abeni INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *));
1860e219ae4SPaolo Abeni INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *));
ip_protocol_deliver_rcu(struct net * net,struct sk_buff * skb,int protocol)18768cb7d53SPaolo Abeni void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
1881da177e4SLinus Torvalds {
18932613090SAlexey Dobriyan const struct net_protocol *ipprot;
19068cb7d53SPaolo Abeni int raw, ret;
1911da177e4SLinus Torvalds
1921da177e4SLinus Torvalds resubmit:
1937bc54c90SPavel Emelyanov raw = raw_local_deliver(skb, protocol);
1947bc54c90SPavel Emelyanov
195f9242b6bSDavid S. Miller ipprot = rcu_dereference(inet_protos[protocol]);
19600db4124SIan Morris if (ipprot) {
197b59c2701SPatrick McHardy if (!ipprot->no_policy) {
198b59c2701SPatrick McHardy if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
19910580c47SMenglong Dong kfree_skb_reason(skb,
20010580c47SMenglong Dong SKB_DROP_REASON_XFRM_POLICY);
20168cb7d53SPaolo Abeni return;
2021da177e4SLinus Torvalds }
203895b5c9fSFlorian Westphal nf_reset_ct(skb);
204b59c2701SPatrick McHardy }
2050e219ae4SPaolo Abeni ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
2060e219ae4SPaolo Abeni skb);
2071da177e4SLinus Torvalds if (ret < 0) {
2081da177e4SLinus Torvalds protocol = -ret;
2091da177e4SLinus Torvalds goto resubmit;
2101da177e4SLinus Torvalds }
211b45386efSEric Dumazet __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
2121da177e4SLinus Torvalds } else {
2137bc54c90SPavel Emelyanov if (!raw) {
2141da177e4SLinus Torvalds if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
215b45386efSEric Dumazet __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
2161da177e4SLinus Torvalds icmp_send(skb, ICMP_DEST_UNREACH,
2171da177e4SLinus Torvalds ICMP_PROT_UNREACH, 0);
2181da177e4SLinus Torvalds }
21910580c47SMenglong Dong kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
220d8c6f4b9SNeil Horman } else {
221b45386efSEric Dumazet __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
222d8c6f4b9SNeil Horman consume_skb(skb);
223d8c6f4b9SNeil Horman }
2241da177e4SLinus Torvalds }
2251da177e4SLinus Torvalds }
22668cb7d53SPaolo Abeni
ip_local_deliver_finish(struct net * net,struct sock * sk,struct sk_buff * skb)22768cb7d53SPaolo Abeni static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
22868cb7d53SPaolo Abeni {
229cd14e9b7SMartin KaFai Lau skb_clear_delivery_time(skb);
23068cb7d53SPaolo Abeni __skb_pull(skb, skb_network_header_len(skb));
23168cb7d53SPaolo Abeni
23268cb7d53SPaolo Abeni rcu_read_lock();
23368cb7d53SPaolo Abeni ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
2341da177e4SLinus Torvalds rcu_read_unlock();
2351da177e4SLinus Torvalds
2361da177e4SLinus Torvalds return 0;
2371da177e4SLinus Torvalds }
2381da177e4SLinus Torvalds
2391da177e4SLinus Torvalds /*
2401da177e4SLinus Torvalds * Deliver IP Packets to the higher protocol layers.
2411da177e4SLinus Torvalds */
ip_local_deliver(struct sk_buff * skb)2421da177e4SLinus Torvalds int ip_local_deliver(struct sk_buff *skb)
2431da177e4SLinus Torvalds {
2441da177e4SLinus Torvalds /*
2451da177e4SLinus Torvalds * Reassemble IP fragments.
2461da177e4SLinus Torvalds */
24719bcf9f2SEric W. Biederman struct net *net = dev_net(skb->dev);
2481da177e4SLinus Torvalds
24956f8a75cSPaul Gortmaker if (ip_is_fragment(ip_hdr(skb))) {
25019bcf9f2SEric W. Biederman if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
2511da177e4SLinus Torvalds return 0;
2521da177e4SLinus Torvalds }
2531da177e4SLinus Torvalds
25429a26a56SEric W. Biederman return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
25519bcf9f2SEric W. Biederman net, NULL, skb, skb->dev, NULL,
2561da177e4SLinus Torvalds ip_local_deliver_finish);
2571da177e4SLinus Torvalds }
258e43b2190SBrian Vazquez EXPORT_SYMBOL(ip_local_deliver);
2591da177e4SLinus Torvalds
ip_rcv_options(struct sk_buff * skb,struct net_device * dev)2608c83f2dfSStephen Suryaputra static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
261d245407eSThomas Graf {
262d245407eSThomas Graf struct ip_options *opt;
263b71d1d42SEric Dumazet const struct iphdr *iph;
264d245407eSThomas Graf
265d245407eSThomas Graf /* It looks as overkill, because not all
266d245407eSThomas Graf IP options require packet mangling.
267d245407eSThomas Graf But it is the easiest for now, especially taking
268d245407eSThomas Graf into account that combination of IP options
269d245407eSThomas Graf and running sniffer is extremely rare condition.
270d245407eSThomas Graf --ANK (980813)
271d245407eSThomas Graf */
272d245407eSThomas Graf if (skb_cow(skb, skb_headroom(skb))) {
273b45386efSEric Dumazet __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS);
274d245407eSThomas Graf goto drop;
275d245407eSThomas Graf }
276d245407eSThomas Graf
277eddc9ec5SArnaldo Carvalho de Melo iph = ip_hdr(skb);
27822aba383SDenis V. Lunev opt = &(IPCB(skb)->opt);
27922aba383SDenis V. Lunev opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
280d245407eSThomas Graf
281c346dca1SYOSHIFUJI Hideaki if (ip_options_compile(dev_net(dev), opt, skb)) {
282b45386efSEric Dumazet __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
283d245407eSThomas Graf goto drop;
284d245407eSThomas Graf }
285d245407eSThomas Graf
286d245407eSThomas Graf if (unlikely(opt->srr)) {
2876e8b11b4SEric Dumazet struct in_device *in_dev = __in_dev_get_rcu(dev);
2886e8b11b4SEric Dumazet
289d245407eSThomas Graf if (in_dev) {
290d245407eSThomas Graf if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
291e87cc472SJoe Perches if (IN_DEV_LOG_MARTIANS(in_dev))
292e87cc472SJoe Perches net_info_ratelimited("source route option %pI4 -> %pI4\n",
293e87cc472SJoe Perches &iph->saddr,
294e87cc472SJoe Perches &iph->daddr);
295d245407eSThomas Graf goto drop;
296d245407eSThomas Graf }
297d245407eSThomas Graf }
298d245407eSThomas Graf
2998c83f2dfSStephen Suryaputra if (ip_options_rcv_srr(skb, dev))
300d245407eSThomas Graf goto drop;
301d245407eSThomas Graf }
302d245407eSThomas Graf
3036a91395fSDavid S. Miller return false;
304d245407eSThomas Graf drop:
3056a91395fSDavid S. Miller return true;
306d245407eSThomas Graf }
307d245407eSThomas Graf
ip_can_use_hint(const struct sk_buff * skb,const struct iphdr * iph,const struct sk_buff * hint)30802b24941SPaolo Abeni static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
30902b24941SPaolo Abeni const struct sk_buff *hint)
31002b24941SPaolo Abeni {
31102b24941SPaolo Abeni return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
31202b24941SPaolo Abeni ip_hdr(hint)->tos == iph->tos;
31302b24941SPaolo Abeni }
31402b24941SPaolo Abeni
31511052589SKuniyuki Iwashima int tcp_v4_early_demux(struct sk_buff *skb);
31611052589SKuniyuki Iwashima int udp_v4_early_demux(struct sk_buff *skb);
ip_rcv_finish_core(struct net * net,struct sk_buff * skb,struct net_device * dev,const struct sk_buff * hint)317*5df7ca0bSYu Tian static int ip_rcv_finish_core(struct net *net,
31802b24941SPaolo Abeni struct sk_buff *skb, struct net_device *dev,
31902b24941SPaolo Abeni const struct sk_buff *hint)
3201da177e4SLinus Torvalds {
321eddc9ec5SArnaldo Carvalho de Melo const struct iphdr *iph = ip_hdr(skb);
322c1f166d1SMenglong Dong int err, drop_reason;
3237487449cSPaolo Abeni struct rtable *rt;
324c1f166d1SMenglong Dong
32502b24941SPaolo Abeni if (ip_can_use_hint(skb, iph, hint)) {
326479aed04SMenglong Dong drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr,
3272b78d306SGuillaume Nault ip4h_dscp(iph), dev, hint);
328479aed04SMenglong Dong if (unlikely(drop_reason))
32902b24941SPaolo Abeni goto drop_error;
33002b24941SPaolo Abeni }
33102b24941SPaolo Abeni
332479aed04SMenglong Dong drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
33311052589SKuniyuki Iwashima if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
33463e51b6aSEric Dumazet !skb_dst(skb) &&
33563e51b6aSEric Dumazet !skb->sk &&
33663e51b6aSEric Dumazet !ip_is_fragment(iph)) {
33711052589SKuniyuki Iwashima switch (iph->protocol) {
33811052589SKuniyuki Iwashima case IPPROTO_TCP:
33911052589SKuniyuki Iwashima if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
34011052589SKuniyuki Iwashima tcp_v4_early_demux(skb);
34141063e9dSDavid S. Miller
3429cb429d6SEric Dumazet /* must reload iph, skb->head might have changed */
3439cb429d6SEric Dumazet iph = ip_hdr(skb);
3449cb429d6SEric Dumazet }
34511052589SKuniyuki Iwashima break;
34611052589SKuniyuki Iwashima case IPPROTO_UDP:
34711052589SKuniyuki Iwashima if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
34811052589SKuniyuki Iwashima err = udp_v4_early_demux(skb);
34911052589SKuniyuki Iwashima if (unlikely(err))
35011052589SKuniyuki Iwashima goto drop_error;
35111052589SKuniyuki Iwashima
35211052589SKuniyuki Iwashima /* must reload iph, skb->head might have changed */
35311052589SKuniyuki Iwashima iph = ip_hdr(skb);
35411052589SKuniyuki Iwashima }
35511052589SKuniyuki Iwashima break;
35611052589SKuniyuki Iwashima }
3576648bd7eSAlexander Duyck }
35841063e9dSDavid S. Miller
359160eb5a6SDavid S. Miller /*
360160eb5a6SDavid S. Miller * Initialise the virtual path cache for the packet. It describes
361160eb5a6SDavid S. Miller * how the packet travels inside Linux networking.
362160eb5a6SDavid S. Miller */
363f38a9eb1SThomas Graf if (!skb_valid_dst(skb)) {
36482d9983eSMenglong Dong drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
36566fb6386SGuillaume Nault ip4h_dscp(iph), dev);
36682d9983eSMenglong Dong if (unlikely(drop_reason))
3677487449cSPaolo Abeni goto drop_error;
36882d9983eSMenglong Dong drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
3693a591318SEyal Birger } else {
3703a591318SEyal Birger struct in_device *in_dev = __in_dev_get_rcu(dev);
3713a591318SEyal Birger
3723a591318SEyal Birger if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY))
3733a591318SEyal Birger IPCB(skb)->flags |= IPSKB_NOPOLICY;
3742c2910a4SDietmar Eggemann }
3751da177e4SLinus Torvalds
376c7066f70SPatrick McHardy #ifdef CONFIG_IP_ROUTE_CLASSID
377adf30907SEric Dumazet if (unlikely(skb_dst(skb)->tclassid)) {
3787a9b2d59SEric Dumazet struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
379adf30907SEric Dumazet u32 idx = skb_dst(skb)->tclassid;
3801da177e4SLinus Torvalds st[idx&0xFF].o_packets++;
3811da177e4SLinus Torvalds st[idx&0xFF].o_bytes += skb->len;
3821da177e4SLinus Torvalds st[(idx>>16)&0xFF].i_packets++;
3831da177e4SLinus Torvalds st[(idx>>16)&0xFF].i_bytes += skb->len;
3841da177e4SLinus Torvalds }
3851da177e4SLinus Torvalds #endif
3861da177e4SLinus Torvalds
3878c83f2dfSStephen Suryaputra if (iph->ihl > 5 && ip_rcv_options(skb, dev))
3881da177e4SLinus Torvalds goto drop;
3891da177e4SLinus Torvalds
390511c3f92SEric Dumazet rt = skb_rtable(skb);
391edf391ffSNeil Horman if (rt->rt_type == RTN_MULTICAST) {
392b15084ecSEric Dumazet __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
39312b74dfaSJohannes Berg } else if (rt->rt_type == RTN_BROADCAST) {
394b15084ecSEric Dumazet __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
39512b74dfaSJohannes Berg } else if (skb->pkt_type == PACKET_BROADCAST ||
39612b74dfaSJohannes Berg skb->pkt_type == PACKET_MULTICAST) {
397d6f64d72SMark Tomlinson struct in_device *in_dev = __in_dev_get_rcu(dev);
39812b74dfaSJohannes Berg
39912b74dfaSJohannes Berg /* RFC 1122 3.3.6:
40012b74dfaSJohannes Berg *
40112b74dfaSJohannes Berg * When a host sends a datagram to a link-layer broadcast
40212b74dfaSJohannes Berg * address, the IP destination address MUST be a legal IP
40312b74dfaSJohannes Berg * broadcast or IP multicast address.
40412b74dfaSJohannes Berg *
40512b74dfaSJohannes Berg * A host SHOULD silently discard a datagram that is received
40612b74dfaSJohannes Berg * via a link-layer broadcast (see Section 2.4) but does not
40712b74dfaSJohannes Berg * specify an IP multicast or broadcast destination address.
40812b74dfaSJohannes Berg *
40912b74dfaSJohannes Berg * This doesn't explicitly say L2 *broadcast*, but broadcast is
41012b74dfaSJohannes Berg * in a way a form of multicast and the most common use case for
41112b74dfaSJohannes Berg * this is 802.11 protecting against cross-station spoofing (the
41212b74dfaSJohannes Berg * so-called "hole-196" attack) so do it for both.
41312b74dfaSJohannes Berg */
41412b74dfaSJohannes Berg if (in_dev &&
415c1f166d1SMenglong Dong IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
416c1f166d1SMenglong Dong drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
41712b74dfaSJohannes Berg goto drop;
41812b74dfaSJohannes Berg }
419c1f166d1SMenglong Dong }
4205506b54bSMitsuru Chinen
4215fa12739SEdward Cree return NET_RX_SUCCESS;
4221da177e4SLinus Torvalds
4231da177e4SLinus Torvalds drop:
424c1f166d1SMenglong Dong kfree_skb_reason(skb, drop_reason);
4251da177e4SLinus Torvalds return NET_RX_DROP;
4267487449cSPaolo Abeni
4277487449cSPaolo Abeni drop_error:
42837653a0bSMenglong Dong if (drop_reason == SKB_DROP_REASON_IP_RPFILTER)
4297487449cSPaolo Abeni __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
4307487449cSPaolo Abeni goto drop;
4311da177e4SLinus Torvalds }
4321da177e4SLinus Torvalds
ip_rcv_finish(struct net * net,struct sock * sk,struct sk_buff * skb)4335fa12739SEdward Cree static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
4345fa12739SEdward Cree {
435a1fd1ad2SDavid Ahern struct net_device *dev = skb->dev;
436efe6aacaSEdward Cree int ret;
4375fa12739SEdward Cree
438efe6aacaSEdward Cree /* if ingress device is enslaved to an L3 master device pass the
439efe6aacaSEdward Cree * skb to its handler for processing
440efe6aacaSEdward Cree */
441efe6aacaSEdward Cree skb = l3mdev_ip_rcv(skb);
442efe6aacaSEdward Cree if (!skb)
443efe6aacaSEdward Cree return NET_RX_SUCCESS;
444efe6aacaSEdward Cree
445*5df7ca0bSYu Tian ret = ip_rcv_finish_core(net, skb, dev, NULL);
4465fa12739SEdward Cree if (ret != NET_RX_DROP)
4475fa12739SEdward Cree ret = dst_input(skb);
4485fa12739SEdward Cree return ret;
4495fa12739SEdward Cree }
4505fa12739SEdward Cree
4511da177e4SLinus Torvalds /*
4521da177e4SLinus Torvalds * Main IP Receive routine.
4531da177e4SLinus Torvalds */
ip_rcv_core(struct sk_buff * skb,struct net * net)45417266ee9SEdward Cree static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
4551da177e4SLinus Torvalds {
456b71d1d42SEric Dumazet const struct iphdr *iph;
45733cba429SMenglong Dong int drop_reason;
45858615242SThomas Graf u32 len;
4591da177e4SLinus Torvalds
4601da177e4SLinus Torvalds /* When the interface is in promisc. mode, drop all the crap
4611da177e4SLinus Torvalds * that it receives, do not try to analyse it.
4621da177e4SLinus Torvalds */
46333cba429SMenglong Dong if (skb->pkt_type == PACKET_OTHERHOST) {
464794c24e9SJeffrey Ji dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
46533cba429SMenglong Dong drop_reason = SKB_DROP_REASON_OTHERHOST;
4661da177e4SLinus Torvalds goto drop;
46733cba429SMenglong Dong }
4681da177e4SLinus Torvalds
469b15084ecSEric Dumazet __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
4701da177e4SLinus Torvalds
47151456b29SIan Morris skb = skb_share_check(skb, GFP_ATOMIC);
47251456b29SIan Morris if (!skb) {
473b45386efSEric Dumazet __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
4741da177e4SLinus Torvalds goto out;
4751da177e4SLinus Torvalds }
4761da177e4SLinus Torvalds
47733cba429SMenglong Dong drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
4781da177e4SLinus Torvalds if (!pskb_may_pull(skb, sizeof(struct iphdr)))
4791da177e4SLinus Torvalds goto inhdr_error;
4801da177e4SLinus Torvalds
481eddc9ec5SArnaldo Carvalho de Melo iph = ip_hdr(skb);
4821da177e4SLinus Torvalds
4831da177e4SLinus Torvalds /*
484c67fa027SJ.H.M. Dassen (Ray) * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
4851da177e4SLinus Torvalds *
4861da177e4SLinus Torvalds * Is the datagram acceptable?
4871da177e4SLinus Torvalds *
4881da177e4SLinus Torvalds * 1. Length at least the size of an ip header
4891da177e4SLinus Torvalds * 2. Version of 4
4901da177e4SLinus Torvalds * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
4911da177e4SLinus Torvalds * 4. Doesn't have a bogus length
4921da177e4SLinus Torvalds */
4931da177e4SLinus Torvalds
4941da177e4SLinus Torvalds if (iph->ihl < 5 || iph->version != 4)
4951da177e4SLinus Torvalds goto inhdr_error;
4961da177e4SLinus Torvalds
4971f07d03eSEric Dumazet BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
4981f07d03eSEric Dumazet BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
4991f07d03eSEric Dumazet BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
50098f61995SEric Dumazet __IP_ADD_STATS(net,
5011f07d03eSEric Dumazet IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
5021f07d03eSEric Dumazet max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
5031f07d03eSEric Dumazet
5041da177e4SLinus Torvalds if (!pskb_may_pull(skb, iph->ihl*4))
5051da177e4SLinus Torvalds goto inhdr_error;
5061da177e4SLinus Torvalds
507eddc9ec5SArnaldo Carvalho de Melo iph = ip_hdr(skb);
5081da177e4SLinus Torvalds
509e9c60422SThomas Graf if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
5106a5dc9e5SEric Dumazet goto csum_error;
5111da177e4SLinus Torvalds
512b1a78b9bSXin Long len = iph_totlen(skb, iph);
513704aed53SMitsuru Chinen if (skb->len < len) {
51433cba429SMenglong Dong drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
515b45386efSEric Dumazet __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
516704aed53SMitsuru Chinen goto drop;
517704aed53SMitsuru Chinen } else if (len < (iph->ihl*4))
5181da177e4SLinus Torvalds goto inhdr_error;
5191da177e4SLinus Torvalds
5201da177e4SLinus Torvalds /* Our transport medium may have padded the buffer out. Now we know it
5211da177e4SLinus Torvalds * is IP we can trim to the true length of the frame.
5221da177e4SLinus Torvalds * Note this now means skb->len holds ntohs(iph->tot_len).
5231da177e4SLinus Torvalds */
5241da177e4SLinus Torvalds if (pskb_trim_rcsum(skb, len)) {
525b45386efSEric Dumazet __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
5261da177e4SLinus Torvalds goto drop;
5271da177e4SLinus Torvalds }
5281da177e4SLinus Torvalds
5296c57f045SRoss Lagerwall iph = ip_hdr(skb);
53021d1196aSEric Dumazet skb->transport_header = skb->network_header + iph->ihl*4;
53121d1196aSEric Dumazet
53253602f92SStephen Hemminger /* Remove any debris in the socket control block */
533d569f1d7SGuillaume Chazarain memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
5340b922b7aSDavid Ahern IPCB(skb)->iif = skb->skb_iif;
53553602f92SStephen Hemminger
53671f9dacdSHerbert Xu /* Must drop socket now because of tproxy. */
537cf7fbe66SJoe Stringer if (!skb_sk_is_prefetched(skb))
53871f9dacdSHerbert Xu skb_orphan(skb);
53971f9dacdSHerbert Xu
54017266ee9SEdward Cree return skb;
5411da177e4SLinus Torvalds
5426a5dc9e5SEric Dumazet csum_error:
54333cba429SMenglong Dong drop_reason = SKB_DROP_REASON_IP_CSUM;
544b45386efSEric Dumazet __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
5451da177e4SLinus Torvalds inhdr_error:
54633cba429SMenglong Dong if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
54733cba429SMenglong Dong drop_reason = SKB_DROP_REASON_IP_INHDR;
548b45386efSEric Dumazet __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
5491da177e4SLinus Torvalds drop:
55033cba429SMenglong Dong kfree_skb_reason(skb, drop_reason);
5511da177e4SLinus Torvalds out:
55217266ee9SEdward Cree return NULL;
55317266ee9SEdward Cree }
55417266ee9SEdward Cree
55517266ee9SEdward Cree /*
55617266ee9SEdward Cree * IP receive entry point
55717266ee9SEdward Cree */
ip_rcv(struct sk_buff * skb,struct net_device * dev,struct packet_type * pt,struct net_device * orig_dev)55817266ee9SEdward Cree int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
55917266ee9SEdward Cree struct net_device *orig_dev)
56017266ee9SEdward Cree {
56117266ee9SEdward Cree struct net *net = dev_net(dev);
56217266ee9SEdward Cree
56317266ee9SEdward Cree skb = ip_rcv_core(skb, net);
56417266ee9SEdward Cree if (skb == NULL)
5651da177e4SLinus Torvalds return NET_RX_DROP;
566fb1b6999SYang Wei
56717266ee9SEdward Cree return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
56817266ee9SEdward Cree net, NULL, skb, dev, NULL,
56917266ee9SEdward Cree ip_rcv_finish);
57017266ee9SEdward Cree }
57117266ee9SEdward Cree
ip_sublist_rcv_finish(struct list_head * head)5725fa12739SEdward Cree static void ip_sublist_rcv_finish(struct list_head *head)
57317266ee9SEdward Cree {
57417266ee9SEdward Cree struct sk_buff *skb, *next;
57517266ee9SEdward Cree
5760761680dSJesper Dangaard Brouer list_for_each_entry_safe(skb, next, head, list) {
577992cba7eSDavid S. Miller skb_list_del_init(skb);
5785fa12739SEdward Cree dst_input(skb);
5795fa12739SEdward Cree }
5800761680dSJesper Dangaard Brouer }
5815fa12739SEdward Cree
ip_extract_route_hint(const struct net * net,struct sk_buff * skb,int rt_type)58202b24941SPaolo Abeni static struct sk_buff *ip_extract_route_hint(const struct net *net,
58302b24941SPaolo Abeni struct sk_buff *skb, int rt_type)
58402b24941SPaolo Abeni {
5856ac66cb0SSriram Yagnaraman if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
5866ac66cb0SSriram Yagnaraman IPCB(skb)->flags & IPSKB_MULTIPATH)
58702b24941SPaolo Abeni return NULL;
58802b24941SPaolo Abeni
58902b24941SPaolo Abeni return skb;
59002b24941SPaolo Abeni }
59102b24941SPaolo Abeni
ip_list_rcv_finish(struct net * net,struct list_head * head)592*5df7ca0bSYu Tian static void ip_list_rcv_finish(struct net *net, struct list_head *head)
5935fa12739SEdward Cree {
59402b24941SPaolo Abeni struct sk_buff *skb, *next, *hint = NULL;
5955fa12739SEdward Cree struct dst_entry *curr_dst = NULL;
596cecbe5c8SHongbo Li LIST_HEAD(sublist);
5975fa12739SEdward Cree
5985fa12739SEdward Cree list_for_each_entry_safe(skb, next, head, list) {
599a1fd1ad2SDavid Ahern struct net_device *dev = skb->dev;
6005fa12739SEdward Cree struct dst_entry *dst;
6015fa12739SEdward Cree
60222f6bbb7SEdward Cree skb_list_del_init(skb);
603efe6aacaSEdward Cree /* if ingress device is enslaved to an L3 master device pass the
604efe6aacaSEdward Cree * skb to its handler for processing
605efe6aacaSEdward Cree */
606efe6aacaSEdward Cree skb = l3mdev_ip_rcv(skb);
607efe6aacaSEdward Cree if (!skb)
608efe6aacaSEdward Cree continue;
609*5df7ca0bSYu Tian if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP)
6105fa12739SEdward Cree continue;
6115fa12739SEdward Cree
6125fa12739SEdward Cree dst = skb_dst(skb);
6135fa12739SEdward Cree if (curr_dst != dst) {
61402b24941SPaolo Abeni hint = ip_extract_route_hint(net, skb,
61505d6d492SEric Dumazet dst_rtable(dst)->rt_type);
61602b24941SPaolo Abeni
6175fa12739SEdward Cree /* dispatch old sublist */
6185fa12739SEdward Cree if (!list_empty(&sublist))
6195fa12739SEdward Cree ip_sublist_rcv_finish(&sublist);
6205fa12739SEdward Cree /* start new sublist */
621a4ca8b7dSEdward Cree INIT_LIST_HEAD(&sublist);
6225fa12739SEdward Cree curr_dst = dst;
6235fa12739SEdward Cree }
624a4ca8b7dSEdward Cree list_add_tail(&skb->list, &sublist);
6255fa12739SEdward Cree }
6265fa12739SEdward Cree /* dispatch final sublist */
627a4ca8b7dSEdward Cree ip_sublist_rcv_finish(&sublist);
6285fa12739SEdward Cree }
6295fa12739SEdward Cree
ip_sublist_rcv(struct list_head * head,struct net_device * dev,struct net * net)6305fa12739SEdward Cree static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
6315fa12739SEdward Cree struct net *net)
6325fa12739SEdward Cree {
63317266ee9SEdward Cree NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
63417266ee9SEdward Cree head, dev, NULL, ip_rcv_finish);
635*5df7ca0bSYu Tian ip_list_rcv_finish(net, head);
63617266ee9SEdward Cree }
63717266ee9SEdward Cree
63817266ee9SEdward Cree /* Receive a list of IP packets */
ip_list_rcv(struct list_head * head,struct packet_type * pt,struct net_device * orig_dev)63917266ee9SEdward Cree void ip_list_rcv(struct list_head *head, struct packet_type *pt,
64017266ee9SEdward Cree struct net_device *orig_dev)
64117266ee9SEdward Cree {
64217266ee9SEdward Cree struct net_device *curr_dev = NULL;
64317266ee9SEdward Cree struct net *curr_net = NULL;
64417266ee9SEdward Cree struct sk_buff *skb, *next;
645cecbe5c8SHongbo Li LIST_HEAD(sublist);
64617266ee9SEdward Cree
64717266ee9SEdward Cree list_for_each_entry_safe(skb, next, head, list) {
64817266ee9SEdward Cree struct net_device *dev = skb->dev;
64917266ee9SEdward Cree struct net *net = dev_net(dev);
65017266ee9SEdward Cree
65122f6bbb7SEdward Cree skb_list_del_init(skb);
65217266ee9SEdward Cree skb = ip_rcv_core(skb, net);
65317266ee9SEdward Cree if (skb == NULL)
65417266ee9SEdward Cree continue;
65517266ee9SEdward Cree
65617266ee9SEdward Cree if (curr_dev != dev || curr_net != net) {
65717266ee9SEdward Cree /* dispatch old sublist */
65817266ee9SEdward Cree if (!list_empty(&sublist))
659a4ca8b7dSEdward Cree ip_sublist_rcv(&sublist, curr_dev, curr_net);
66017266ee9SEdward Cree /* start new sublist */
661a4ca8b7dSEdward Cree INIT_LIST_HEAD(&sublist);
66217266ee9SEdward Cree curr_dev = dev;
66317266ee9SEdward Cree curr_net = net;
66417266ee9SEdward Cree }
665a4ca8b7dSEdward Cree list_add_tail(&skb->list, &sublist);
66617266ee9SEdward Cree }
66717266ee9SEdward Cree /* dispatch final sublist */
66851210ad5SFlorian Westphal if (!list_empty(&sublist))
669a4ca8b7dSEdward Cree ip_sublist_rcv(&sublist, curr_dev, curr_net);
6701da177e4SLinus Torvalds }
671