GitHub Repository: torvalds/linux
Path: blob/master/net/ipv6/ip6_output.c
1
// SPDX-License-Identifier: GPL-2.0-or-later
2
/*
3
* IPv6 output functions
4
* Linux INET6 implementation
5
*
6
* Authors:
7
* Pedro Roque <[email protected]>
8
*
9
* Based on linux/net/ipv4/ip_output.c
10
*
11
* Changes:
12
* A.N.Kuznetsov : arithmetic in fragmentation.
13
* extension headers are implemented.
14
* route changes now work.
15
* ip6_forward does not confuse sniffers.
16
* etc.
17
*
18
* H. von Brand : Added missing #include <linux/string.h>
19
* Imran Patel : frag id should be in NBO
20
* Kazunori MIYAZAWA @USAGI
21
* : add ip6_append_data and related functions
22
* for datagram xmit
23
*/
24
25
#include <linux/errno.h>
26
#include <linux/kernel.h>
27
#include <linux/string.h>
28
#include <linux/socket.h>
29
#include <linux/net.h>
30
#include <linux/netdevice.h>
31
#include <linux/if_arp.h>
32
#include <linux/in6.h>
33
#include <linux/tcp.h>
34
#include <linux/route.h>
35
#include <linux/module.h>
36
#include <linux/slab.h>
37
38
#include <linux/bpf-cgroup.h>
39
#include <linux/netfilter.h>
40
#include <linux/netfilter_ipv6.h>
41
42
#include <net/sock.h>
43
#include <net/snmp.h>
44
45
#include <net/gso.h>
46
#include <net/ipv6.h>
47
#include <net/ndisc.h>
48
#include <net/protocol.h>
49
#include <net/ip6_route.h>
50
#include <net/addrconf.h>
51
#include <net/rawv6.h>
52
#include <net/icmp.h>
53
#include <net/xfrm.h>
54
#include <net/checksum.h>
55
#include <linux/mroute6.h>
56
#include <net/l3mdev.h>
57
#include <net/lwtunnel.h>
58
#include <net/ip_tunnels.h>
59
60
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61
{
62
struct dst_entry *dst = skb_dst(skb);
63
struct net_device *dev = dst_dev_rcu(dst);
64
struct inet6_dev *idev = ip6_dst_idev(dst);
65
unsigned int hh_len = LL_RESERVED_SPACE(dev);
66
const struct in6_addr *daddr, *nexthop;
67
struct ipv6hdr *hdr;
68
struct neighbour *neigh;
69
int ret;
70
71
/* Be paranoid, rather than too clever. */
72
if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73
/* idev stays alive because we hold rcu_read_lock(). */
74
skb = skb_expand_head(skb, hh_len);
75
if (!skb) {
76
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
77
return -ENOMEM;
78
}
79
}
80
81
hdr = ipv6_hdr(skb);
82
daddr = &hdr->daddr;
83
if (ipv6_addr_is_multicast(daddr)) {
84
if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
85
((mroute6_is_socket(net, skb) &&
86
!(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
87
ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
88
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
89
90
/* Do not check for IFF_ALLMULTI; multicast routing
91
is not supported in any case.
92
*/
93
if (newskb)
94
NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
95
net, sk, newskb, NULL, newskb->dev,
96
dev_loopback_xmit);
97
98
if (hdr->hop_limit == 0) {
99
IP6_INC_STATS(net, idev,
100
IPSTATS_MIB_OUTDISCARDS);
101
kfree_skb(skb);
102
return 0;
103
}
104
}
105
106
IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
107
if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
108
!(dev->flags & IFF_LOOPBACK)) {
109
kfree_skb(skb);
110
return 0;
111
}
112
}
113
114
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
115
int res = lwtunnel_xmit(skb);
116
117
if (res != LWTUNNEL_XMIT_CONTINUE)
118
return res;
119
}
120
121
IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
122
123
nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
124
neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
125
126
if (IS_ERR_OR_NULL(neigh)) {
127
if (unlikely(!neigh))
128
neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
129
if (IS_ERR(neigh)) {
130
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
131
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
132
return -EINVAL;
133
}
134
}
135
sock_confirm_neigh(skb, neigh);
136
ret = neigh_output(neigh, skb, false);
137
return ret;
138
}
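/*
 * Note: ip6_finish_output2() is the last IPv6-level step before the device
 * layer. Under RCU it resolves the next hop to a neighbour entry
 * (__ipv6_neigh_lookup_noref(), creating one with __neigh_create() if
 * needed) and hands the skb to neigh_output(), which transmits it either
 * via the cached hardware header or through the neighbour state machine.
 */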
139
140
static int
141
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142
struct sk_buff *skb, unsigned int mtu)
143
{
144
struct sk_buff *segs, *nskb;
145
netdev_features_t features;
146
int ret = 0;
147
148
/* Please see corresponding comment in ip_finish_output_gso
149
* describing the cases where GSO segment length exceeds the
150
* egress MTU.
151
*/
152
features = netif_skb_features(skb);
153
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154
if (IS_ERR_OR_NULL(segs)) {
155
kfree_skb(skb);
156
return -ENOMEM;
157
}
158
159
consume_skb(skb);
160
161
skb_list_walk_safe(segs, segs, nskb) {
162
int err;
163
164
skb_mark_not_on_list(segs);
165
/* Last GSO segment can be smaller than gso_size (and MTU).
166
* Adding a fragment header would produce an "atomic fragment",
167
* which is considered harmful (RFC-8021). Avoid that.
168
*/
169
err = segs->len > mtu ?
170
ip6_fragment(net, sk, segs, ip6_finish_output2) :
171
ip6_finish_output2(net, sk, segs);
172
if (err && ret == 0)
173
ret = err;
174
}
175
176
return ret;
177
}
178
179
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
180
struct sk_buff *skb, unsigned int mtu)
181
{
182
if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
183
!skb_gso_validate_network_len(skb, mtu))
184
return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
185
186
return ip6_finish_output2(net, sk, skb);
187
}
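/*
 * Note: a GSO skb is passed through untouched when it is a BIG TCP "fake
 * jumbogram" (IP6SKB_FAKEJUMBO) or when every resulting segment fits the
 * egress MTU; otherwise it is software-segmented above and any segment
 * still larger than the MTU is fragmented on the slow path.
 */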
188
189
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
190
{
191
unsigned int mtu;
192
193
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
194
/* Policy lookup after SNAT yielded a new policy */
195
if (skb_dst(skb)->xfrm) {
196
IP6CB(skb)->flags |= IP6SKB_REROUTED;
197
return dst_output(net, sk, skb);
198
}
199
#endif
200
201
mtu = ip6_skb_dst_mtu(skb);
202
if (skb_is_gso(skb))
203
return ip6_finish_output_gso(net, sk, skb, mtu);
204
205
if (skb->len > mtu ||
206
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
207
return ip6_fragment(net, sk, skb, ip6_finish_output2);
208
209
return ip6_finish_output2(net, sk, skb);
210
}
211
212
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
213
{
214
int ret;
215
216
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
217
switch (ret) {
218
case NET_XMIT_SUCCESS:
219
case NET_XMIT_CN:
220
return __ip6_finish_output(net, sk, skb) ? : ret;
221
default:
222
kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
223
return ret;
224
}
225
}
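/*
 * Note: the "?:" above returns the error from __ip6_finish_output() when it
 * is non-zero and otherwise propagates the BPF verdict (NET_XMIT_SUCCESS or
 * NET_XMIT_CN). Any other verdict from the cgroup egress program drops the
 * packet with SKB_DROP_REASON_BPF_CGROUP_EGRESS.
 */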
226
227
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
228
{
229
struct dst_entry *dst = skb_dst(skb);
230
struct net_device *dev, *indev = skb->dev;
231
struct inet6_dev *idev;
232
int ret;
233
234
skb->protocol = htons(ETH_P_IPV6);
235
rcu_read_lock();
236
dev = dst_dev_rcu(dst);
237
idev = ip6_dst_idev(dst);
238
skb->dev = dev;
239
240
if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
241
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
242
rcu_read_unlock();
243
kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
244
return 0;
245
}
246
247
ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
248
net, sk, skb, indev, dev,
249
ip6_finish_output,
250
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
251
rcu_read_unlock();
252
return ret;
253
}
254
EXPORT_SYMBOL(ip6_output);
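/*
 * Call-chain sketch (approximate): locally generated packets typically
 * reach this file as
 *
 *	ip6_local_out() -> dst_output() -> ip6_output()
 *	  -> NF_INET_POST_ROUTING -> ip6_finish_output()
 *	  -> __ip6_finish_output() (GSO / fragmentation decision)
 *	  -> ip6_finish_output2() -> neigh_output() -> device queue
 *
 * Forwarded packets enter at ip6_forward() and join the same tail of the
 * chain through dst_output() in ip6_forward_finish().
 */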
255
256
bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
257
{
258
if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
259
return ip6_default_np_autolabel(net);
260
return inet6_test_bit(AUTOFLOWLABEL, sk);
261
}
262
263
/*
264
* xmit an sk_buff (used by TCP and SCTP)
265
* Note : socket lock is not held for SYNACK packets, but might be modified
266
* by calls to skb_set_owner_w() and ipv6_local_error(),
267
* which are using proper atomic operations or spinlocks.
268
*/
269
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
270
__u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
271
{
272
const struct ipv6_pinfo *np = inet6_sk(sk);
273
struct in6_addr *first_hop = &fl6->daddr;
274
struct dst_entry *dst = skb_dst(skb);
275
struct inet6_dev *idev = ip6_dst_idev(dst);
276
struct hop_jumbo_hdr *hop_jumbo;
277
int hoplen = sizeof(*hop_jumbo);
278
struct net *net = sock_net(sk);
279
unsigned int head_room;
280
struct net_device *dev;
281
struct ipv6hdr *hdr;
282
u8 proto = fl6->flowi6_proto;
283
int seg_len = skb->len;
284
int ret, hlimit = -1;
285
u32 mtu;
286
287
rcu_read_lock();
288
289
dev = dst_dev_rcu(dst);
290
head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
291
if (opt)
292
head_room += opt->opt_nflen + opt->opt_flen;
293
294
if (unlikely(head_room > skb_headroom(skb))) {
295
/* idev stays alive while we hold rcu_read_lock(). */
296
skb = skb_expand_head(skb, head_room);
297
if (!skb) {
298
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
299
ret = -ENOBUFS;
300
goto unlock;
301
}
302
}
303
304
if (opt) {
305
seg_len += opt->opt_nflen + opt->opt_flen;
306
307
if (opt->opt_flen)
308
ipv6_push_frag_opts(skb, opt, &proto);
309
310
if (opt->opt_nflen)
311
ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
312
&fl6->saddr);
313
}
314
315
if (unlikely(seg_len > IPV6_MAXPLEN)) {
316
hop_jumbo = skb_push(skb, hoplen);
317
318
hop_jumbo->nexthdr = proto;
319
hop_jumbo->hdrlen = 0;
320
hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
321
hop_jumbo->tlv_len = 4;
322
hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
323
324
proto = IPPROTO_HOPOPTS;
325
seg_len = 0;
326
IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
327
}
328
329
skb_push(skb, sizeof(struct ipv6hdr));
330
skb_reset_network_header(skb);
331
hdr = ipv6_hdr(skb);
332
333
/*
334
* Fill in the IPv6 header
335
*/
336
if (np)
337
hlimit = READ_ONCE(np->hop_limit);
338
if (hlimit < 0)
339
hlimit = ip6_dst_hoplimit(dst);
340
341
ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
342
ip6_autoflowlabel(net, sk), fl6));
343
344
hdr->payload_len = htons(seg_len);
345
hdr->nexthdr = proto;
346
hdr->hop_limit = hlimit;
347
348
hdr->saddr = fl6->saddr;
349
hdr->daddr = *first_hop;
350
351
skb->protocol = htons(ETH_P_IPV6);
352
skb->priority = priority;
353
skb->mark = mark;
354
355
mtu = dst_mtu(dst);
356
if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
357
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
358
359
/* if egress device is enslaved to an L3 master device pass the
360
* skb to its handler for processing
361
*/
362
skb = l3mdev_ip6_out((struct sock *)sk, skb);
363
if (unlikely(!skb)) {
364
ret = 0;
365
goto unlock;
366
}
367
368
/* hooks should never assume socket lock is held.
369
* we promote our socket to non const
370
*/
371
ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
372
net, (struct sock *)sk, skb, NULL, dev,
373
dst_output);
374
goto unlock;
375
}
376
377
ret = -EMSGSIZE;
378
skb->dev = dev;
379
/* ipv6_local_error() does not require socket lock,
380
* we promote our socket to non const
381
*/
382
ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
383
384
IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
385
kfree_skb(skb);
386
unlock:
387
rcu_read_unlock();
388
return ret;
389
}
390
EXPORT_SYMBOL(ip6_xmit);
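/*
 * Usage sketch (illustrative, approximate - see the TCP/SCTP callers for
 * the exact invocations): a stream protocol hands ip6_xmit() an skb that
 * already carries the transport header and has a dst attached, e.g.
 *
 *	skb_dst_set_noref(skb, dst);
 *	err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, tclass, priority);
 *
 * ip6_xmit() then prepends any extension headers and the IPv6 header,
 * inserts a Hop-by-Hop jumbo option if the payload exceeds IPV6_MAXPLEN,
 * and sends the packet through NF_INET_LOCAL_OUT / dst_output().
 */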
391
392
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
393
{
394
struct ip6_ra_chain *ra;
395
struct sock *last = NULL;
396
397
read_lock(&ip6_ra_lock);
398
for (ra = ip6_ra_chain; ra; ra = ra->next) {
399
struct sock *sk = ra->sk;
400
if (sk && ra->sel == sel &&
401
(!sk->sk_bound_dev_if ||
402
sk->sk_bound_dev_if == skb->dev->ifindex)) {
403
404
if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
405
!net_eq(sock_net(sk), dev_net(skb->dev))) {
406
continue;
407
}
408
if (last) {
409
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
410
if (skb2)
411
rawv6_rcv(last, skb2);
412
}
413
last = sk;
414
}
415
}
416
417
if (last) {
418
rawv6_rcv(last, skb);
419
read_unlock(&ip6_ra_lock);
420
return 1;
421
}
422
read_unlock(&ip6_ra_lock);
423
return 0;
424
}
425
426
static int ip6_forward_proxy_check(struct sk_buff *skb)
427
{
428
struct ipv6hdr *hdr = ipv6_hdr(skb);
429
u8 nexthdr = hdr->nexthdr;
430
__be16 frag_off;
431
int offset;
432
433
if (ipv6_ext_hdr(nexthdr)) {
434
offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
435
if (offset < 0)
436
return 0;
437
} else
438
offset = sizeof(struct ipv6hdr);
439
440
if (nexthdr == IPPROTO_ICMPV6) {
441
struct icmp6hdr *icmp6;
442
443
if (!pskb_may_pull(skb, (skb_network_header(skb) +
444
offset + 1 - skb->data)))
445
return 0;
446
447
icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
448
449
switch (icmp6->icmp6_type) {
450
case NDISC_ROUTER_SOLICITATION:
451
case NDISC_ROUTER_ADVERTISEMENT:
452
case NDISC_NEIGHBOUR_SOLICITATION:
453
case NDISC_NEIGHBOUR_ADVERTISEMENT:
454
case NDISC_REDIRECT:
455
/* For a reaction involving a unicast neighbor discovery
456
* message destined to the proxied address, pass it to
457
* the input function.
458
*/
459
return 1;
460
default:
461
break;
462
}
463
}
464
465
/*
466
* The proxying router can't forward traffic sent to a link-local
467
* address, so signal the sender and discard the packet. This
468
* behavior is clarified by the MIPv6 specification.
469
*/
470
if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
471
dst_link_failure(skb);
472
return -1;
473
}
474
475
return 0;
476
}
477
478
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
479
struct sk_buff *skb)
480
{
481
#ifdef CONFIG_NET_SWITCHDEV
482
if (skb->offload_l3_fwd_mark) {
483
consume_skb(skb);
484
return 0;
485
}
486
#endif
487
488
skb_clear_tstamp(skb);
489
return dst_output(net, sk, skb);
490
}
491
492
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
493
{
494
if (skb->len <= mtu)
495
return false;
496
497
/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
498
if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
499
return true;
500
501
if (skb->ignore_df)
502
return false;
503
504
if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
505
return false;
506
507
return true;
508
}
509
510
int ip6_forward(struct sk_buff *skb)
511
{
512
struct dst_entry *dst = skb_dst(skb);
513
struct ipv6hdr *hdr = ipv6_hdr(skb);
514
struct inet6_skb_parm *opt = IP6CB(skb);
515
struct net *net = dev_net(dst_dev(dst));
516
struct net_device *dev;
517
struct inet6_dev *idev;
518
SKB_DR(reason);
519
u32 mtu;
520
521
idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
522
if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
523
(!idev || !READ_ONCE(idev->cnf.force_forwarding)))
524
goto error;
525
526
if (skb->pkt_type != PACKET_HOST)
527
goto drop;
528
529
if (unlikely(skb->sk))
530
goto drop;
531
532
if (skb_warn_if_lro(skb))
533
goto drop;
534
535
if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
536
(!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
537
!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
538
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
539
goto drop;
540
}
541
542
skb_forward_csum(skb);
543
544
/*
545
* We DO NOT do any processing on
546
* RA packets, pushing them to user level AS IS
547
* without any WARRANTY that the application will be able
548
* to interpret them. The reason is that we
549
* cannot do anything clever here.
550
*
551
* We are not an end node, so if the packet contains
552
* AH/ESP, we cannot do anything.
553
* Defragmentation would also be a mistake; RA packets
554
* cannot be fragmented, because there is no guarantee
555
* that different fragments will go along one path. --ANK
556
*/
557
if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
558
if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
559
return 0;
560
}
561
562
/*
563
* check and decrement ttl
564
*/
565
if (hdr->hop_limit <= 1) {
566
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
567
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
568
569
kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
570
return -ETIMEDOUT;
571
}
572
573
/* XXX: idev->cnf.proxy_ndp? */
574
if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
575
pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
576
int proxied = ip6_forward_proxy_check(skb);
577
if (proxied > 0) {
578
/* It's tempting to decrease the hop limit
579
* here by 1, as we do at the end of the
580
* function too.
581
*
582
* But that would be incorrect, as proxying is
583
* not forwarding. The ip6_input function
584
* will handle this packet locally, and it
585
* depends on the hop limit being unchanged.
586
*
587
* One example is the NDP hop limit, that
588
* always has to stay 255, but other would be
589
* similar checks around RA packets, where the
590
* user can even change the desired limit.
591
*/
592
return ip6_input(skb);
593
} else if (proxied < 0) {
594
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
595
goto drop;
596
}
597
}
598
599
if (!xfrm6_route_forward(skb)) {
600
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
601
SKB_DR_SET(reason, XFRM_POLICY);
602
goto drop;
603
}
604
dst = skb_dst(skb);
605
dev = dst_dev(dst);
606
/* IPv6 specs say nothing about it, but it is clear that we cannot
607
send redirects to source routed frames.
608
We don't send redirects to frames decapsulated from IPsec.
609
*/
610
if (IP6CB(skb)->iif == dev->ifindex &&
611
opt->srcrt == 0 && !skb_sec_path(skb)) {
612
struct in6_addr *target = NULL;
613
struct inet_peer *peer;
614
struct rt6_info *rt;
615
616
/*
617
* incoming and outgoing devices are the same
618
* send a redirect.
619
*/
620
621
rt = dst_rt6_info(dst);
622
if (rt->rt6i_flags & RTF_GATEWAY)
623
target = &rt->rt6i_gateway;
624
else
625
target = &hdr->daddr;
626
627
rcu_read_lock();
628
peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);
629
630
/* Limit redirects both by destination (here)
631
and by source (inside ndisc_send_redirect)
632
*/
633
if (inet_peer_xrlim_allow(peer, 1*HZ))
634
ndisc_send_redirect(skb, target);
635
rcu_read_unlock();
636
} else {
637
int addrtype = ipv6_addr_type(&hdr->saddr);
638
639
/* This check is security critical. */
640
if (addrtype == IPV6_ADDR_ANY ||
641
addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
642
goto error;
643
if (addrtype & IPV6_ADDR_LINKLOCAL) {
644
icmpv6_send(skb, ICMPV6_DEST_UNREACH,
645
ICMPV6_NOT_NEIGHBOUR, 0);
646
goto error;
647
}
648
}
649
650
__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
651
652
mtu = ip6_dst_mtu_maybe_forward(dst, true);
653
if (mtu < IPV6_MIN_MTU)
654
mtu = IPV6_MIN_MTU;
655
656
if (ip6_pkt_too_big(skb, mtu)) {
657
/* Again, force OUTPUT device used as source address */
658
skb->dev = dev;
659
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
660
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
661
__IP6_INC_STATS(net, ip6_dst_idev(dst),
662
IPSTATS_MIB_FRAGFAILS);
663
kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
664
return -EMSGSIZE;
665
}
666
667
if (skb_cow(skb, dev->hard_header_len)) {
668
__IP6_INC_STATS(net, ip6_dst_idev(dst),
669
IPSTATS_MIB_OUTDISCARDS);
670
goto drop;
671
}
672
673
hdr = ipv6_hdr(skb);
674
675
/* Mangling hops number delayed to point after skb COW */
676
677
hdr->hop_limit--;
678
679
return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
680
net, NULL, skb, skb->dev, dev,
681
ip6_forward_finish);
682
683
error:
684
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
685
SKB_DR_SET(reason, IP_INADDRERRORS);
686
drop:
687
kfree_skb_reason(skb, reason);
688
return -EINVAL;
689
}
690
691
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
692
{
693
to->pkt_type = from->pkt_type;
694
to->priority = from->priority;
695
to->protocol = from->protocol;
696
skb_dst_drop(to);
697
skb_dst_set(to, dst_clone(skb_dst(from)));
698
to->dev = from->dev;
699
to->mark = from->mark;
700
701
skb_copy_hash(to, from);
702
703
#ifdef CONFIG_NET_SCHED
704
to->tc_index = from->tc_index;
705
#endif
706
nf_copy(to, from);
707
skb_ext_copy(to, from);
708
skb_copy_secmark(to, from);
709
}
710
711
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
712
u8 nexthdr, __be32 frag_id,
713
struct ip6_fraglist_iter *iter)
714
{
715
unsigned int first_len;
716
struct frag_hdr *fh;
717
718
/* BUILD HEADER */
719
*prevhdr = NEXTHDR_FRAGMENT;
720
iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
721
if (!iter->tmp_hdr)
722
return -ENOMEM;
723
724
iter->frag = skb_shinfo(skb)->frag_list;
725
skb_frag_list_init(skb);
726
727
iter->offset = 0;
728
iter->hlen = hlen;
729
iter->frag_id = frag_id;
730
iter->nexthdr = nexthdr;
731
732
__skb_pull(skb, hlen);
733
fh = __skb_push(skb, sizeof(struct frag_hdr));
734
__skb_push(skb, hlen);
735
skb_reset_network_header(skb);
736
memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
737
738
fh->nexthdr = nexthdr;
739
fh->reserved = 0;
740
fh->frag_off = htons(IP6_MF);
741
fh->identification = frag_id;
742
743
first_len = skb_pagelen(skb);
744
skb->data_len = first_len - skb_headlen(skb);
745
skb->len = first_len;
746
ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
747
748
return 0;
749
}
750
EXPORT_SYMBOL(ip6_fraglist_init);
751
752
void ip6_fraglist_prepare(struct sk_buff *skb,
753
struct ip6_fraglist_iter *iter)
754
{
755
struct sk_buff *frag = iter->frag;
756
unsigned int hlen = iter->hlen;
757
struct frag_hdr *fh;
758
759
frag->ip_summed = CHECKSUM_NONE;
760
skb_reset_transport_header(frag);
761
fh = __skb_push(frag, sizeof(struct frag_hdr));
762
__skb_push(frag, hlen);
763
skb_reset_network_header(frag);
764
memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
765
iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
766
fh->nexthdr = iter->nexthdr;
767
fh->reserved = 0;
768
fh->frag_off = htons(iter->offset);
769
if (frag->next)
770
fh->frag_off |= htons(IP6_MF);
771
fh->identification = iter->frag_id;
772
ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
773
ip6_copy_metadata(frag, skb);
774
}
775
EXPORT_SYMBOL(ip6_fraglist_prepare);
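/*
 * Usage sketch (illustrative): the fast fragmentation path reuses an
 * existing frag_list instead of copying data, in the pattern used by
 * ip6_fragment() below:
 *
 *	err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter);
 *	for (;;) {
 *		if (iter.frag)
 *			ip6_fraglist_prepare(skb, &iter);
 *		err = output(net, sk, skb);
 *		if (err || !iter.frag)
 *			break;
 *		skb = ip6_fraglist_next(&iter);
 *	}
 *	kfree(iter.tmp_hdr);
 */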
776
777
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
778
unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
779
u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
780
{
781
state->prevhdr = prevhdr;
782
state->nexthdr = nexthdr;
783
state->frag_id = frag_id;
784
785
state->hlen = hlen;
786
state->mtu = mtu;
787
788
state->left = skb->len - hlen; /* Space per frame */
789
state->ptr = hlen; /* Where to start from */
790
791
state->hroom = hdr_room;
792
state->troom = needed_tailroom;
793
794
state->offset = 0;
795
}
796
EXPORT_SYMBOL(ip6_frag_init);
797
798
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
799
{
800
u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
801
struct sk_buff *frag;
802
struct frag_hdr *fh;
803
unsigned int len;
804
805
len = state->left;
806
/* IF: it doesn't fit, use 'mtu' - the data space left */
807
if (len > state->mtu)
808
len = state->mtu;
809
/* IF: we are not sending up to and including the packet end
810
then align the next start on an eight byte boundary */
811
if (len < state->left)
812
len &= ~7;
813
814
/* Allocate buffer */
815
frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
816
state->hroom + state->troom, GFP_ATOMIC);
817
if (!frag)
818
return ERR_PTR(-ENOMEM);
819
820
/*
821
* Set up data on packet
822
*/
823
824
ip6_copy_metadata(frag, skb);
825
skb_reserve(frag, state->hroom);
826
skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
827
skb_reset_network_header(frag);
828
fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
829
frag->transport_header = (frag->network_header + state->hlen +
830
sizeof(struct frag_hdr));
831
832
/*
833
* Charge the memory for the fragment to any owner
834
* it might possess
835
*/
836
if (skb->sk)
837
skb_set_owner_w(frag, skb->sk);
838
839
/*
840
* Copy the packet header into the new buffer.
841
*/
842
skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
843
844
fragnexthdr_offset = skb_network_header(frag);
845
fragnexthdr_offset += prevhdr - skb_network_header(skb);
846
*fragnexthdr_offset = NEXTHDR_FRAGMENT;
847
848
/*
849
* Build fragment header.
850
*/
851
fh->nexthdr = state->nexthdr;
852
fh->reserved = 0;
853
fh->identification = state->frag_id;
854
855
/*
856
* Copy a block of the IP datagram.
857
*/
858
BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
859
len));
860
state->left -= len;
861
862
fh->frag_off = htons(state->offset);
863
if (state->left > 0)
864
fh->frag_off |= htons(IP6_MF);
865
ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
866
867
state->ptr += len;
868
state->offset += len;
869
870
return frag;
871
}
872
EXPORT_SYMBOL(ip6_frag_next);
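/*
 * Usage sketch (illustrative): the slow fragmentation path allocates a
 * fresh skb per fragment and copies data into it, as in ip6_fragment()
 * below:
 *
 *	ip6_frag_init(skb, hlen, mtu, tailroom, hroom, prevhdr, nexthdr,
 *		      frag_id, &state);
 *	while (state.left > 0) {
 *		frag = ip6_frag_next(skb, &state);
 *		if (IS_ERR(frag))
 *			break;
 *		err = output(net, sk, frag);
 *	}
 *	consume_skb(skb);
 */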
873
874
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
875
int (*output)(struct net *, struct sock *, struct sk_buff *))
876
{
877
struct sk_buff *frag;
878
struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
879
struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
880
inet6_sk(skb->sk) : NULL;
881
u8 tstamp_type = skb->tstamp_type;
882
struct ip6_frag_state state;
883
unsigned int mtu, hlen, nexthdr_offset;
884
ktime_t tstamp = skb->tstamp;
885
int hroom, err = 0;
886
__be32 frag_id;
887
u8 *prevhdr, nexthdr = 0;
888
889
err = ip6_find_1stfragopt(skb, &prevhdr);
890
if (err < 0)
891
goto fail;
892
hlen = err;
893
nexthdr = *prevhdr;
894
nexthdr_offset = prevhdr - skb_network_header(skb);
895
896
mtu = ip6_skb_dst_mtu(skb);
897
898
/* We must not fragment if the socket is set to force MTU discovery
899
* or if the skb is not generated by a local socket.
900
*/
901
if (unlikely(!skb->ignore_df && skb->len > mtu))
902
goto fail_toobig;
903
904
if (IP6CB(skb)->frag_max_size) {
905
if (IP6CB(skb)->frag_max_size > mtu)
906
goto fail_toobig;
907
908
/* don't send fragments larger than what we received */
909
mtu = IP6CB(skb)->frag_max_size;
910
if (mtu < IPV6_MIN_MTU)
911
mtu = IPV6_MIN_MTU;
912
}
913
914
if (np) {
915
u32 frag_size = READ_ONCE(np->frag_size);
916
917
if (frag_size && frag_size < mtu)
918
mtu = frag_size;
919
}
920
if (mtu < hlen + sizeof(struct frag_hdr) + 8)
921
goto fail_toobig;
922
mtu -= hlen + sizeof(struct frag_hdr);
923
924
frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
925
&ipv6_hdr(skb)->saddr);
926
927
if (skb->ip_summed == CHECKSUM_PARTIAL &&
928
(err = skb_checksum_help(skb)))
929
goto fail;
930
931
prevhdr = skb_network_header(skb) + nexthdr_offset;
932
hroom = LL_RESERVED_SPACE(rt->dst.dev);
933
if (skb_has_frag_list(skb)) {
934
unsigned int first_len = skb_pagelen(skb);
935
struct ip6_fraglist_iter iter;
936
struct sk_buff *frag2;
937
938
if (first_len - hlen > mtu ||
939
((first_len - hlen) & 7) ||
940
skb_cloned(skb) ||
941
skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
942
goto slow_path;
943
944
skb_walk_frags(skb, frag) {
945
/* Correct geometry. */
946
if (frag->len > mtu ||
947
((frag->len & 7) && frag->next) ||
948
skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
949
goto slow_path_clean;
950
951
/* Partially cloned skb? */
952
if (skb_shared(frag))
953
goto slow_path_clean;
954
955
BUG_ON(frag->sk);
956
if (skb->sk) {
957
frag->sk = skb->sk;
958
frag->destructor = sock_wfree;
959
}
960
skb->truesize -= frag->truesize;
961
}
962
963
err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
964
&iter);
965
if (err < 0)
966
goto fail;
967
968
/* We prevent @rt from being freed. */
969
rcu_read_lock();
970
971
for (;;) {
972
/* Prepare header of the next frame,
973
* before previous one went down. */
974
if (iter.frag)
975
ip6_fraglist_prepare(skb, &iter);
976
977
skb_set_delivery_time(skb, tstamp, tstamp_type);
978
err = output(net, sk, skb);
979
if (!err)
980
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
981
IPSTATS_MIB_FRAGCREATES);
982
983
if (err || !iter.frag)
984
break;
985
986
skb = ip6_fraglist_next(&iter);
987
}
988
989
kfree(iter.tmp_hdr);
990
991
if (err == 0) {
992
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
993
IPSTATS_MIB_FRAGOKS);
994
rcu_read_unlock();
995
return 0;
996
}
997
998
kfree_skb_list(iter.frag);
999
1000
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
1001
IPSTATS_MIB_FRAGFAILS);
1002
rcu_read_unlock();
1003
return err;
1004
1005
slow_path_clean:
1006
skb_walk_frags(skb, frag2) {
1007
if (frag2 == frag)
1008
break;
1009
frag2->sk = NULL;
1010
frag2->destructor = NULL;
1011
skb->truesize += frag2->truesize;
1012
}
1013
}
1014
1015
slow_path:
1016
/*
1017
* Fragment the datagram.
1018
*/
1019
1020
ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
1021
LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
1022
&state);
1023
1024
/*
1025
* Keep copying data until we run out.
1026
*/
1027
1028
while (state.left > 0) {
1029
frag = ip6_frag_next(skb, &state);
1030
if (IS_ERR(frag)) {
1031
err = PTR_ERR(frag);
1032
goto fail;
1033
}
1034
1035
/*
1036
* Put this fragment into the sending queue.
1037
*/
1038
skb_set_delivery_time(frag, tstamp, tstamp_type);
1039
err = output(net, sk, frag);
1040
if (err)
1041
goto fail;
1042
1043
IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1044
IPSTATS_MIB_FRAGCREATES);
1045
}
1046
IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1047
IPSTATS_MIB_FRAGOKS);
1048
consume_skb(skb);
1049
return err;
1050
1051
fail_toobig:
1052
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1053
err = -EMSGSIZE;
1054
1055
fail:
1056
IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1057
IPSTATS_MIB_FRAGFAILS);
1058
kfree_skb(skb);
1059
return err;
1060
}
1061
1062
static inline int ip6_rt_check(const struct rt6key *rt_key,
1063
const struct in6_addr *fl_addr,
1064
const struct in6_addr *addr_cache)
1065
{
1066
return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1067
(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1068
}
1069
1070
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1071
struct dst_entry *dst,
1072
const struct flowi6 *fl6)
1073
{
1074
struct ipv6_pinfo *np = inet6_sk(sk);
1075
struct rt6_info *rt;
1076
1077
if (!dst)
1078
goto out;
1079
1080
if (dst->ops->family != AF_INET6) {
1081
dst_release(dst);
1082
return NULL;
1083
}
1084
1085
rt = dst_rt6_info(dst);
1086
/* Yes, checking route validity in the non-connected
1087
* case is not very simple. Take into account,
1088
* that we do not support routing by source, TOS,
1089
* and MSG_DONTROUTE --ANK (980726)
1090
*
1091
* 1. ip6_rt_check(): If route was host route,
1092
* check that cached destination is current.
1093
* If it is network route, we still may
1094
* check its validity using saved pointer
1095
* to the last used address: daddr_cache.
1096
* We do not want to save whole address now,
1097
* (because main consumer of this service
1098
* is TCP, which does not have this problem),
1099
* so that the last trick works only on connected
1100
* sockets.
1101
* 2. oif also should be the same.
1102
*/
1103
if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
1104
np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
1105
#ifdef CONFIG_IPV6_SUBTREES
1106
ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
1107
np->saddr_cache ? &np->saddr : NULL) ||
1108
#endif
1109
(fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
1110
dst_release(dst);
1111
dst = NULL;
1112
}
1113
1114
out:
1115
return dst;
1116
}
1117
1118
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1119
struct dst_entry **dst, struct flowi6 *fl6)
1120
{
1121
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1122
struct neighbour *n;
1123
struct rt6_info *rt;
1124
#endif
1125
int err;
1126
int flags = 0;
1127
1128
/* The correct way to handle this would be to do
1129
* ip6_route_get_saddr, and then ip6_route_output; however,
1130
* the route-specific preferred source forces the
1131
* ip6_route_output call _before_ ip6_route_get_saddr.
1132
*
1133
* In source specific routing (no src=any default route),
1134
* ip6_route_output will fail given src=any saddr, though, so
1135
* that's why we try it again later.
1136
*/
1137
if (ipv6_addr_any(&fl6->saddr)) {
1138
struct fib6_info *from;
1139
struct rt6_info *rt;
1140
1141
*dst = ip6_route_output(net, sk, fl6);
1142
rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
1143
1144
rcu_read_lock();
1145
from = rt ? rcu_dereference(rt->from) : NULL;
1146
err = ip6_route_get_saddr(net, from, &fl6->daddr,
1147
sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1148
fl6->flowi6_l3mdev,
1149
&fl6->saddr);
1150
rcu_read_unlock();
1151
1152
if (err)
1153
goto out_err_release;
1154
1155
/* If we had an erroneous initial result, pretend it
1156
* never existed and let the SA-enabled version take
1157
* over.
1158
*/
1159
if ((*dst)->error) {
1160
dst_release(*dst);
1161
*dst = NULL;
1162
}
1163
1164
if (fl6->flowi6_oif)
1165
flags |= RT6_LOOKUP_F_IFACE;
1166
}
1167
1168
if (!*dst)
1169
*dst = ip6_route_output_flags(net, sk, fl6, flags);
1170
1171
err = (*dst)->error;
1172
if (err)
1173
goto out_err_release;
1174
1175
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1176
/*
1177
* Here if the dst entry we've looked up
1178
* has a neighbour entry that is in the INCOMPLETE
1179
* state and the src address from the flow is
1180
* marked as OPTIMISTIC, we release the found
1181
* dst entry and replace it instead with the
1182
* dst entry of the nexthop router
1183
*/
1184
rt = dst_rt6_info(*dst);
1185
rcu_read_lock();
1186
n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1187
rt6_nexthop(rt, &fl6->daddr));
1188
err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1189
rcu_read_unlock();
1190
1191
if (err) {
1192
struct inet6_ifaddr *ifp;
1193
struct flowi6 fl_gw6;
1194
int redirect;
1195
1196
ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1197
(*dst)->dev, 1);
1198
1199
redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1200
if (ifp)
1201
in6_ifa_put(ifp);
1202
1203
if (redirect) {
1204
/*
1205
* We need to get the dst entry for the
1206
* default router instead
1207
*/
1208
dst_release(*dst);
1209
memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1210
memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1211
*dst = ip6_route_output(net, sk, &fl_gw6);
1212
err = (*dst)->error;
1213
if (err)
1214
goto out_err_release;
1215
}
1216
}
1217
#endif
1218
if (ipv6_addr_v4mapped(&fl6->saddr) &&
1219
!(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1220
err = -EAFNOSUPPORT;
1221
goto out_err_release;
1222
}
1223
1224
return 0;
1225
1226
out_err_release:
1227
dst_release(*dst);
1228
*dst = NULL;
1229
1230
if (err == -ENETUNREACH)
1231
IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1232
return err;
1233
}
1234
1235
/**
1236
* ip6_dst_lookup - perform route lookup on flow
1237
* @net: Network namespace to perform lookup in
1238
* @sk: socket which provides route info
1239
* @dst: pointer to dst_entry * for result
1240
* @fl6: flow to lookup
1241
*
1242
* This function performs a route lookup on the given flow.
1243
*
1244
* It returns zero on success, or a standard errno code on error.
1245
*/
1246
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1247
struct flowi6 *fl6)
1248
{
1249
*dst = NULL;
1250
return ip6_dst_lookup_tail(net, sk, dst, fl6);
1251
}
1252
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1253
1254
/**
1255
* ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1256
* @net: Network namespace to perform lookup in
1257
* @sk: socket which provides route info
1258
* @fl6: flow to lookup
1259
* @final_dst: final destination address for ipsec lookup
1260
*
1261
* This function performs a route lookup on the given flow.
1262
*
1263
* It returns a valid dst pointer on success, or a pointer encoded
1264
* error code.
1265
*/
1266
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1267
const struct in6_addr *final_dst)
1268
{
1269
struct dst_entry *dst = NULL;
1270
int err;
1271
1272
err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1273
if (err)
1274
return ERR_PTR(err);
1275
if (final_dst)
1276
fl6->daddr = *final_dst;
1277
1278
return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1279
}
1280
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
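/*
 * Usage sketch (illustrative): a typical caller fills in the flow and
 * checks the returned pointer with IS_ERR(), e.g.
 *
 *	fl6.flowi6_proto = IPPROTO_UDP;
 *	fl6.daddr = *daddr;
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *
 * The result has already been passed through xfrm_lookup_route(), so any
 * IPsec transformation is reflected in the returned dst.
 */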
1281
1282
/**
1283
* ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1284
* @sk: socket which provides the dst cache and route info
1285
* @fl6: flow to lookup
1286
* @final_dst: final destination address for ipsec lookup
1287
* @connected: whether @sk is connected or not
1288
*
1289
* This function performs a route lookup on the given flow with the
1290
* possibility of using the cached route in the socket if it is valid.
1291
* It will take the socket dst lock when operating on the dst cache.
1292
* As a result, this function can only be used in process context.
1293
*
1294
* In addition, for a connected socket, cache the dst in the socket
1295
* if the current cache is not valid.
1296
*
1297
* It returns a valid dst pointer on success, or a pointer encoded
1298
* error code.
1299
*/
1300
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1301
const struct in6_addr *final_dst,
1302
bool connected)
1303
{
1304
struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1305
1306
dst = ip6_sk_dst_check(sk, dst, fl6);
1307
if (dst)
1308
return dst;
1309
1310
dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1311
if (connected && !IS_ERR(dst))
1312
ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1313
1314
return dst;
1315
}
1316
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1317
1318
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1319
gfp_t gfp)
1320
{
1321
return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1322
}
1323
1324
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1325
gfp_t gfp)
1326
{
1327
return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1328
}
1329
1330
static void ip6_append_data_mtu(unsigned int *mtu,
1331
int *maxfraglen,
1332
unsigned int fragheaderlen,
1333
struct sk_buff *skb,
1334
struct rt6_info *rt,
1335
unsigned int orig_mtu)
1336
{
1337
if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1338
if (!skb) {
1339
/* first fragment, reserve header_len */
1340
*mtu = orig_mtu - rt->dst.header_len;
1341
1342
} else {
1343
/*
1344
* this fragment is not first, the headers
1345
* space is regarded as data space.
1346
*/
1347
*mtu = orig_mtu;
1348
}
1349
*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1350
+ fragheaderlen - sizeof(struct frag_hdr);
1351
}
1352
}
1353
1354
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1355
struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1356
struct rt6_info *rt)
1357
{
1358
struct ipv6_pinfo *np = inet6_sk(sk);
1359
unsigned int mtu, frag_size;
1360
struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1361
1362
/* callers pass dst together with a reference, set it first so
1363
* ip6_cork_release() can put it down even in case of an error.
1364
*/
1365
cork->base.dst = &rt->dst;
1366
1367
/*
1368
* setup for corking
1369
*/
1370
if (opt) {
1371
if (WARN_ON(v6_cork->opt))
1372
return -EINVAL;
1373
1374
nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1375
if (unlikely(!nopt))
1376
return -ENOBUFS;
1377
1378
nopt->tot_len = sizeof(*opt);
1379
nopt->opt_flen = opt->opt_flen;
1380
nopt->opt_nflen = opt->opt_nflen;
1381
1382
nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1383
if (opt->dst0opt && !nopt->dst0opt)
1384
return -ENOBUFS;
1385
1386
nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1387
if (opt->dst1opt && !nopt->dst1opt)
1388
return -ENOBUFS;
1389
1390
nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1391
if (opt->hopopt && !nopt->hopopt)
1392
return -ENOBUFS;
1393
1394
nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1395
if (opt->srcrt && !nopt->srcrt)
1396
return -ENOBUFS;
1397
1398
/* need source address above --miyazawa */
1399
}
1400
v6_cork->hop_limit = ipc6->hlimit;
1401
v6_cork->tclass = ipc6->tclass;
1402
v6_cork->dontfrag = ipc6->dontfrag;
1403
if (rt->dst.flags & DST_XFRM_TUNNEL)
1404
mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1405
READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1406
else
1407
mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1408
READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1409
1410
frag_size = READ_ONCE(np->frag_size);
1411
if (frag_size && frag_size < mtu)
1412
mtu = frag_size;
1413
1414
cork->base.fragsize = mtu;
1415
cork->base.gso_size = ipc6->gso_size;
1416
cork->base.tx_flags = 0;
1417
cork->base.mark = ipc6->sockc.mark;
1418
cork->base.priority = ipc6->sockc.priority;
1419
sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
1420
if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
1421
cork->base.flags |= IPCORK_TS_OPT_ID;
1422
cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
1423
}
1424
cork->base.length = 0;
1425
cork->base.transmit_time = ipc6->sockc.transmit_time;
1426
1427
return 0;
1428
}
1429
1430
static int __ip6_append_data(struct sock *sk,
1431
struct sk_buff_head *queue,
1432
struct inet_cork_full *cork_full,
1433
struct inet6_cork *v6_cork,
1434
struct page_frag *pfrag,
1435
int getfrag(void *from, char *to, int offset,
1436
int len, int odd, struct sk_buff *skb),
1437
void *from, size_t length, int transhdrlen,
1438
unsigned int flags)
1439
{
1440
struct sk_buff *skb, *skb_prev = NULL;
1441
struct inet_cork *cork = &cork_full->base;
1442
struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1443
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1444
struct ubuf_info *uarg = NULL;
1445
int exthdrlen = 0;
1446
int dst_exthdrlen = 0;
1447
int hh_len;
1448
int copy;
1449
int err;
1450
int offset = 0;
1451
bool zc = false;
1452
u32 tskey = 0;
1453
struct rt6_info *rt = dst_rt6_info(cork->dst);
1454
bool paged, hold_tskey = false, extra_uref = false;
1455
struct ipv6_txoptions *opt = v6_cork->opt;
1456
int csummode = CHECKSUM_NONE;
1457
unsigned int maxnonfragsize, headersize;
1458
unsigned int wmem_alloc_delta = 0;
1459
1460
skb = skb_peek_tail(queue);
1461
if (!skb) {
1462
exthdrlen = opt ? opt->opt_flen : 0;
1463
dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1464
}
1465
1466
paged = !!cork->gso_size;
1467
mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1468
orig_mtu = mtu;
1469
1470
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1471
1472
fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1473
(opt ? opt->opt_nflen : 0);
1474
1475
headersize = sizeof(struct ipv6hdr) +
1476
(opt ? opt->opt_flen + opt->opt_nflen : 0) +
1477
rt->rt6i_nfheader_len;
1478
1479
if (mtu <= fragheaderlen ||
1480
((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1481
goto emsgsize;
1482
1483
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1484
sizeof(struct frag_hdr);
1485
1486
/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1487
* the first fragment
1488
*/
1489
if (headersize + transhdrlen > mtu)
1490
goto emsgsize;
1491
1492
if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
1493
(sk->sk_protocol == IPPROTO_UDP ||
1494
sk->sk_protocol == IPPROTO_ICMPV6 ||
1495
sk->sk_protocol == IPPROTO_RAW)) {
1496
ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1497
sizeof(struct ipv6hdr));
1498
goto emsgsize;
1499
}
1500
1501
if (ip6_sk_ignore_df(sk))
1502
maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1503
else
1504
maxnonfragsize = mtu;
1505
1506
if (cork->length + length > maxnonfragsize - headersize) {
1507
emsgsize:
1508
pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1509
ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1510
return -EMSGSIZE;
1511
}
1512
1513
/* CHECKSUM_PARTIAL only with no extension headers and when
1514
* we are not going to fragment
1515
*/
1516
if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1517
headersize == sizeof(struct ipv6hdr) &&
1518
length <= mtu - headersize &&
1519
(!(flags & MSG_MORE) || cork->gso_size) &&
1520
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1521
csummode = CHECKSUM_PARTIAL;
1522
1523
if ((flags & MSG_ZEROCOPY) && length) {
1524
struct msghdr *msg = from;
1525
1526
if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1527
if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1528
return -EINVAL;
1529
1530
/* Leave uarg NULL if can't zerocopy, callers should
1531
* be able to handle it.
1532
*/
1533
if ((rt->dst.dev->features & NETIF_F_SG) &&
1534
csummode == CHECKSUM_PARTIAL) {
1535
paged = true;
1536
zc = true;
1537
uarg = msg->msg_ubuf;
1538
}
1539
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1540
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
1541
false);
1542
if (!uarg)
1543
return -ENOBUFS;
1544
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1545
if (rt->dst.dev->features & NETIF_F_SG &&
1546
csummode == CHECKSUM_PARTIAL) {
1547
paged = true;
1548
zc = true;
1549
} else {
1550
uarg_to_msgzc(uarg)->zerocopy = 0;
1551
skb_zcopy_set(skb, uarg, &extra_uref);
1552
}
1553
}
1554
} else if ((flags & MSG_SPLICE_PAGES) && length) {
1555
if (inet_test_bit(HDRINCL, sk))
1556
return -EPERM;
1557
if (rt->dst.dev->features & NETIF_F_SG &&
1558
getfrag == ip_generic_getfrag)
1559
/* We need an empty buffer to attach stuff to */
1560
paged = true;
1561
else
1562
flags &= ~MSG_SPLICE_PAGES;
1563
}
1564
1565
if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1566
READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
1567
if (cork->flags & IPCORK_TS_OPT_ID) {
1568
tskey = cork->ts_opt_id;
1569
} else {
1570
tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1571
hold_tskey = true;
1572
}
1573
}
1574
1575
/*
1576
* Let's try using as much space as possible.
1577
* Use MTU if total length of the message fits into the MTU.
1578
* Otherwise, we need to reserve fragment header and
1579
* fragment alignment (= 8-15 octets, in total).
1580
*
1581
* Note that we may need to "move" the data from the tail
1582
* of the buffer to the new fragment when we split
1583
* the message.
1584
*
1585
* FIXME: It may be fragmented into multiple chunks
1586
* at once if non-fragmentable extension headers
1587
* are too large.
1588
* --yoshfuji
1589
*/
1590
1591
cork->length += length;
1592
if (!skb)
1593
goto alloc_new_skb;
1594
1595
while (length > 0) {
1596
/* Check if the remaining data fits into current packet. */
1597
copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
1598
if (copy < length)
1599
copy = maxfraglen - skb->len;
1600
1601
if (copy <= 0) {
1602
char *data;
1603
unsigned int datalen;
1604
unsigned int fraglen;
1605
unsigned int fraggap;
1606
unsigned int alloclen, alloc_extra;
1607
unsigned int pagedlen;
1608
alloc_new_skb:
1609
/* There's no room in the current skb */
1610
if (skb)
1611
fraggap = skb->len - maxfraglen;
1612
else
1613
fraggap = 0;
1614
/* update mtu and maxfraglen if necessary */
1615
if (!skb || !skb_prev)
1616
ip6_append_data_mtu(&mtu, &maxfraglen,
1617
fragheaderlen, skb, rt,
1618
orig_mtu);
1619
1620
skb_prev = skb;
1621
1622
/*
1623
* If remaining data exceeds the mtu,
1624
* we know we need more fragment(s).
1625
*/
1626
datalen = length + fraggap;
1627
1628
if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
1629
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1630
fraglen = datalen + fragheaderlen;
1631
pagedlen = 0;
1632
1633
alloc_extra = hh_len;
1634
alloc_extra += dst_exthdrlen;
1635
alloc_extra += rt->dst.trailer_len;
1636
1637
/* We just reserve space for fragment header.
1638
* Note: this may be overallocation if the message
1639
* (without MSG_MORE) fits into the MTU.
1640
*/
1641
alloc_extra += sizeof(struct frag_hdr);
1642
1643
if ((flags & MSG_MORE) &&
1644
!(rt->dst.dev->features&NETIF_F_SG))
1645
alloclen = mtu;
1646
else if (!paged &&
1647
(fraglen + alloc_extra < SKB_MAX_ALLOC ||
1648
!(rt->dst.dev->features & NETIF_F_SG)))
1649
alloclen = fraglen;
1650
else {
1651
alloclen = fragheaderlen + transhdrlen;
1652
pagedlen = datalen - transhdrlen;
1653
}
1654
alloclen += alloc_extra;
1655
1656
if (datalen != length + fraggap) {
1657
/*
1658
* this is not the last fragment, the trailer
1659
* space is regarded as data space.
1660
*/
1661
datalen += rt->dst.trailer_len;
1662
}
1663
1664
fraglen = datalen + fragheaderlen;
1665
1666
copy = datalen - transhdrlen - fraggap - pagedlen;
1667
/* [!] NOTE: copy may be negative if pagedlen>0
1668
* because then the equation may reduce to -fraggap.
1669
*/
1670
if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1671
err = -EINVAL;
1672
goto error;
1673
}
1674
if (transhdrlen) {
1675
skb = sock_alloc_send_skb(sk, alloclen,
1676
(flags & MSG_DONTWAIT), &err);
1677
} else {
1678
skb = NULL;
1679
if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1680
2 * sk->sk_sndbuf)
1681
skb = alloc_skb(alloclen,
1682
sk->sk_allocation);
1683
if (unlikely(!skb))
1684
err = -ENOBUFS;
1685
}
1686
if (!skb)
1687
goto error;
1688
/*
1689
* Fill in the control structures
1690
*/
1691
skb->protocol = htons(ETH_P_IPV6);
1692
skb->ip_summed = csummode;
1693
skb->csum = 0;
1694
/* reserve for fragmentation and ipsec header */
1695
skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1696
dst_exthdrlen);
1697
1698
/*
1699
* Find where to start putting bytes
1700
*/
1701
data = skb_put(skb, fraglen - pagedlen);
1702
skb_set_network_header(skb, exthdrlen);
1703
data += fragheaderlen;
1704
skb->transport_header = (skb->network_header +
1705
fragheaderlen);
1706
if (fraggap) {
1707
skb->csum = skb_copy_and_csum_bits(
1708
skb_prev, maxfraglen,
1709
data + transhdrlen, fraggap);
1710
skb_prev->csum = csum_sub(skb_prev->csum,
1711
skb->csum);
1712
data += fraggap;
1713
pskb_trim_unique(skb_prev, maxfraglen);
1714
}
1715
if (copy > 0 &&
1716
INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1717
from, data + transhdrlen, offset,
1718
copy, fraggap, skb) < 0) {
1719
err = -EFAULT;
1720
kfree_skb(skb);
1721
goto error;
1722
} else if (flags & MSG_SPLICE_PAGES) {
1723
copy = 0;
1724
}
1725
1726
offset += copy;
1727
length -= copy + transhdrlen;
1728
transhdrlen = 0;
1729
exthdrlen = 0;
1730
dst_exthdrlen = 0;
1731
1732
/* Only the initial fragment is time stamped */
1733
skb_shinfo(skb)->tx_flags = cork->tx_flags;
1734
cork->tx_flags = 0;
1735
skb_shinfo(skb)->tskey = tskey;
1736
tskey = 0;
1737
skb_zcopy_set(skb, uarg, &extra_uref);
1738
1739
if ((flags & MSG_CONFIRM) && !skb_prev)
1740
skb_set_dst_pending_confirm(skb, 1);
1741
1742
/*
1743
* Put the packet on the pending queue
1744
*/
1745
if (!skb->destructor) {
1746
skb->destructor = sock_wfree;
1747
skb->sk = sk;
1748
wmem_alloc_delta += skb->truesize;
1749
}
1750
__skb_queue_tail(queue, skb);
1751
continue;
1752
}
1753
1754
if (copy > length)
1755
copy = length;
1756
1757
if (!(rt->dst.dev->features&NETIF_F_SG) &&
1758
skb_tailroom(skb) >= copy) {
1759
unsigned int off;
1760
1761
off = skb->len;
1762
if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1763
from, skb_put(skb, copy),
1764
offset, copy, off, skb) < 0) {
1765
__skb_trim(skb, off);
1766
err = -EFAULT;
1767
goto error;
1768
}
1769
} else if (flags & MSG_SPLICE_PAGES) {
1770
struct msghdr *msg = from;
1771
1772
err = -EIO;
1773
if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1774
goto error;
1775
1776
err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
1777
if (err < 0)
1778
goto error;
1779
copy = err;
1780
wmem_alloc_delta += copy;
1781
} else if (!zc) {
1782
int i = skb_shinfo(skb)->nr_frags;
1783
1784
err = -ENOMEM;
1785
if (!sk_page_frag_refill(sk, pfrag))
1786
goto error;
1787
1788
skb_zcopy_downgrade_managed(skb);
1789
if (!skb_can_coalesce(skb, i, pfrag->page,
1790
pfrag->offset)) {
1791
err = -EMSGSIZE;
1792
if (i == MAX_SKB_FRAGS)
1793
goto error;
1794
1795
__skb_fill_page_desc(skb, i, pfrag->page,
1796
pfrag->offset, 0);
1797
skb_shinfo(skb)->nr_frags = ++i;
1798
get_page(pfrag->page);
1799
}
1800
copy = min_t(int, copy, pfrag->size - pfrag->offset);
1801
if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1802
from,
1803
page_address(pfrag->page) + pfrag->offset,
1804
offset, copy, skb->len, skb) < 0)
1805
goto error_efault;
1806
1807
pfrag->offset += copy;
1808
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1809
skb->len += copy;
1810
skb->data_len += copy;
1811
skb->truesize += copy;
1812
wmem_alloc_delta += copy;
1813
} else {
1814
err = skb_zerocopy_iter_dgram(skb, from, copy);
1815
if (err < 0)
1816
goto error;
1817
}
1818
offset += copy;
1819
length -= copy;
1820
}
1821
1822
if (wmem_alloc_delta)
1823
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1824
return 0;
1825
1826
error_efault:
1827
err = -EFAULT;
1828
error:
1829
net_zcopy_put_abort(uarg, extra_uref);
1830
cork->length -= length;
1831
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1832
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1833
if (hold_tskey)
1834
atomic_dec(&sk->sk_tskey);
1835
return err;
1836
}
1837
1838
int ip6_append_data(struct sock *sk,
1839
int getfrag(void *from, char *to, int offset, int len,
1840
int odd, struct sk_buff *skb),
1841
void *from, size_t length, int transhdrlen,
1842
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1843
struct rt6_info *rt, unsigned int flags)
1844
{
1845
struct inet_sock *inet = inet_sk(sk);
1846
struct ipv6_pinfo *np = inet6_sk(sk);
1847
int exthdrlen;
1848
int err;
1849
1850
if (flags&MSG_PROBE)
1851
return 0;
1852
if (skb_queue_empty(&sk->sk_write_queue)) {
1853
/*
1854
* setup for corking
1855
*/
1856
dst_hold(&rt->dst);
1857
err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1858
ipc6, rt);
1859
if (err)
1860
return err;
1861
1862
inet->cork.fl.u.ip6 = *fl6;
1863
exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1864
length += exthdrlen;
1865
transhdrlen += exthdrlen;
1866
} else {
1867
transhdrlen = 0;
1868
}
1869
1870
return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1871
&np->cork, sk_page_frag(sk), getfrag,
1872
from, length, transhdrlen, flags);
1873
}
1874
EXPORT_SYMBOL_GPL(ip6_append_data);
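/*
 * Usage sketch (illustrative, approximate - exact error handling differs
 * per protocol): datagram senders cork data onto the socket write queue
 * and then flush it, roughly:
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */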
1875
1876
static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1877
{
1878
struct dst_entry *dst = cork->base.dst;
1879
1880
cork->base.dst = NULL;
1881
skb_dst_set(skb, dst);
1882
}
1883
1884
static void ip6_cork_release(struct inet_cork_full *cork,
1885
struct inet6_cork *v6_cork)
1886
{
1887
if (v6_cork->opt) {
1888
struct ipv6_txoptions *opt = v6_cork->opt;
1889
1890
kfree(opt->dst0opt);
1891
kfree(opt->dst1opt);
1892
kfree(opt->hopopt);
1893
kfree(opt->srcrt);
1894
kfree(opt);
1895
v6_cork->opt = NULL;
1896
}
1897
1898
if (cork->base.dst) {
1899
dst_release(cork->base.dst);
1900
cork->base.dst = NULL;
1901
}
1902
}
1903
1904
struct sk_buff *__ip6_make_skb(struct sock *sk,
1905
struct sk_buff_head *queue,
1906
struct inet_cork_full *cork,
1907
struct inet6_cork *v6_cork)
1908
{
1909
struct sk_buff *skb, *tmp_skb;
1910
struct sk_buff **tail_skb;
1911
struct in6_addr *final_dst;
1912
struct net *net = sock_net(sk);
1913
struct ipv6hdr *hdr;
1914
struct ipv6_txoptions *opt = v6_cork->opt;
1915
struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1916
struct flowi6 *fl6 = &cork->fl.u.ip6;
1917
unsigned char proto = fl6->flowi6_proto;
1918
1919
skb = __skb_dequeue(queue);
1920
if (!skb)
1921
goto out;
1922
tail_skb = &(skb_shinfo(skb)->frag_list);
1923
1924
/* move skb->data to ip header from ext header */
1925
if (skb->data < skb_network_header(skb))
1926
__skb_pull(skb, skb_network_offset(skb));
1927
while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1928
__skb_pull(tmp_skb, skb_network_header_len(skb));
1929
*tail_skb = tmp_skb;
1930
tail_skb = &(tmp_skb->next);
1931
skb->len += tmp_skb->len;
1932
skb->data_len += tmp_skb->len;
1933
skb->truesize += tmp_skb->truesize;
1934
tmp_skb->destructor = NULL;
1935
tmp_skb->sk = NULL;
1936
}
1937
1938
/* Allow local fragmentation. */
1939
skb->ignore_df = ip6_sk_ignore_df(sk);
1940
__skb_pull(skb, skb_network_header_len(skb));
1941
1942
final_dst = &fl6->daddr;
1943
if (opt && opt->opt_flen)
1944
ipv6_push_frag_opts(skb, opt, &proto);
1945
if (opt && opt->opt_nflen)
1946
ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1947
1948
skb_push(skb, sizeof(struct ipv6hdr));
1949
skb_reset_network_header(skb);
1950
hdr = ipv6_hdr(skb);
1951
1952
ip6_flow_hdr(hdr, v6_cork->tclass,
1953
ip6_make_flowlabel(net, skb, fl6->flowlabel,
1954
ip6_autoflowlabel(net, sk), fl6));
1955
hdr->hop_limit = v6_cork->hop_limit;
1956
hdr->nexthdr = proto;
1957
hdr->saddr = fl6->saddr;
1958
hdr->daddr = *final_dst;
1959
1960
skb->priority = cork->base.priority;
1961
skb->mark = cork->base.mark;
1962
if (sk_is_tcp(sk))
1963
skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
1964
else
1965
skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);
1966
1967
ip6_cork_steal_dst(skb, cork);
1968
IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1969
if (proto == IPPROTO_ICMPV6) {
1970
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1971
u8 icmp6_type;
1972
1973
if (sk->sk_socket->type == SOCK_RAW &&
1974
!(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1975
icmp6_type = fl6->fl6_icmp_type;
1976
else
1977
icmp6_type = icmp6_hdr(skb)->icmp6_type;
1978
ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1979
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1980
}
1981
1982
ip6_cork_release(cork, v6_cork);
1983
out:
1984
return skb;
1985
}
1986
1987
int ip6_send_skb(struct sk_buff *skb)
1988
{
1989
struct net *net = sock_net(skb->sk);
1990
struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1991
int err;
1992
1993
rcu_read_lock();
1994
err = ip6_local_out(net, skb->sk, skb);
1995
if (err) {
1996
if (err > 0)
1997
err = net_xmit_errno(err);
1998
if (err)
1999
IP6_INC_STATS(net, rt->rt6i_idev,
2000
IPSTATS_MIB_OUTDISCARDS);
2001
}
2002
2003
rcu_read_unlock();
2004
return err;
2005
}
2006
2007
int ip6_push_pending_frames(struct sock *sk)
2008
{
2009
struct sk_buff *skb;
2010
2011
skb = ip6_finish_skb(sk);
2012
if (!skb)
2013
return 0;
2014
2015
return ip6_send_skb(skb);
2016
}
2017
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2018
2019
static void __ip6_flush_pending_frames(struct sock *sk,
2020
struct sk_buff_head *queue,
2021
struct inet_cork_full *cork,
2022
struct inet6_cork *v6_cork)
2023
{
2024
struct sk_buff *skb;
2025
2026
while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2027
if (skb_dst(skb))
2028
IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2029
IPSTATS_MIB_OUTDISCARDS);
2030
kfree_skb(skb);
2031
}
2032
2033
ip6_cork_release(cork, v6_cork);
2034
}
2035
2036
void ip6_flush_pending_frames(struct sock *sk)
2037
{
2038
__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2039
&inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2040
}
2041
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2042
2043
struct sk_buff *ip6_make_skb(struct sock *sk,
2044
int getfrag(void *from, char *to, int offset,
2045
int len, int odd, struct sk_buff *skb),
2046
void *from, size_t length, int transhdrlen,
2047
struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2048
unsigned int flags, struct inet_cork_full *cork)
2049
{
2050
struct inet6_cork v6_cork;
2051
struct sk_buff_head queue;
2052
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2053
int err;
2054
2055
if (flags & MSG_PROBE) {
2056
dst_release(&rt->dst);
2057
return NULL;
2058
}
2059
2060
__skb_queue_head_init(&queue);
2061
2062
cork->base.flags = 0;
2063
cork->base.addr = 0;
2064
cork->base.opt = NULL;
2065
v6_cork.opt = NULL;
2066
err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2067
if (err) {
2068
ip6_cork_release(cork, &v6_cork);
2069
return ERR_PTR(err);
2070
}
2071
2072
err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2073
&current->task_frag, getfrag, from,
2074
length + exthdrlen, transhdrlen + exthdrlen,
2075
flags);
2076
if (err) {
2077
__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2078
return ERR_PTR(err);
2079
}
2080
2081
return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2082
}
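/*
 * Note: ip6_make_skb() is the uncorked, single-shot variant of the
 * ip6_append_data()/ip6_push_pending_frames() pair: it sets up a private
 * cork and queue, appends the whole datagram in one call and returns the
 * assembled skb (NULL for MSG_PROBE, ERR_PTR on failure), leaving
 * transmission to the caller.
 */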
2083
2084