1 /* $OpenBSD: ip_input.c,v 1.122.2.1 2005/06/14 01:49:24 brad Exp $ */
2 /* $NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94
33 */
34
35 #include "pf.h"
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/mbuf.h>
40 #include <sys/domain.h>
41 #include <sys/protosw.h>
42 #include <sys/socket.h>
43 #include <sys/syslog.h>
44 #include <sys/sysctl.h>
45
46 #include <net/if.h>
47 #include <net/if_dl.h>
48 #include <net/route.h>
49
50 #include <netinet/in.h>
51 #include <netinet/in_systm.h>
52 #include <netinet/if_ether.h>
53 #include <netinet/ip.h>
54 #include <netinet/in_pcb.h>
55 #include <netinet/in_var.h>
56 #include <netinet/ip_var.h>
57 #include <netinet/ip_icmp.h>
58
59 #if NPF > 0
60 #include <net/pfvar.h>
61 #endif
62
63 #ifdef IPSEC
64 #include <netinet/ip_ipsp.h>
65 #endif /* IPSEC */
66
/*
 * Compile-time defaults for the variables defined further down.
 * Each one is only supplied when the build has not already defined it.
 */
#if !defined(IPFORWARDING)
#if defined(GATEWAY)
#define	IPFORWARDING	1	/* forward IP packets not for us */
#else
#define	IPFORWARDING	0	/* don't forward IP packets not for us */
#endif
#endif

#if !defined(IPSENDREDIRECTS)
#define	IPSENDREDIRECTS	1
#endif

#if !defined(IPMTUDISC)
#define	IPMTUDISC	1
#endif

#if !defined(IPMTUDISCTIMEOUT)
#define	IPMTUDISCTIMEOUT	(10 * 60)	/* as per RFC 1191 */
#endif
84
85 struct ipqhead ipq;
86
87 int encdebug = 0;
88 int ipsec_keep_invalid = IPSEC_DEFAULT_EMBRYONIC_SA_TIMEOUT;
89 int ipsec_require_pfs = IPSEC_DEFAULT_PFS;
90 int ipsec_soft_allocations = IPSEC_DEFAULT_SOFT_ALLOCATIONS;
91 int ipsec_exp_allocations = IPSEC_DEFAULT_EXP_ALLOCATIONS;
92 int ipsec_soft_bytes = IPSEC_DEFAULT_SOFT_BYTES;
93 int ipsec_exp_bytes = IPSEC_DEFAULT_EXP_BYTES;
94 int ipsec_soft_timeout = IPSEC_DEFAULT_SOFT_TIMEOUT;
95 int ipsec_exp_timeout = IPSEC_DEFAULT_EXP_TIMEOUT;
96 int ipsec_soft_first_use = IPSEC_DEFAULT_SOFT_FIRST_USE;
97 int ipsec_exp_first_use = IPSEC_DEFAULT_EXP_FIRST_USE;
98 int ipsec_expire_acquire = IPSEC_DEFAULT_EXPIRE_ACQUIRE;
99 char ipsec_def_enc[20];
100 char ipsec_def_auth[20];
101 char ipsec_def_comp[20];
102
103 /*
104 * Note: DIRECTED_BROADCAST is handled this way so that previous
105 * configuration using this option will Just Work.
106 */
107 #ifndef IPDIRECTEDBCAST
108 #ifdef DIRECTED_BROADCAST
109 #define IPDIRECTEDBCAST 1
110 #else
111 #define IPDIRECTEDBCAST 0
112 #endif /* DIRECTED_BROADCAST */
113 #endif /* IPDIRECTEDBCAST */
114 int ipforwarding = IPFORWARDING;
115 int ipsendredirects = IPSENDREDIRECTS;
116 int ip_dosourceroute = 0; /* no src-routing unless sysctl'd to enable */
117 int ip_defttl = IPDEFTTL;
118 int ip_mtudisc = IPMTUDISC;
119 u_int ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
120 int ip_directedbcast = IPDIRECTEDBCAST;
121 #ifdef DIAGNOSTIC
122 int ipprintfs = 0;
123 #endif
124
125 struct rttimer_queue *ip_mtudisc_timeout_q = NULL;
126
127 int ipsec_auth_default_level = IPSEC_AUTH_LEVEL_DEFAULT;
128 int ipsec_esp_trans_default_level = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
129 int ipsec_esp_network_default_level = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
130 int ipsec_ipcomp_default_level = IPSEC_IPCOMP_LEVEL_DEFAULT;
131
132 /* Keep track of memory used for reassembly */
133 int ip_maxqueue = 300;
134 int ip_frags = 0;
135
136 /* from in_pcb.c */
137 extern int ipport_firstauto;
138 extern int ipport_lastauto;
139 extern int ipport_hifirstauto;
140 extern int ipport_hilastauto;
141 extern struct baddynamicports baddynamicports;
142
143 int *ipctl_vars[IPCTL_MAXID] = IPCTL_VARS;
144
145 extern struct domain inetdomain;
146 extern struct protosw inetsw[];
147 u_char ip_protox[IPPROTO_MAX];
148 int ipqmaxlen = IFQ_MAXLEN;
149 struct in_ifaddrhead in_ifaddr;
150 struct ifqueue ipintrq;
151
152 int ipq_locked;
153 static __inline int ipq_lock_try(void);
154 static __inline void ipq_unlock(void);
155
156 struct pool ipqent_pool;
157
158 struct ipstat ipstat;
159
160 static __inline int
ipq_lock_try()161 ipq_lock_try()
162 {
163 int s;
164
165 s = splimp();
166 if (ipq_locked) {
167 splx(s);
168 return (0);
169 }
170 ipq_locked = 1;
171 splx(s);
172 return (1);
173 }
174
175 #define ipq_lock() ipq_lock_try()
176
177 static __inline void
ipq_unlock()178 ipq_unlock()
179 {
180 int s;
181
182 s = splimp();
183 ipq_locked = 0;
184 splx(s);
185 }
186
/*
 * Format an IPv4 address as dotted-decimal text.
 * The four bytes of `ina' are printed in memory order.  The result lives
 * in a single static buffer, so each call overwrites the previous result.
 */
char *
inet_ntoa(ina)
	struct in_addr ina;
{
	static char buf[4*sizeof "123"];
	const unsigned char *octet = (const unsigned char *)&ina;
	int a, b, c, d;

	a = octet[0] & 0xff;
	b = octet[1] & 0xff;
	c = octet[2] & 0xff;
	d = octet[3] & 0xff;
	snprintf(buf, sizeof(buf), "%d.%d.%d.%d", a, b, c, d);
	return (buf);
}
199
200 /*
201 * We need to save the IP options in case a protocol wants to respond
202 * to an incoming packet over the same route if the packet got here
203 * using IP source routing. This allows connection establishment and
204 * maintenance when the remote end is on a network that is not known
205 * to us.
206 */
207 int ip_nhops = 0;
208 static struct ip_srcrt {
209 struct in_addr dst; /* final destination */
210 char nop; /* one NOP to align */
211 char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */
212 struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
213 } ip_srcrt;
214
215 static void save_rte(u_char *, struct in_addr);
216 static int ip_weadvertise(u_int32_t);
217
218 /*
219 * IP initialization: fill in IP protocol switch table.
220 * All protocols not implemented in kernel go to raw IP protocol handler.
221 */
222 void
ip_init()223 ip_init()
224 {
225 struct protosw *pr;
226 int i;
227 const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP;
228 const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP;
229
230 pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
231 NULL);
232
233 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
234 if (pr == 0)
235 panic("ip_init");
236 for (i = 0; i < IPPROTO_MAX; i++)
237 ip_protox[i] = pr - inetsw;
238 for (pr = inetdomain.dom_protosw;
239 pr < inetdomain.dom_protoswNPROTOSW; pr++)
240 if (pr->pr_domain->dom_family == PF_INET &&
241 pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
242 ip_protox[pr->pr_protocol] = pr - inetsw;
243 LIST_INIT(&ipq);
244 ipintrq.ifq_maxlen = ipqmaxlen;
245 TAILQ_INIT(&in_ifaddr);
246 if (ip_mtudisc != 0)
247 ip_mtudisc_timeout_q =
248 rt_timer_queue_create(ip_mtudisc_timeout);
249
250 /* Fill in list of ports not to allocate dynamically. */
251 bzero((void *)&baddynamicports, sizeof(baddynamicports));
252 for (i = 0; defbaddynamicports_tcp[i] != 0; i++)
253 DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]);
254 for (i = 0; defbaddynamicports_udp[i] != 0; i++)
255 DP_SET(baddynamicports.udp, defbaddynamicports_udp[i]);
256
257 strlcpy(ipsec_def_enc, IPSEC_DEFAULT_DEF_ENC, sizeof(ipsec_def_enc));
258 strlcpy(ipsec_def_auth, IPSEC_DEFAULT_DEF_AUTH, sizeof(ipsec_def_auth));
259 strlcpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp));
260 }
261
262 struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
263 struct route ipforward_rt;
264
265 void
ipintr()266 ipintr()
267 {
268 struct mbuf *m;
269 int s;
270
271 while (1) {
272 /*
273 * Get next datagram off input queue and get IP header
274 * in first mbuf.
275 */
276 s = splimp();
277 IF_DEQUEUE(&ipintrq, m);
278 splx(s);
279 if (m == 0)
280 return;
281 #ifdef DIAGNOSTIC
282 if ((m->m_flags & M_PKTHDR) == 0)
283 panic("ipintr no HDR");
284 #endif
285 ipv4_input(m);
286 }
287 }
288
289 /*
290 * Ip input routine. Checksum and byte swap header. If fragmented
291 * try to reassemble. Process options. Pass to next level.
292 */
293 void
ipv4_input(m)294 ipv4_input(m)
295 struct mbuf *m;
296 {
297 struct ip *ip;
298 struct ipq *fp;
299 struct in_ifaddr *ia;
300 struct ipqent *ipqe;
301 int hlen, mff, len;
302 in_addr_t pfrdr = 0;
303 #ifdef IPSEC
304 int error, s;
305 struct tdb *tdb;
306 struct tdb_ident *tdbi;
307 struct m_tag *mtag;
308 #endif /* IPSEC */
309
310 /*
311 * If no IP addresses have been set yet but the interfaces
312 * are receiving, can't do anything with incoming packets yet.
313 */
314 if (in_ifaddr.tqh_first == 0)
315 goto bad;
316 ipstat.ips_total++;
317 if (m->m_len < sizeof (struct ip) &&
318 (m = m_pullup(m, sizeof (struct ip))) == NULL) {
319 ipstat.ips_toosmall++;
320 return;
321 }
322 ip = mtod(m, struct ip *);
323 if (ip->ip_v != IPVERSION) {
324 ipstat.ips_badvers++;
325 goto bad;
326 }
327 hlen = ip->ip_hl << 2;
328 if (hlen < sizeof(struct ip)) { /* minimum header length */
329 ipstat.ips_badhlen++;
330 goto bad;
331 }
332 if (hlen > m->m_len) {
333 if ((m = m_pullup(m, hlen)) == NULL) {
334 ipstat.ips_badhlen++;
335 return;
336 }
337 ip = mtod(m, struct ip *);
338 }
339
340 /* 127/8 must not appear on wire - RFC1122 */
341 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
342 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
343 if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) {
344 ipstat.ips_badaddr++;
345 goto bad;
346 }
347 }
348
349 if ((m->m_pkthdr.csum & M_IPV4_CSUM_IN_OK) == 0) {
350 if (m->m_pkthdr.csum & M_IPV4_CSUM_IN_BAD) {
351 ipstat.ips_inhwcsum++;
352 ipstat.ips_badsum++;
353 goto bad;
354 }
355
356 if (in_cksum(m, hlen) != 0) {
357 ipstat.ips_badsum++;
358 goto bad;
359 }
360 } else {
361 m->m_pkthdr.csum &= ~M_IPV4_CSUM_IN_OK;
362 ipstat.ips_inhwcsum++;
363 }
364
365 /* Retrieve the packet length. */
366 len = ntohs(ip->ip_len);
367
368 /*
369 * Convert fields to host representation.
370 */
371 if (len < hlen) {
372 ipstat.ips_badlen++;
373 goto bad;
374 }
375
376 /*
377 * Check that the amount of data in the buffers
378 * is at least as much as the IP header would have us expect.
379 * Trim mbufs if longer than we expect.
380 * Drop packet if shorter than we expect.
381 */
382 if (m->m_pkthdr.len < len) {
383 ipstat.ips_tooshort++;
384 goto bad;
385 }
386 if (m->m_pkthdr.len > len) {
387 if (m->m_len == m->m_pkthdr.len) {
388 m->m_len = len;
389 m->m_pkthdr.len = len;
390 } else
391 m_adj(m, len - m->m_pkthdr.len);
392 }
393
394 #if NPF > 0
395 /*
396 * Packet filter
397 */
398 pfrdr = ip->ip_dst.s_addr;
399 if (pf_test(PF_IN, m->m_pkthdr.rcvif, &m) != PF_PASS)
400 goto bad;
401 if (m == NULL)
402 return;
403
404 ip = mtod(m, struct ip *);
405 hlen = ip->ip_hl << 2;
406 pfrdr = (pfrdr != ip->ip_dst.s_addr);
407 #endif
408
409 /*
410 * Process options and, if not destined for us,
411 * ship it on. ip_dooptions returns 1 when an
412 * error was detected (causing an icmp message
413 * to be sent and the original packet to be freed).
414 */
415 ip_nhops = 0; /* for source routed packets */
416 if (hlen > sizeof (struct ip) && ip_dooptions(m)) {
417 return;
418 }
419
420 /*
421 * Check our list of addresses, to see if the packet is for us.
422 */
423 if ((ia = in_iawithaddr(ip->ip_dst, m)) != NULL &&
424 (ia->ia_ifp->if_flags & IFF_UP))
425 goto ours;
426
427 if (IN_MULTICAST(ip->ip_dst.s_addr)) {
428 struct in_multi *inm;
429 #ifdef MROUTING
430 extern struct socket *ip_mrouter;
431
432 if (m->m_flags & M_EXT) {
433 if ((m = m_pullup(m, hlen)) == NULL) {
434 ipstat.ips_toosmall++;
435 return;
436 }
437 ip = mtod(m, struct ip *);
438 }
439
440 if (ip_mrouter) {
441 /*
442 * If we are acting as a multicast router, all
443 * incoming multicast packets are passed to the
444 * kernel-level multicast forwarding function.
445 * The packet is returned (relatively) intact; if
446 * ip_mforward() returns a non-zero value, the packet
447 * must be discarded, else it may be accepted below.
448 *
449 * (The IP ident field is put in the same byte order
450 * as expected when ip_mforward() is called from
451 * ip_output().)
452 */
453 if (ip_mforward(m, m->m_pkthdr.rcvif) != 0) {
454 ipstat.ips_cantforward++;
455 m_freem(m);
456 return;
457 }
458
459 /*
460 * The process-level routing demon needs to receive
461 * all multicast IGMP packets, whether or not this
462 * host belongs to their destination groups.
463 */
464 if (ip->ip_p == IPPROTO_IGMP)
465 goto ours;
466 ipstat.ips_forward++;
467 }
468 #endif
469 /*
470 * See if we belong to the destination multicast group on the
471 * arrival interface.
472 */
473 IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
474 if (inm == NULL) {
475 ipstat.ips_cantforward++;
476 m_freem(m);
477 return;
478 }
479 goto ours;
480 }
481 if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
482 ip->ip_dst.s_addr == INADDR_ANY)
483 goto ours;
484
485 /*
486 * Not for us; forward if possible and desirable.
487 */
488 if (ipforwarding == 0) {
489 ipstat.ips_cantforward++;
490 m_freem(m);
491 } else {
492 #ifdef IPSEC
493 /*
494 * IPsec policy check for forwarded packets. Look at
495 * inner-most IPsec SA used.
496 */
497 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
498 s = splnet();
499 if (mtag != NULL) {
500 tdbi = (struct tdb_ident *)(mtag + 1);
501 tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
502 } else
503 tdb = NULL;
504 ipsp_spd_lookup(m, AF_INET, hlen, &error,
505 IPSP_DIRECTION_IN, tdb, NULL);
506 splx(s);
507
508 /* Error or otherwise drop-packet indication */
509 if (error) {
510 ipstat.ips_cantforward++;
511 m_freem(m);
512 return;
513 }
514
515 /*
516 * Fall through, forward packet. Outbound IPsec policy
517 * checking will occur in ip_output().
518 */
519 #endif /* IPSEC */
520
521 ip_forward(m, pfrdr);
522 }
523 return;
524
525 ours:
526 /*
527 * If offset or IP_MF are set, must reassemble.
528 * Otherwise, nothing need be done.
529 * (We could look in the reassembly queue to see
530 * if the packet was previously fragmented,
531 * but it's not worth the time; just let them time out.)
532 */
533 if (ip->ip_off &~ htons(IP_DF | IP_RF)) {
534 if (m->m_flags & M_EXT) { /* XXX */
535 if ((m = m_pullup(m, hlen)) == NULL) {
536 ipstat.ips_toosmall++;
537 return;
538 }
539 ip = mtod(m, struct ip *);
540 }
541
542 /*
543 * Look for queue of fragments
544 * of this datagram.
545 */
546 ipq_lock();
547 for (fp = ipq.lh_first; fp != NULL; fp = fp->ipq_q.le_next)
548 if (ip->ip_id == fp->ipq_id &&
549 ip->ip_src.s_addr == fp->ipq_src.s_addr &&
550 ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
551 ip->ip_p == fp->ipq_p)
552 goto found;
553 fp = 0;
554 found:
555
556 /*
557 * Adjust ip_len to not reflect header,
558 * set ipqe_mff if more fragments are expected,
559 * convert offset of this to bytes.
560 */
561 ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
562 mff = (ip->ip_off & htons(IP_MF)) != 0;
563 if (mff) {
564 /*
565 * Make sure that fragments have a data length
566 * that's a non-zero multiple of 8 bytes.
567 */
568 if (ntohs(ip->ip_len) == 0 ||
569 (ntohs(ip->ip_len) & 0x7) != 0) {
570 ipstat.ips_badfrags++;
571 ipq_unlock();
572 goto bad;
573 }
574 }
575 ip->ip_off = htons(ntohs(ip->ip_off) << 3);
576
577 /*
578 * If datagram marked as having more fragments
579 * or if this is not the first fragment,
580 * attempt reassembly; if it succeeds, proceed.
581 */
582 if (mff || ip->ip_off) {
583 ipstat.ips_fragments++;
584 if (ip_frags + 1 > ip_maxqueue) {
585 ip_flush();
586 ipstat.ips_rcvmemdrop++;
587 ipq_unlock();
588 goto bad;
589 }
590
591 ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
592 if (ipqe == NULL) {
593 ipstat.ips_rcvmemdrop++;
594 ipq_unlock();
595 goto bad;
596 }
597 ip_frags++;
598 ipqe->ipqe_mff = mff;
599 ipqe->ipqe_m = m;
600 ipqe->ipqe_ip = ip;
601 m = ip_reass(ipqe, fp);
602 if (m == 0) {
603 ipq_unlock();
604 return;
605 }
606 ipstat.ips_reassembled++;
607 ip = mtod(m, struct ip *);
608 hlen = ip->ip_hl << 2;
609 ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
610 } else
611 if (fp)
612 ip_freef(fp);
613 ipq_unlock();
614 }
615
616 #ifdef IPSEC
617 /*
618 * If it's a protected packet for us, skip the policy check.
619 * That's because we really only care about the properties of
620 * the protected packet, and not the intermediate versions.
621 * While this is not the most paranoid setting, it allows
622 * some flexibility in handling nested tunnels (in setting up
623 * the policies).
624 */
625 if ((ip->ip_p == IPPROTO_ESP) || (ip->ip_p == IPPROTO_AH) ||
626 (ip->ip_p == IPPROTO_IPCOMP))
627 goto skipipsec;
628
629 /*
630 * If the protected packet was tunneled, then we need to
631 * verify the protected packet's information, not the
632 * external headers. Thus, skip the policy lookup for the
633 * external packet, and keep the IPsec information linked on
634 * the packet header (the encapsulation routines know how
635 * to deal with that).
636 */
637 if ((ip->ip_p == IPPROTO_IPIP) || (ip->ip_p == IPPROTO_IPV6))
638 goto skipipsec;
639
640 /*
641 * If the protected packet is TCP or UDP, we'll do the
642 * policy check in the respective input routine, so we can
643 * check for bypass sockets.
644 */
645 if ((ip->ip_p == IPPROTO_TCP) || (ip->ip_p == IPPROTO_UDP))
646 goto skipipsec;
647
648 /*
649 * IPsec policy check for local-delivery packets. Look at the
650 * inner-most SA that protected the packet. This is in fact
651 * a bit too restrictive (it could end up causing packets to
652 * be dropped that semantically follow the policy, e.g., in
653 * certain SA-bundle configurations); but the alternative is
654 * very complicated (and requires keeping track of what
655 * kinds of tunneling headers have been seen in-between the
656 * IPsec headers), and I don't think we lose much functionality
657 * that's needed in the real world (who uses bundles anyway ?).
658 */
659 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
660 s = splnet();
661 if (mtag) {
662 tdbi = (struct tdb_ident *)(mtag + 1);
663 tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
664 } else
665 tdb = NULL;
666 ipsp_spd_lookup(m, AF_INET, hlen, &error, IPSP_DIRECTION_IN,
667 tdb, NULL);
668 splx(s);
669
670 /* Error or otherwise drop-packet indication. */
671 if (error) {
672 ipstat.ips_cantforward++;
673 m_freem(m);
674 return;
675 }
676
677 skipipsec:
678 /* Otherwise, just fall through and deliver the packet */
679 #endif /* IPSEC */
680
681 /*
682 * Switch out to protocol's input routine.
683 */
684 ipstat.ips_delivered++;
685 (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen, NULL, 0);
686 return;
687 bad:
688 m_freem(m);
689 }
690
691 struct in_ifaddr *
in_iawithaddr(ina,m)692 in_iawithaddr(ina, m)
693 struct in_addr ina;
694 struct mbuf *m;
695 {
696 struct in_ifaddr *ia;
697
698 TAILQ_FOREACH(ia, &in_ifaddr, ia_list) {
699 if ((ina.s_addr == ia->ia_addr.sin_addr.s_addr) ||
700 ((ia->ia_ifp->if_flags & (IFF_LOOPBACK|IFF_LINK1)) ==
701 (IFF_LOOPBACK|IFF_LINK1) &&
702 ia->ia_subnet == (ina.s_addr & ia->ia_subnetmask)))
703 return ia;
704 if (((ip_directedbcast == 0) || (m && ip_directedbcast &&
705 ia->ia_ifp == m->m_pkthdr.rcvif)) &&
706 (ia->ia_ifp->if_flags & IFF_BROADCAST)) {
707 if (ina.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
708 ina.s_addr == ia->ia_netbroadcast.s_addr ||
709 /*
710 * Look for all-0's host part (old broadcast addr),
711 * either for subnet or net.
712 */
713 ina.s_addr == ia->ia_subnet ||
714 ina.s_addr == ia->ia_net) {
715 /* Make sure M_BCAST is set */
716 if (m)
717 m->m_flags |= M_BCAST;
718 return ia;
719 }
720 }
721 }
722
723 return NULL;
724 }
725
726 /*
727 * Take incoming datagram fragment and try to
728 * reassemble it into whole datagram. If a chain for
729 * reassembly of this datagram already exists, then it
730 * is given as fp; otherwise have to make a chain.
731 */
732 struct mbuf *
ip_reass(ipqe,fp)733 ip_reass(ipqe, fp)
734 struct ipqent *ipqe;
735 struct ipq *fp;
736 {
737 struct mbuf *m = ipqe->ipqe_m;
738 struct ipqent *nq, *p, *q;
739 struct ip *ip;
740 struct mbuf *t;
741 int hlen = ipqe->ipqe_ip->ip_hl << 2;
742 int i, next;
743 u_int8_t ecn, ecn0;
744
745 /*
746 * Presence of header sizes in mbufs
747 * would confuse code below.
748 */
749 m->m_data += hlen;
750 m->m_len -= hlen;
751
752 /*
753 * If first fragment to arrive, create a reassembly queue.
754 */
755 if (fp == 0) {
756 MALLOC(fp, struct ipq *, sizeof (struct ipq),
757 M_FTABLE, M_NOWAIT);
758 if (fp == NULL)
759 goto dropfrag;
760 LIST_INSERT_HEAD(&ipq, fp, ipq_q);
761 fp->ipq_ttl = IPFRAGTTL;
762 fp->ipq_p = ipqe->ipqe_ip->ip_p;
763 fp->ipq_id = ipqe->ipqe_ip->ip_id;
764 LIST_INIT(&fp->ipq_fragq);
765 fp->ipq_src = ipqe->ipqe_ip->ip_src;
766 fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
767 p = NULL;
768 goto insert;
769 }
770
771 /*
772 * Handle ECN by comparing this segment with the first one;
773 * if CE is set, do not lose CE.
774 * drop if CE and not-ECT are mixed for the same packet.
775 */
776 ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
777 ecn0 = fp->ipq_fragq.lh_first->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
778 if (ecn == IPTOS_ECN_CE) {
779 if (ecn0 == IPTOS_ECN_NOTECT)
780 goto dropfrag;
781 if (ecn0 != IPTOS_ECN_CE)
782 fp->ipq_fragq.lh_first->ipqe_ip->ip_tos |= IPTOS_ECN_CE;
783 }
784 if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
785 goto dropfrag;
786
787 /*
788 * Find a segment which begins after this one does.
789 */
790 for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL;
791 p = q, q = q->ipqe_q.le_next)
792 if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
793 break;
794
795 /*
796 * If there is a preceding segment, it may provide some of
797 * our data already. If so, drop the data from the incoming
798 * segment. If it provides all of our data, drop us.
799 */
800 if (p != NULL) {
801 i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
802 ntohs(ipqe->ipqe_ip->ip_off);
803 if (i > 0) {
804 if (i >= ntohs(ipqe->ipqe_ip->ip_len))
805 goto dropfrag;
806 m_adj(ipqe->ipqe_m, i);
807 ipqe->ipqe_ip->ip_off =
808 htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
809 ipqe->ipqe_ip->ip_len =
810 htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
811 }
812 }
813
814 /*
815 * While we overlap succeeding segments trim them or,
816 * if they are completely covered, dequeue them.
817 */
818 for (; q != NULL &&
819 ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
820 ntohs(q->ipqe_ip->ip_off); q = nq) {
821 i = (ntohs(ipqe->ipqe_ip->ip_off) +
822 ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
823 if (i < ntohs(q->ipqe_ip->ip_len)) {
824 q->ipqe_ip->ip_len =
825 htons(ntohs(q->ipqe_ip->ip_len) - i);
826 q->ipqe_ip->ip_off =
827 htons(ntohs(q->ipqe_ip->ip_off) + i);
828 m_adj(q->ipqe_m, i);
829 break;
830 }
831 nq = q->ipqe_q.le_next;
832 m_freem(q->ipqe_m);
833 LIST_REMOVE(q, ipqe_q);
834 pool_put(&ipqent_pool, q);
835 ip_frags--;
836 }
837
838 insert:
839 /*
840 * Stick new segment in its place;
841 * check for complete reassembly.
842 */
843 if (p == NULL) {
844 LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
845 } else {
846 LIST_INSERT_AFTER(p, ipqe, ipqe_q);
847 }
848 next = 0;
849 for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL;
850 p = q, q = q->ipqe_q.le_next) {
851 if (ntohs(q->ipqe_ip->ip_off) != next)
852 return (0);
853 next += ntohs(q->ipqe_ip->ip_len);
854 }
855 if (p->ipqe_mff)
856 return (0);
857
858 /*
859 * Reassembly is complete. Check for a bogus message size and
860 * concatenate fragments.
861 */
862 q = fp->ipq_fragq.lh_first;
863 ip = q->ipqe_ip;
864 if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
865 ipstat.ips_toolong++;
866 ip_freef(fp);
867 return (0);
868 }
869 m = q->ipqe_m;
870 t = m->m_next;
871 m->m_next = 0;
872 m_cat(m, t);
873 nq = q->ipqe_q.le_next;
874 pool_put(&ipqent_pool, q);
875 ip_frags--;
876 for (q = nq; q != NULL; q = nq) {
877 t = q->ipqe_m;
878 nq = q->ipqe_q.le_next;
879 pool_put(&ipqent_pool, q);
880 ip_frags--;
881 m_cat(m, t);
882 }
883
884 /*
885 * Create header for new ip packet by
886 * modifying header of first packet;
887 * dequeue and discard fragment reassembly header.
888 * Make header visible.
889 */
890 ip->ip_len = htons(next);
891 ip->ip_src = fp->ipq_src;
892 ip->ip_dst = fp->ipq_dst;
893 LIST_REMOVE(fp, ipq_q);
894 FREE(fp, M_FTABLE);
895 m->m_len += (ip->ip_hl << 2);
896 m->m_data -= (ip->ip_hl << 2);
897 /* some debugging cruft by sklower, below, will go away soon */
898 if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
899 int plen = 0;
900 for (t = m; t; t = t->m_next)
901 plen += t->m_len;
902 m->m_pkthdr.len = plen;
903 }
904 return (m);
905
906 dropfrag:
907 ipstat.ips_fragdropped++;
908 m_freem(m);
909 pool_put(&ipqent_pool, ipqe);
910 ip_frags--;
911 return (0);
912 }
913
914 /*
915 * Free a fragment reassembly header and all
916 * associated datagrams.
917 */
918 void
ip_freef(fp)919 ip_freef(fp)
920 struct ipq *fp;
921 {
922 struct ipqent *q, *p;
923
924 for (q = fp->ipq_fragq.lh_first; q != NULL; q = p) {
925 p = q->ipqe_q.le_next;
926 m_freem(q->ipqe_m);
927 LIST_REMOVE(q, ipqe_q);
928 pool_put(&ipqent_pool, q);
929 ip_frags--;
930 }
931 LIST_REMOVE(fp, ipq_q);
932 FREE(fp, M_FTABLE);
933 }
934
935 /*
936 * IP timer processing;
937 * if a timer expires on a reassembly
938 * queue, discard it.
939 */
940 void
ip_slowtimo()941 ip_slowtimo()
942 {
943 struct ipq *fp, *nfp;
944 int s = splsoftnet();
945
946 ipq_lock();
947 for (fp = ipq.lh_first; fp != NULL; fp = nfp) {
948 nfp = fp->ipq_q.le_next;
949 if (--fp->ipq_ttl == 0) {
950 ipstat.ips_fragtimeout++;
951 ip_freef(fp);
952 }
953 }
954 ipq_unlock();
955 splx(s);
956 }
957
958 /*
959 * Drain off all datagram fragments.
960 */
961 void
ip_drain()962 ip_drain()
963 {
964
965 if (ipq_lock_try() == 0)
966 return;
967 while (ipq.lh_first != NULL) {
968 ipstat.ips_fragdropped++;
969 ip_freef(ipq.lh_first);
970 }
971 ipq_unlock();
972 }
973
974 /*
975 * Flush a bunch of datagram fragments, till we are down to 75%.
976 */
977 void
ip_flush()978 ip_flush()
979 {
980 int max = 50;
981
982 /* ipq already locked */
983 while (ipq.lh_first != NULL && ip_frags > ip_maxqueue * 3 / 4 && --max) {
984 ipstat.ips_fragdropped++;
985 ip_freef(ipq.lh_first);
986 }
987 }
988
989 /*
990 * Do option processing on a datagram,
991 * possibly discarding it if bad options are encountered,
992 * or forwarding it if source-routed.
993 * Returns 1 if packet has been forwarded/freed,
994 * 0 if the packet should be processed further.
995 */
996 int
ip_dooptions(m)997 ip_dooptions(m)
998 struct mbuf *m;
999 {
1000 struct ip *ip = mtod(m, struct ip *);
1001 u_char *cp;
1002 struct ip_timestamp ipt;
1003 struct in_ifaddr *ia;
1004 int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
1005 struct in_addr sin, dst;
1006 n_time ntime;
1007
1008 dst = ip->ip_dst;
1009 cp = (u_char *)(ip + 1);
1010 cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1011
1012 for (; cnt > 0; cnt -= optlen, cp += optlen) {
1013 opt = cp[IPOPT_OPTVAL];
1014 if (opt == IPOPT_EOL)
1015 break;
1016 if (opt == IPOPT_NOP)
1017 optlen = 1;
1018 else {
1019 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
1020 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1021 goto bad;
1022 }
1023 optlen = cp[IPOPT_OLEN];
1024 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
1025 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1026 goto bad;
1027 }
1028 }
1029
1030 switch (opt) {
1031
1032 default:
1033 break;
1034
1035 /*
1036 * Source routing with record.
1037 * Find interface with current destination address.
1038 * If none on this machine then drop if strictly routed,
1039 * or do nothing if loosely routed.
1040 * Record interface address and bring up next address
1041 * component. If strictly routed make sure next
1042 * address is on directly accessible net.
1043 */
1044 case IPOPT_LSRR:
1045 case IPOPT_SSRR:
1046 if (!ip_dosourceroute) {
1047 char buf[4*sizeof "123"];
1048
1049 strlcpy(buf, inet_ntoa(ip->ip_dst),
1050 sizeof buf);
1051 log(LOG_WARNING,
1052 "attempted source route from %s to %s\n",
1053 inet_ntoa(ip->ip_src), buf);
1054 type = ICMP_UNREACH;
1055 code = ICMP_UNREACH_SRCFAIL;
1056 goto bad;
1057 }
1058 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1059 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1060 goto bad;
1061 }
1062 ipaddr.sin_addr = ip->ip_dst;
1063 ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)));
1064 if (ia == 0) {
1065 if (opt == IPOPT_SSRR) {
1066 type = ICMP_UNREACH;
1067 code = ICMP_UNREACH_SRCFAIL;
1068 goto bad;
1069 }
1070 /*
1071 * Loose routing, and not at next destination
1072 * yet; nothing to do except forward.
1073 */
1074 break;
1075 }
1076 off--; /* 0 origin */
1077 if ((off + sizeof(struct in_addr)) > optlen) {
1078 /*
1079 * End of source route. Should be for us.
1080 */
1081 save_rte(cp, ip->ip_src);
1082 break;
1083 }
1084
1085 /*
1086 * locate outgoing interface
1087 */
1088 bcopy((caddr_t)(cp + off), (caddr_t)&ipaddr.sin_addr,
1089 sizeof(ipaddr.sin_addr));
1090 if (opt == IPOPT_SSRR) {
1091 #define INA struct in_ifaddr *
1092 #define SA struct sockaddr *
1093 if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0)
1094 ia = (INA)ifa_ifwithnet((SA)&ipaddr);
1095 } else
1096 ia = ip_rtaddr(ipaddr.sin_addr);
1097 if (ia == 0) {
1098 type = ICMP_UNREACH;
1099 code = ICMP_UNREACH_SRCFAIL;
1100 goto bad;
1101 }
1102 ip->ip_dst = ipaddr.sin_addr;
1103 bcopy((caddr_t)&ia->ia_addr.sin_addr,
1104 (caddr_t)(cp + off), sizeof(struct in_addr));
1105 cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1106 /*
1107 * Let ip_intr's mcast routing check handle mcast pkts
1108 */
1109 forward = !IN_MULTICAST(ip->ip_dst.s_addr);
1110 break;
1111
1112 case IPOPT_RR:
1113 if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1114 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1115 goto bad;
1116 }
1117 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1118 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1119 goto bad;
1120 }
1121
1122 /*
1123 * If no space remains, ignore.
1124 */
1125 off--; /* 0 origin */
1126 if ((off + sizeof(struct in_addr)) > optlen)
1127 break;
1128 bcopy((caddr_t)(&ip->ip_dst), (caddr_t)&ipaddr.sin_addr,
1129 sizeof(ipaddr.sin_addr));
1130 /*
1131 * locate outgoing interface; if we're the destination,
1132 * use the incoming interface (should be same).
1133 */
1134 if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 &&
1135 (ia = ip_rtaddr(ipaddr.sin_addr)) == 0) {
1136 type = ICMP_UNREACH;
1137 code = ICMP_UNREACH_HOST;
1138 goto bad;
1139 }
1140 bcopy((caddr_t)&ia->ia_addr.sin_addr,
1141 (caddr_t)(cp + off), sizeof(struct in_addr));
1142 cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1143 break;
1144
1145 case IPOPT_TS:
1146 code = cp - (u_char *)ip;
1147 if (optlen < sizeof(struct ip_timestamp))
1148 goto bad;
1149 bcopy(cp, &ipt, sizeof(struct ip_timestamp));
1150 if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5)
1151 goto bad;
1152 if (ipt.ipt_ptr - 1 + sizeof(n_time) > ipt.ipt_len) {
1153 if (++ipt.ipt_oflw == 0)
1154 goto bad;
1155 break;
1156 }
1157 bcopy(cp + ipt.ipt_ptr - 1, &sin, sizeof sin);
1158 switch (ipt.ipt_flg) {
1159
1160 case IPOPT_TS_TSONLY:
1161 break;
1162
1163 case IPOPT_TS_TSANDADDR:
1164 if (ipt.ipt_ptr - 1 + sizeof(n_time) +
1165 sizeof(struct in_addr) > ipt.ipt_len)
1166 goto bad;
1167 ipaddr.sin_addr = dst;
1168 ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
1169 m->m_pkthdr.rcvif);
1170 if (ia == 0)
1171 continue;
1172 bcopy((caddr_t)&ia->ia_addr.sin_addr,
1173 (caddr_t)&sin, sizeof(struct in_addr));
1174 ipt.ipt_ptr += sizeof(struct in_addr);
1175 break;
1176
1177 case IPOPT_TS_PRESPEC:
1178 if (ipt.ipt_ptr - 1 + sizeof(n_time) +
1179 sizeof(struct in_addr) > ipt.ipt_len)
1180 goto bad;
1181 bcopy((caddr_t)&sin, (caddr_t)&ipaddr.sin_addr,
1182 sizeof(struct in_addr));
1183 if (ifa_ifwithaddr((SA)&ipaddr) == 0)
1184 continue;
1185 ipt.ipt_ptr += sizeof(struct in_addr);
1186 break;
1187
1188 default:
1189 /* XXX can't take &ipt->ipt_flg */
1190 code = (u_char *)&ipt.ipt_ptr -
1191 (u_char *)ip + 1;
1192 goto bad;
1193 }
1194 ntime = iptime();
1195 bcopy((caddr_t)&ntime, (caddr_t)cp + ipt.ipt_ptr - 1,
1196 sizeof(n_time));
1197 ipt.ipt_ptr += sizeof(n_time);
1198 }
1199 }
1200 if (forward && ipforwarding) {
1201 ip_forward(m, 1);
1202 return (1);
1203 }
1204 return (0);
1205 bad:
1206 icmp_error(m, type, code, 0, 0);
1207 ipstat.ips_badoptions++;
1208 return (1);
1209 }
1210
1211 /*
1212 * Given address of next destination (final or next hop),
1213 * return internet address info of interface to be used to get there.
1214 */
1215 struct in_ifaddr *
ip_rtaddr(dst)1216 ip_rtaddr(dst)
1217 struct in_addr dst;
1218 {
1219 struct sockaddr_in *sin;
1220
1221 sin = satosin(&ipforward_rt.ro_dst);
1222
1223 if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr) {
1224 if (ipforward_rt.ro_rt) {
1225 RTFREE(ipforward_rt.ro_rt);
1226 ipforward_rt.ro_rt = 0;
1227 }
1228 sin->sin_family = AF_INET;
1229 sin->sin_len = sizeof(*sin);
1230 sin->sin_addr = dst;
1231
1232 rtalloc(&ipforward_rt);
1233 }
1234 if (ipforward_rt.ro_rt == 0)
1235 return ((struct in_ifaddr *)0);
1236 return (ifatoia(ipforward_rt.ro_rt->rt_ifa));
1237 }
1238
1239 /*
1240 * Save incoming source route for use in replies,
1241 * to be picked up later by ip_srcroute if the receiver is interested.
1242 */
1243 void
save_rte(option,dst)1244 save_rte(option, dst)
1245 u_char *option;
1246 struct in_addr dst;
1247 {
1248 unsigned olen;
1249
1250 olen = option[IPOPT_OLEN];
1251 #ifdef DIAGNOSTIC
1252 if (ipprintfs)
1253 printf("save_rte: olen %d\n", olen);
1254 #endif /* 0 */
1255 if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst)))
1256 return;
1257 bcopy((caddr_t)option, (caddr_t)ip_srcrt.srcopt, olen);
1258 ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1259 ip_srcrt.dst = dst;
1260 }
1261
1262 /*
1263 * Check whether we do proxy ARP for this address and we point to ourselves.
1264 * Code shamelessly copied from arplookup().
1265 */
1266 static int
ip_weadvertise(addr)1267 ip_weadvertise(addr)
1268 u_int32_t addr;
1269 {
1270 struct rtentry *rt;
1271 struct ifnet *ifp;
1272 struct ifaddr *ifa;
1273 struct sockaddr_inarp sin;
1274
1275 sin.sin_len = sizeof(sin);
1276 sin.sin_family = AF_INET;
1277 sin.sin_addr.s_addr = addr;
1278 sin.sin_other = SIN_PROXY;
1279 rt = rtalloc1(sintosa(&sin), 0);
1280 if (rt == 0)
1281 return 0;
1282
1283 if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 ||
1284 rt->rt_gateway->sa_family != AF_LINK) {
1285 RTFREE(rt);
1286 return 0;
1287 }
1288
1289 for (ifp = ifnet.tqh_first; ifp != 0; ifp = ifp->if_list.tqe_next)
1290 for (ifa = ifp->if_addrlist.tqh_first; ifa != 0;
1291 ifa = ifa->ifa_list.tqe_next) {
1292 if (ifa->ifa_addr->sa_family != rt->rt_gateway->sa_family)
1293 continue;
1294
1295 if (!bcmp(LLADDR((struct sockaddr_dl *)ifa->ifa_addr),
1296 LLADDR((struct sockaddr_dl *)rt->rt_gateway),
1297 ETHER_ADDR_LEN)) {
1298 RTFREE(rt);
1299 return 1;
1300 }
1301 }
1302
1303 RTFREE(rt);
1304 return 0;
1305 }
1306
1307 /*
1308 * Retrieve incoming source route for use in replies,
1309 * in the same form used by setsockopt.
1310 * The first hop is placed before the options, will be removed later.
1311 */
1312 struct mbuf *
ip_srcroute()1313 ip_srcroute()
1314 {
1315 struct in_addr *p, *q;
1316 struct mbuf *m;
1317
1318 if (ip_nhops == 0)
1319 return ((struct mbuf *)0);
1320 m = m_get(M_DONTWAIT, MT_SOOPTS);
1321 if (m == 0)
1322 return ((struct mbuf *)0);
1323
1324 #define OPTSIZ (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt))
1325
1326 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
1327 m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) +
1328 OPTSIZ;
1329 #ifdef DIAGNOSTIC
1330 if (ipprintfs)
1331 printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
1332 #endif
1333
1334 /*
1335 * First save first hop for return route
1336 */
1337 p = &ip_srcrt.route[ip_nhops - 1];
1338 *(mtod(m, struct in_addr *)) = *p--;
1339 #ifdef DIAGNOSTIC
1340 if (ipprintfs)
1341 printf(" hops %x", ntohl(mtod(m, struct in_addr *)->s_addr));
1342 #endif
1343
1344 /*
1345 * Copy option fields and padding (nop) to mbuf.
1346 */
1347 ip_srcrt.nop = IPOPT_NOP;
1348 ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
1349 bcopy((caddr_t)&ip_srcrt.nop,
1350 mtod(m, caddr_t) + sizeof(struct in_addr), OPTSIZ);
1351 q = (struct in_addr *)(mtod(m, caddr_t) +
1352 sizeof(struct in_addr) + OPTSIZ);
1353 #undef OPTSIZ
1354 /*
1355 * Record return path as an IP source route,
1356 * reversing the path (pointers are now aligned).
1357 */
1358 while (p >= ip_srcrt.route) {
1359 #ifdef DIAGNOSTIC
1360 if (ipprintfs)
1361 printf(" %x", ntohl(q->s_addr));
1362 #endif
1363 *q++ = *p--;
1364 }
1365 /*
1366 * Last hop goes to final destination.
1367 */
1368 *q = ip_srcrt.dst;
1369 #ifdef DIAGNOSTIC
1370 if (ipprintfs)
1371 printf(" %x\n", ntohl(q->s_addr));
1372 #endif
1373 return (m);
1374 }
1375
1376 /*
1377 * Strip out IP options, at higher
1378 * level protocol in the kernel.
1379 * Second argument is buffer to which options
1380 * will be moved, and return value is their length.
1381 * XXX should be deleted; last arg currently ignored.
1382 */
1383 void
ip_stripoptions(m,mopt)1384 ip_stripoptions(m, mopt)
1385 struct mbuf *m;
1386 struct mbuf *mopt;
1387 {
1388 int i;
1389 struct ip *ip = mtod(m, struct ip *);
1390 caddr_t opts;
1391 int olen;
1392
1393 olen = (ip->ip_hl<<2) - sizeof (struct ip);
1394 opts = (caddr_t)(ip + 1);
1395 i = m->m_len - (sizeof (struct ip) + olen);
1396 bcopy(opts + olen, opts, (unsigned)i);
1397 m->m_len -= olen;
1398 if (m->m_flags & M_PKTHDR)
1399 m->m_pkthdr.len -= olen;
1400 ip->ip_hl = sizeof(struct ip) >> 2;
1401 }
1402
1403 int inetctlerrmap[PRC_NCMDS] = {
1404 0, 0, 0, 0,
1405 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
1406 EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
1407 EMSGSIZE, EHOSTUNREACH, 0, 0,
1408 0, 0, 0, 0,
1409 ENOPROTOOPT
1410 };
1411
1412 /*
1413 * Forward a packet. If some error occurs return the sender
1414 * an icmp packet. Note we can't always generate a meaningful
1415 * icmp message because icmp doesn't have a large enough repertoire
1416 * of codes and types.
1417 *
1418 * If not forwarding, just drop the packet. This could be confusing
1419 * if ipforwarding was zero but some routing protocol was advancing
1420 * us as a gateway to somewhere. However, we must let the routing
1421 * protocol deal with that.
1422 *
1423 * The srcrt parameter indicates whether the packet is being forwarded
1424 * via a source route.
1425 */
1426 void
ip_forward(m,srcrt)1427 ip_forward(m, srcrt)
1428 struct mbuf *m;
1429 int srcrt;
1430 {
1431 struct ip *ip = mtod(m, struct ip *);
1432 struct sockaddr_in *sin;
1433 struct rtentry *rt;
1434 int error, type = 0, code = 0;
1435 struct mbuf *mcopy;
1436 n_long dest;
1437 struct ifnet *destifp;
1438 #ifdef IPSEC
1439 struct ifnet dummyifp;
1440 #endif
1441
1442 dest = 0;
1443 #ifdef DIAGNOSTIC
1444 if (ipprintfs)
1445 printf("forward: src %x dst %x ttl %x\n", ip->ip_src.s_addr,
1446 ip->ip_dst.s_addr, ip->ip_ttl);
1447 #endif
1448 if (m->m_flags & M_BCAST || in_canforward(ip->ip_dst) == 0) {
1449 ipstat.ips_cantforward++;
1450 m_freem(m);
1451 return;
1452 }
1453 if (ip->ip_ttl <= IPTTLDEC) {
1454 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
1455 return;
1456 }
1457
1458 sin = satosin(&ipforward_rt.ro_dst);
1459 if ((rt = ipforward_rt.ro_rt) == 0 ||
1460 ip->ip_dst.s_addr != sin->sin_addr.s_addr) {
1461 if (ipforward_rt.ro_rt) {
1462 RTFREE(ipforward_rt.ro_rt);
1463 ipforward_rt.ro_rt = 0;
1464 }
1465 sin->sin_family = AF_INET;
1466 sin->sin_len = sizeof(*sin);
1467 sin->sin_addr = ip->ip_dst;
1468
1469 rtalloc(&ipforward_rt);
1470 if (ipforward_rt.ro_rt == 0) {
1471 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NET, dest, 0);
1472 return;
1473 }
1474 rt = ipforward_rt.ro_rt;
1475 }
1476
1477 /*
1478 * Save at most 68 bytes of the packet in case
1479 * we need to generate an ICMP message to the src.
1480 * Pullup to avoid sharing mbuf cluster between m and mcopy.
1481 */
1482 mcopy = m_copym(m, 0, imin(ntohs(ip->ip_len), 68), M_DONTWAIT);
1483 if (mcopy)
1484 mcopy = m_pullup(mcopy, ip->ip_hl << 2);
1485
1486 ip->ip_ttl -= IPTTLDEC;
1487
1488 /*
1489 * If forwarding packet using same interface that it came in on,
1490 * perhaps should send a redirect to sender to shortcut a hop.
1491 * Only send redirect if source is sending directly to us,
1492 * and if packet was not source routed (or has any options).
1493 * Also, don't send redirect if forwarding using a default route
1494 * or a route modified by a redirect.
1495 * Don't send redirect if we advertise destination's arp address
1496 * as ours (proxy arp).
1497 */
1498 if (rt->rt_ifp == m->m_pkthdr.rcvif &&
1499 (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
1500 satosin(rt_key(rt))->sin_addr.s_addr != 0 &&
1501 ipsendredirects && !srcrt &&
1502 !ip_weadvertise(satosin(rt_key(rt))->sin_addr.s_addr)) {
1503 if (rt->rt_ifa &&
1504 (ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_subnetmask) ==
1505 ifatoia(rt->rt_ifa)->ia_subnet) {
1506 if (rt->rt_flags & RTF_GATEWAY)
1507 dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
1508 else
1509 dest = ip->ip_dst.s_addr;
1510 /* Router requirements says to only send host redirects */
1511 type = ICMP_REDIRECT;
1512 code = ICMP_REDIRECT_HOST;
1513 #ifdef DIAGNOSTIC
1514 if (ipprintfs)
1515 printf("redirect (%d) to %x\n", code, (u_int32_t)dest);
1516 #endif
1517 }
1518 }
1519
1520 error = ip_output(m, (struct mbuf *)0, &ipforward_rt,
1521 (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)),
1522 0, (void *)NULL, (void *)NULL);
1523 if (error)
1524 ipstat.ips_cantforward++;
1525 else {
1526 ipstat.ips_forward++;
1527 if (type)
1528 ipstat.ips_redirectsent++;
1529 else {
1530 if (mcopy)
1531 m_freem(mcopy);
1532 return;
1533 }
1534 }
1535 if (mcopy == NULL)
1536 return;
1537 destifp = NULL;
1538
1539 switch (error) {
1540
1541 case 0: /* forwarded, but need redirect */
1542 /* type, code set above */
1543 break;
1544
1545 case ENETUNREACH: /* shouldn't happen, checked above */
1546 case EHOSTUNREACH:
1547 case ENETDOWN:
1548 case EHOSTDOWN:
1549 default:
1550 type = ICMP_UNREACH;
1551 code = ICMP_UNREACH_HOST;
1552 break;
1553
1554 case EMSGSIZE:
1555 type = ICMP_UNREACH;
1556 code = ICMP_UNREACH_NEEDFRAG;
1557
1558 #ifdef IPSEC
1559 if (ipforward_rt.ro_rt) {
1560 struct rtentry *rt = ipforward_rt.ro_rt;
1561 destifp = ipforward_rt.ro_rt->rt_ifp;
1562 /*
1563 * XXX BUG ALERT
1564 * The "dummyifp" code relies upon the fact
1565 * that icmp_error() touches only ifp->if_mtu.
1566 */
1567 if (rt->rt_rmx.rmx_mtu) {
1568 dummyifp.if_mtu = rt->rt_rmx.rmx_mtu;
1569 destifp = &dummyifp;
1570 }
1571 }
1572 #endif /*IPSEC*/
1573 ipstat.ips_cantfrag++;
1574 break;
1575
1576 case ENOBUFS:
1577 #if 1
1578 /*
1579 * a router should not generate ICMP_SOURCEQUENCH as
1580 * required in RFC1812 Requirements for IP Version 4 Routers.
1581 * source quench could be a big problem under DoS attacks,
1582 * or the underlying interface is rate-limited.
1583 */
1584 if (mcopy)
1585 m_freem(mcopy);
1586 return;
1587 #else
1588 type = ICMP_SOURCEQUENCH;
1589 code = 0;
1590 break;
1591 #endif
1592 }
1593
1594 icmp_error(mcopy, type, code, dest, destifp);
1595 }
1596
1597 int
ip_sysctl(name,namelen,oldp,oldlenp,newp,newlen)1598 ip_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
1599 int *name;
1600 u_int namelen;
1601 void *oldp;
1602 size_t *oldlenp;
1603 void *newp;
1604 size_t newlen;
1605 {
1606 int error;
1607
1608 /* All sysctl names at this level are terminal. */
1609 if (namelen != 1)
1610 return (ENOTDIR);
1611
1612 switch (name[0]) {
1613 #ifdef notyet
1614 case IPCTL_DEFMTU:
1615 return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtu));
1616 #endif
1617 case IPCTL_SOURCEROUTE:
1618 /*
1619 * Don't allow this to change in a secure environment.
1620 */
1621 if (newp && securelevel > 0)
1622 return (EPERM);
1623 return (sysctl_int(oldp, oldlenp, newp, newlen,
1624 &ip_dosourceroute));
1625 case IPCTL_MTUDISC:
1626 error = sysctl_int(oldp, oldlenp, newp, newlen,
1627 &ip_mtudisc);
1628 if (ip_mtudisc != 0 && ip_mtudisc_timeout_q == NULL) {
1629 ip_mtudisc_timeout_q =
1630 rt_timer_queue_create(ip_mtudisc_timeout);
1631 } else if (ip_mtudisc == 0 && ip_mtudisc_timeout_q != NULL) {
1632 rt_timer_queue_destroy(ip_mtudisc_timeout_q, TRUE);
1633 Free(ip_mtudisc_timeout_q);
1634 ip_mtudisc_timeout_q = NULL;
1635 }
1636 return error;
1637 case IPCTL_MTUDISCTIMEOUT:
1638 error = sysctl_int(oldp, oldlenp, newp, newlen,
1639 &ip_mtudisc_timeout);
1640 if (ip_mtudisc_timeout_q != NULL)
1641 rt_timer_queue_change(ip_mtudisc_timeout_q,
1642 ip_mtudisc_timeout);
1643 return (error);
1644 case IPCTL_IPSEC_ENC_ALGORITHM:
1645 return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1646 ipsec_def_enc, sizeof(ipsec_def_enc)));
1647 case IPCTL_IPSEC_AUTH_ALGORITHM:
1648 return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1649 ipsec_def_auth,
1650 sizeof(ipsec_def_auth)));
1651 case IPCTL_IPSEC_IPCOMP_ALGORITHM:
1652 return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1653 ipsec_def_comp,
1654 sizeof(ipsec_def_comp)));
1655 default:
1656 if (name[0] < IPCTL_MAXID)
1657 return (sysctl_int_arr(ipctl_vars, name, namelen,
1658 oldp, oldlenp, newp, newlen));
1659 return (EOPNOTSUPP);
1660 }
1661 /* NOTREACHED */
1662 }
1663