diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 518ebe6..4c805b3 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -2028,6 +2028,16 @@
 Expression of retrans_time, which is deprecated, is in 1/100 seconds (for
 IPv4) or in jiffies (for IPv6).
 Expression of retrans_time_ms is in milliseconds.
+
+retrans_rand_backoff_ms
+-----------------------
+
+This is an extra delay (ms) for the retransmit timer.  A random value between
+0 and retrans_rand_backoff_ms will be added to retrans_time.  The default
+is zero.  Setting this to a larger value helps large broadcast domains
+resolve ARP (for instance, 500 mac-vlans talking to 500 other mac-vlans).
+
+
 unres_qlen
 ----------
 
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 17a6e46..f128106 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -866,6 +866,18 @@ conf/all/forwarding - BOOLEAN
 proxy_ndp - BOOLEAN
 	Do proxy ndp.
 
+nlnotify_on_addr_add - BOOLEAN
+	By default, netlink messages are not sent when an IPv6 address
+	is added if it is in tentative state.  This makes it harder
+	for some user-space applications to function properly.  To
+	ensure that a netlink message is always sent when an IPv6 addr
+	is added, regardless of the state of the address, set this value
+	to 1.  For the old (default) behaviour, set this value to 0.
+
+	If only certain interfaces should have this behaviour, leave the
+	'all' config set to 0 and set the individual interface's value
+	to 1.
+
 conf/interface/*:
 	Change special settings per interface.
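As a rough sketch (not part of this patch) of where retrans_rand_backoff_ms would
take effect: the value, converted to jiffies and stored in the new
neigh_parms->retrans_rand_backoff field (added further down in
include/net/neighbour.h), can be mixed into the delay each time the retransmit
timer is re-armed.  The helper name below is hypothetical; the real arming code
lives in net/core/neighbour.c and is not shown in this section.

    #include <linux/net.h>		/* net_random() */
    #include <net/neighbour.h>

    /* Hypothetical helper: base retransmit delay plus 0..retrans_rand_backoff
     * jiffies of random jitter, so hosts in a large broadcast domain do not
     * all re-ARP in lock step. */
    static unsigned long neigh_retrans_delay(const struct neigh_parms *p)
    {
    	unsigned long delay = p->retrans_time;

    	if (p->retrans_rand_backoff)
    		delay += net_random() % p->retrans_rand_backoff;
    	return delay;
    }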
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 01e4f8b..ba21346 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -143,6 +143,7 @@ unsigned long long sched_clock(void)
 unsigned long long sched_clock(void)
 	__attribute__((alias("native_sched_clock")));
 #endif
+EXPORT_SYMBOL(sched_clock);
 
 unsigned long native_calculate_cpu_khz(void)
 {
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 5153afc..c05f544 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -62,6 +62,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 		sched_clock_idle_wakeup_event(0);
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL(sched_clock);
 
 unsigned long long native_sched_clock(void)
 {
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 2d139ec..af4a026 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/**************************************************************
 
   Intel PRO/100 Linux driver
 
   Copyright(c) 1999 - 2006 Intel Corporation.
@@ -394,6 +394,7 @@ enum cb_command {
 	cb_ucode  = 0x0005,
 	cb_dump   = 0x0006,
 	cb_tx_sf  = 0x0008,
+	cb_tx_nc  = 0x0010, /* 0 == controller does CRC (i.e. normal),
+			       1 == CRC from memory */
 	cb_cid    = 0x1f00,
 	cb_i      = 0x2000,
 	cb_s      = 0x4000,
@@ -430,7 +431,7 @@ struct config {
/*5*/	u8 X(tx_dma_max_count:7, dma_max_count_enable:1);
/*6*/	u8 X(X(X(X(X(X(X(late_scb_update:1, direct_rx_dma:1),
	   tno_intr:1), cna_intr:1), standard_tcb:1), standard_stat_counter:1),
-	   rx_discard_overruns:1), rx_save_bad_frames:1);
+	   rx_save_overruns:1), rx_save_bad_frames:1);
/*7*/	u8 X(X(X(X(X(rx_discard_short_frames:1, tx_underrun_retry:2),
	   pad7:2), rx_extended_rfd:1), tx_two_frames_in_fifo:1),
	   tx_dynamic_tbd:1);
@@ -561,6 +562,8 @@ struct nic {
 		multicast_all		= (1 << 2),
 		wol_magic		= (1 << 3),
 		ich_10h_workaround	= (1 << 4),
+		accept_all_frames	= (1 << 5),
+		save_fcs		= (1 << 6),
 	} flags					____cacheline_aligned;
 
 	enum mac mac;
@@ -1017,6 +1020,16 @@ static void e100_configure(struct nic *nic, struct cb *cb, struct sk_buff *skb)
 		config->promiscuous_mode = 0x1;		/* 1=on, 0=off */
 	}
 
+	if(nic->flags & accept_all_frames) {
+		config->rx_save_overruns = 0x1; /* 1=save, 0=discard */
+		config->rx_save_bad_frames = 0x1; /* 1=save, 0=discard */
+		config->rx_discard_short_frames = 0x0; /* 1=discard, 0=save */
+	}
+
+	if(nic->flags & save_fcs) {
+		config->rx_crc_transfer = 0x1; /* 1=save, 0=discard */
+	}
+
 	if(nic->flags & multicast_all)
 		config->multicast_all = 0x1;		/* 1=accept, 0=no */
@@ -1477,6 +1490,16 @@ static void e100_set_multicast_list(struct net_device *netdev)
 	else
 		nic->flags &= ~promiscuous;
 
+	if(netdev->flags & IFF_ACCEPT_ALL_FRAMES)
+		nic->flags |= accept_all_frames;
+	else
+		nic->flags &= ~accept_all_frames;
+
+	if(netdev->flags & IFF_SAVE_FCS)
+		nic->flags |= save_fcs;
+	else
+		nic->flags &= ~save_fcs;
+
 	if(netdev->flags & IFF_ALLMULTI ||
 		netdev->mc_count > E100_MAX_MULTICAST_ADDRS)
 		nic->flags |= multicast_all;
@@ -1620,6 +1643,19 @@ static void e100_xmit_prepare(struct nic *nic, struct cb *cb,
 	struct sk_buff *skb)
 {
 	cb->command = nic->tx_command;
+
+#ifdef CONFIG_SUPPORT_SEND_BAD_CRC
+	/* Use the last 4 bytes of the SKB payload packet as the CRC, used for
+	 * testing, ie sending frames with bad CRC.
+	 */
+	if (unlikely(skb->use_specified_ether_crc)) {
+		cb->command |= __constant_cpu_to_le16(cb_tx_nc);
+	}
+	else {
+		cb->command &= ~__constant_cpu_to_le16(cb_tx_nc);
+	}
+#endif
+
 	/* interrupt every 16 packets regardless of delay */
 	if((nic->cbs_avail & ~15) == nic->cbs_avail)
 		cb->command |= cpu_to_le16(cb_i);
@@ -1867,7 +1903,21 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx,
 	skb_reserve(skb, sizeof(struct rfd));
 	skb_put(skb, actual_size);
 	skb->protocol = eth_type_trans(skb, nic->netdev);
-
+	/* NOTE: The config step turns on acceptance of various bogus frames
+	 * when in loopback or promisc mode, but this code will still throw
+	 * them away unless you also set the new 'accept_all_frames' flag.
+	 * Perhaps the implementors meant to accept the bogus frames in
+	 * promisc mode here?? --Ben
+	 */
+	if(unlikely(nic->flags & accept_all_frames)) {
+		if(actual_size > nic->netdev->mtu + VLAN_ETH_HLEN) {
+			/* Received oversized frame */
+			dev->stats.rx_over_errors++;
+		}
+		/* We're accepting all, so pass the bogons on up the stack.
+		 */
+		goto process_skb;
+	}
+
 	if(unlikely(!(rfd_status & cb_ok))) {
 		/* Don't indicate if hardware indicates errors */
 		dev_kfree_skb_any(skb);
@@ -1876,6 +1926,7 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx,
 		nic->rx_over_length_errors++;
 		dev_kfree_skb_any(skb);
 	} else {
+	process_skb:
 		dev->stats.rx_packets++;
 		dev->stats.rx_bytes += actual_size;
 		nic->netdev->last_rx = jiffies;
@@ -2271,6 +2322,63 @@ static int e100_set_settings(struct net_device *netdev, struct ethtool_cmd *cmd)
 	return err;
 }
 
+static int e100_set_rxall(struct net_device *netdev, u32 data)
+{
+	struct nic *nic = netdev->priv;
+	if (data) {
+		netdev->priv_flags |= IFF_ACCEPT_ALL_FRAMES;
+		nic->flags |= accept_all_frames;
+	}
+	else {
+		netdev->priv_flags &= ~(IFF_ACCEPT_ALL_FRAMES);
+		nic->flags &= ~accept_all_frames;
+	}
+
+	e100_exec_cb(nic, NULL, e100_configure);
+
+	return 0;
+}
+
+static int e100_get_rxall(struct net_device *netdev, u32* data)
+{
+	struct nic *nic = netdev->priv;
+	if (nic->flags & accept_all_frames) {
+		*data = 1;
+	}
+	else {
+		*data = 0;
+	}
+
+	return 0;
+}
+
+static int e100_set_save_fcs(struct net_device *netdev, u32 data)
+{
+	struct nic *nic = netdev->priv;
+	if (data) {
+		nic->flags |= save_fcs;
+	}
+	else {
+		nic->flags &= ~save_fcs;
+	}
+	e100_exec_cb(nic, NULL, e100_configure);
+
+	return 0;
+}
+
+static int e100_get_save_fcs(struct net_device *netdev, u32* data)
+{
+	struct nic *nic = netdev->priv;
+	if (nic->flags & save_fcs) {
+		*data = 1;
+	}
+	else {
+		*data = 0;
+	}
+
+	return 0;
+}
+
 static void e100_get_drvinfo(struct net_device *netdev,
 	struct ethtool_drvinfo *info)
 {
@@ -2565,7 +2673,12 @@ static const struct ethtool_ops e100_ethtool_ops = {
 	.get_strings = e100_get_strings,
 	.phys_id = e100_phys_id,
 	.get_ethtool_stats = e100_get_ethtool_stats,
+	.set_rx_all = e100_set_rxall,
+	.get_rx_all = e100_get_rxall,
+	.set_save_fcs = e100_set_save_fcs,
+	.get_save_fcs = e100_get_save_fcs,
 	.get_sset_count = e100_get_sset_count,
+
 };
 
 static int e100_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
diff --git a/drivers/net/e1000/e1000.h b/drivers/net/e1000/e1000.h
index 3b84028..863f46d 100644
--- a/drivers/net/e1000/e1000.h
+++ b/drivers/net/e1000/e1000.h
@@ -359,4 +359,7 @@
 extern void e1000_set_ethtool_ops(struct net_device *netdev);
 extern void e1000_check_options(struct e1000_adapter *adapter);
 
+
+void e1000_set_rx_mode(struct net_device *netdev);
+
 #endif /* _E1000_H_ */
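For context, a hedged user-space sketch of driving the new rx-all hooks through
the standard SIOCETHTOOL ioctl.  It assumes the ETHTOOL_SETRXALL command code
added to include/linux/ethtool.h later in this patch, and that net/core/ethtool.c
is taught to dispatch that command to the driver's set_rx_all op (the dispatch
itself is not shown in this section):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>
    #include <linux/ethtool.h>
    #include <linux/sockios.h>

    #ifndef ETHTOOL_SETRXALL
    #define ETHTOOL_SETRXALL 0x00000072	/* value introduced by this patch */
    #endif

    /* Enable/disable acceptance of all frames (bad CRCs included) on ifname.
     * fd is any socket, e.g. socket(AF_INET, SOCK_DGRAM, 0); needs
     * CAP_NET_ADMIN. */
    static int set_rx_all(int fd, const char *ifname, __u32 on)
    {
    	struct ethtool_value ev = { .cmd = ETHTOOL_SETRXALL, .data = on };
    	struct ifreq ifr;

    	memset(&ifr, 0, sizeof(ifr));
    	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
    	ifr.ifr_data = (void *)&ev;
    	return ioctl(fd, SIOCETHTOOL, &ifr);
    }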
diff --git a/drivers/net/e1000/e1000_ethtool.c b/drivers/net/e1000/e1000_ethtool.c
index 85e66f4..af418a8 100644
--- a/drivers/net/e1000/e1000_ethtool.c
+++ b/drivers/net/e1000/e1000_ethtool.c
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/*****************************************************************
 
   Intel PRO/1000 Linux driver
 
   Copyright(c) 1999 - 2006 Intel Corporation.
@@ -1962,6 +1962,59 @@ e1000_get_strings(struct net_device *netdev, uint32_t stringset, uint8_t *data)
 	}
 }
 
+static int e1000_ethtool_setrxall(struct net_device *netdev, uint32_t val) {
+	unsigned short old_flags = netdev->priv_flags;
+	if (val) {
+		netdev->priv_flags |= IFF_ACCEPT_ALL_FRAMES;
+	}
+	else {
+		netdev->priv_flags &= ~(IFF_ACCEPT_ALL_FRAMES);
+	}
+
+	/* printk("e1000_ethtool_setrxall (%s) val: %d\n",
+	   netdev->name, val); */
+	if (old_flags != netdev->priv_flags) {
+		netif_tx_lock_bh(netdev);
+		if (netif_running(netdev)) {
+			/*printk("Kicking e1000 for setrxall..\n");*/
+			e1000_set_rx_mode(netdev);
+		} else {
+			/* Value will be flushed into the hardware when the
+			 * device is brought up.
+			 */
+		}
+		netif_tx_unlock_bh(netdev);
+	}
+	return 0;
+}
+
+static int e1000_ethtool_set_save_fcs(struct net_device *netdev, uint32_t val) {
+	netif_tx_lock_bh(netdev);
+	if (val) {
+		netdev->priv_flags |= IFF_SAVE_FCS;
+	}
+	else {
+		netdev->priv_flags &= ~IFF_SAVE_FCS;
+	}
+	netif_tx_unlock_bh(netdev);
+	return 0;
+}
+
+static int e1000_ethtool_get_save_fcs(struct net_device *netdev, uint32_t* val) {
+	*val = !!(netdev->priv_flags & IFF_SAVE_FCS);
+	/*printk("GET_SAVE_FCS, data: %d priv_flags: %hx\n",
+	  *val, netdev->priv_flags);*/
+	return 0;
+}
+
+static int e1000_ethtool_getrxall(struct net_device *netdev, uint32_t* val) {
+	*val = !!(netdev->priv_flags & IFF_ACCEPT_ALL_FRAMES);
+	/*printk("GETRXALL, data: %d priv_flags: %hx\n",
+	  *val, netdev->priv_flags);*/
+	return 0;
+}
+
+
 static const struct ethtool_ops e1000_ethtool_ops = {
 	.get_settings = e1000_get_settings,
 	.set_settings = e1000_set_settings,
@@ -1991,6 +2044,10 @@ static const struct ethtool_ops e1000_ethtool_ops = {
 	.get_strings = e1000_get_strings,
 	.phys_id = e1000_phys_id,
 	.get_ethtool_stats = e1000_get_ethtool_stats,
+	.get_rx_all = e1000_ethtool_getrxall,
+	.set_rx_all = e1000_ethtool_setrxall,
+	.set_save_fcs = e1000_ethtool_set_save_fcs,
+	.get_save_fcs = e1000_ethtool_get_save_fcs,
 	.get_sset_count = e1000_get_sset_count,
 };
 
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 0991648..187c618 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/*****************************************************************
 
   Intel PRO/1000 Linux driver
 
   Copyright(c) 1999 - 2006 Intel Corporation.
@@ -159,7 +159,7 @@ static void e1000_clean_tx_ring(struct e1000_adapter *adapter,
                                 struct e1000_tx_ring *tx_ring);
 static void e1000_clean_rx_ring(struct e1000_adapter *adapter,
                                 struct e1000_rx_ring *rx_ring);
-static void e1000_set_rx_mode(struct net_device *netdev);
+void e1000_set_rx_mode(struct net_device *netdev);
 static void e1000_update_phy_info(unsigned long data);
 static void e1000_watchdog(unsigned long data);
 static void e1000_82547_tx_fifo_stall(unsigned long data);
@@ -1046,6 +1046,9 @@ e1000_probe(struct pci_dev *pdev,
 	if (pci_using_dac)
 		netdev->features |= NETIF_F_HIGHDMA;
 
+	/* Has ability to receive all frames (even bad CRCs and such) */
+	netdev->features |= NETIF_F_RX_ALL | NETIF_F_SAVE_CRC;
+
 	netdev->features |= NETIF_F_LLTX;
 
 	adapter->en_mng_pt = e1000_enable_mng_pass_thru(&adapter->hw);
@@ -2488,7 +2491,7 @@ e1000_set_mac(struct net_device *netdev, void *p)
  * promiscuous mode, and all-multi behavior.
**/ -static void +void e1000_set_rx_mode(struct net_device *netdev) { struct e1000_adapter *adapter = netdev_priv(netdev); @@ -2531,6 +2534,35 @@ e1000_set_rx_mode(struct net_device *netdev) E1000_WRITE_REG(hw, RCTL, rctl); + + /* This is useful for using ethereal or tcpdump to sniff + * packets in promiscuous mode without stripping VLAN/priority + * information, and also letting bad packets through. + * + * THIS IS NOT PRODUCTION CODE - FOR INTERNAL USE ONLY!!! + * + */ + if (netdev->priv_flags & IFF_ACCEPT_ALL_FRAMES) { + uint32_t ctrl; + /*printk("%s: Enabling acceptance of ALL frames (bad CRC too).\n", + netdev->name); */ + /* store bad packets, promisc/multicast all, no VLAN + * filter */ + rctl = E1000_READ_REG(hw, RCTL); + rctl |= (E1000_RCTL_SBP | E1000_RCTL_UPE | E1000_RCTL_MPE); + rctl &= ~(E1000_RCTL_VFE | E1000_RCTL_CFIEN); + E1000_WRITE_REG(hw, RCTL, rctl); + /* disable VLAN tagging/striping */ + ctrl = E1000_READ_REG(hw, CTRL); + ctrl &= ~E1000_CTRL_VME; + E1000_WRITE_REG(hw, CTRL, ctrl); + } + else { + /* TODO: Do we need a way to explicitly turn this off if it was + * previously enabled, or will it magically go back to normal??? --Ben + */ + } + /* 82542 2.0 needs to be in reset to write receive address registers */ if (hw->mac_type == e1000_82542_rev2_0) @@ -2954,6 +2986,7 @@ set_itr_now: #define E1000_TX_FLAGS_VLAN 0x00000002 #define E1000_TX_FLAGS_TSO 0x00000004 #define E1000_TX_FLAGS_IPV4 0x00000008 +#define E1000_TX_FLAGS_NO_FCS 0x00000010 #define E1000_TX_FLAGS_VLAN_MASK 0xffff0000 #define E1000_TX_FLAGS_VLAN_SHIFT 16 @@ -3204,6 +3237,13 @@ e1000_tx_queue(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, txd_upper |= (tx_flags & E1000_TX_FLAGS_VLAN_MASK); } +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (unlikely(tx_flags & E1000_TX_FLAGS_NO_FCS)) { + txd_lower &= ~(E1000_TXD_CMD_IFCS); + /* printk("Disabling CRC in tx_queue, txd_lower: 0x%x\n", txd_lower); */ + } +#endif + i = tx_ring->next_to_use; while (count--) { @@ -3218,6 +3258,14 @@ e1000_tx_queue(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, tx_desc->lower.data |= cpu_to_le32(adapter->txd_cmd); +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + /* txd_cmd re-enables FCS, so we'll re-disable it here as desired. */ + if (unlikely(tx_flags & E1000_TX_FLAGS_NO_FCS)) { + tx_desc->lower.data &= ~(cpu_to_le32(E1000_TXD_CMD_IFCS)); + /* printk("Disabling2 CRC in tx_queue, txd_lower: 0x%x\n", tx_desc->lower.data); */ + } +#endif + /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. (Only * applicable for weak-ordered memory model archs, @@ -3495,6 +3543,12 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) if (likely(skb->protocol == htons(ETH_P_IP))) tx_flags |= E1000_TX_FLAGS_IPV4; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (unlikely(skb->use_specified_ether_crc)) { + tx_flags |= E1000_TX_FLAGS_NO_FCS; + } +#endif + e1000_tx_queue(adapter, tx_ring, tx_flags, e1000_tx_map(adapter, tx_ring, skb, first, max_per_txd, nr_frags, mss)); @@ -4265,7 +4319,11 @@ e1000_clean_rx_irq(struct e1000_adapter *adapter, goto next_desc; } - if (unlikely(rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK)) { + /* If we are accepting all frames, then do not pay attention to the + * framing errors. 
+ */ + if (unlikely(rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK) && + !(netdev->priv_flags & IFF_ACCEPT_ALL_FRAMES)) { last_byte = *(skb->data + length - 1); if (TBI_ACCEPT(&adapter->hw, status, rx_desc->errors, length, last_byte)) { @@ -4291,6 +4349,16 @@ e1000_clean_rx_irq(struct e1000_adapter *adapter, total_rx_bytes += length; total_rx_packets++; + + // This may not be needed now. --Ben + //if (netdev->priv_flags & IFF_SAVE_FCS) { + // skb_put(skb, length); + //} + //else { + // skb_put(skb, length - ETHERNET_FCS_SIZE); + //} + + /* code added for copybreak, this should improve * performance for small packets with large amounts * of reassembly being done in the stack */ @@ -4433,7 +4501,8 @@ e1000_clean_rx_irq_ps(struct e1000_adapter *adapter, goto next_desc; } - if (unlikely(staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK)) { + if ((unlikely(staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK)) && + !(netdev->priv_flags & IFF_ACCEPT_ALL_FRAMES)) { dev_kfree_skb_irq(skb); goto next_desc; } diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 34c2b98..127ee17 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -38,13 +38,6 @@ struct macvlan_port { struct list_head vlans; }; -struct macvlan_dev { - struct net_device *dev; - struct list_head list; - struct hlist_node hlist; - struct macvlan_port *port; - struct net_device *lowerdev; -}; static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, @@ -70,6 +63,9 @@ static void macvlan_broadcast(struct sk_buff *skb, struct sk_buff *nskb; unsigned int i; + if (skb->protocol == htons(ETH_P_PAUSE)) + return; + for (i = 0; i < MACVLAN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(vlan, n, &port->vlan_hash[i], hlist) { dev = vlan->dev; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9f341b8..1059ccf 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -138,6 +138,7 @@ struct TCP_Server_Info { struct sockaddr_in sockAddr; struct sockaddr_in6 sockAddr6; } addr; + u32 ip4_local_ip; wait_queue_head_t response_q; wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ struct list_head pending_mid_q; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8dbfa97..750dcbb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -95,12 +95,14 @@ struct smb_vol { unsigned int sockopt; unsigned short int port; char *prepath; + u32 local_ip; /* allow binding to a local IP address if != 0 */ }; static int ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, char *netb_name, - char *server_netb_name); + char *server_netb_name, + u32 local_ip); static int ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket); @@ -195,7 +197,8 @@ cifs_reconnect(struct TCP_Server_Info *server) rc = ipv4_connect(&server->addr.sockAddr, &server->ssocket, server->workstation_RFC1001_name, - server->server_RFC1001_name); + server->server_RFC1001_name, + server->ip4_local_ip); } if (rc) { cFYI(1, ("reconnect error %d", rc)); @@ -1038,6 +1041,18 @@ cifs_parse_mount_options(char *options, const char *devname, "long\n"); return 1; } + } else if (strnicmp(data, "local_ip", 8) == 0) { + if (!value || !*value) { + printk(KERN_WARNING "CIFS: local_ip value not specified.\n"); + return 1; /* needs_arg; */ + } + i = cifs_inet_pton(AF_INET, value, &(vol->local_ip)); + if (i < 0) { + vol->local_ip = 0; + printk(KERN_WARNING "CIFS: Could not parse local_ip: %s\n", + value); + return 1; + } } else if (strnicmp(data, "prefixpath", 10) == 0) { if (!value || !*value) { printk(KERN_WARNING @@ 
-1319,7 +1334,8 @@ cifs_parse_mount_options(char *options, const char *devname, static struct cifsSesInfo * cifs_find_tcp_session(struct in_addr *target_ip_addr, struct in6_addr *target_ip6_addr, - char *userName, struct TCP_Server_Info **psrvTcp) + char *userName, struct TCP_Server_Info **psrvTcp, + u32 local_ip) { struct list_head *tmp; struct cifsSesInfo *ses; @@ -1329,7 +1345,11 @@ cifs_find_tcp_session(struct in_addr *target_ip_addr, list_for_each(tmp, &GlobalSMBSessionList) { ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); if (ses->server) { - if ((target_ip_addr && + if((target_ip_addr && + /* If binding to a local IP, do not re-use sessions bound to different + * local IP addresses. + */ + (local_ip == ses->server->ip4_local_ip) && (ses->server->addr.sockAddr.sin_addr.s_addr == target_ip_addr->s_addr)) || (target_ip6_addr && memcmp(&ses->server->addr.sockAddr6.sin6_addr, @@ -1358,7 +1378,7 @@ cifs_find_tcp_session(struct in_addr *target_ip_addr, } static struct cifsTconInfo * -find_unc(__be32 new_target_ip_addr, char *uncName, char *userName) +find_unc(__be32 new_target_ip_addr, char *uncName, char *userName, u32 local_ip) { struct list_head *tmp; struct cifsTconInfo *tcon; @@ -1373,8 +1393,9 @@ find_unc(__be32 new_target_ip_addr, char *uncName, char *userName) ("old ip addr: %x == new ip %x ?", tcon->ses->server->addr.sockAddr.sin_addr. s_addr, new_target_ip_addr)); - if (tcon->ses->server->addr.sockAddr.sin_addr. - s_addr == new_target_ip_addr) { + if ((local_ip == tcon->ses->server->ip4_local_ip) && + (tcon->ses->server->addr.sockAddr.sin_addr. + s_addr == new_target_ip_addr)) { /* BB lock tcon, server and tcp session and increment use count here? */ /* found a match on the TCP session */ /* BB check if reconnection needed */ @@ -1481,7 +1502,8 @@ static void rfc1002mangle(char *target, char *source, unsigned int length) static int ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, - char *netbios_name, char *target_name) + char *netbios_name, char *target_name, + u32 local_ip /* in network byte order */) { int rc = 0; int connected = 0; @@ -1501,6 +1523,24 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, } } + /* Bind to the local IP address if specified */ + if (local_ip) { + struct sockaddr_in myaddr = { + .sin_family = AF_INET, + }; + myaddr.sin_addr.s_addr = local_ip; + myaddr.sin_port = 0; /* any */ + rc = (*csocket)->ops->bind(*csocket, (struct sockaddr *) &myaddr, + sizeof(myaddr)); + if (rc < 0) { + printk("Tried to bind to local ip: 0x%x, but failed with error: %d\n", + local_ip, rc); + } + else { + printk("CIFS: Successfully bound to local ip: 0x%x\n", local_ip); + } + } + psin_server->sin_family = AF_INET; if (psin_server->sin_port) { /* user overrode default port */ rc = (*csocket)->ops->connect(*csocket, @@ -1912,12 +1952,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, if (address_type == AF_INET) existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr, NULL /* no ipv6 addr */, - volume_info.username, &srvTcp); + volume_info.username, &srvTcp, volume_info.local_ip); else if (address_type == AF_INET6) { cFYI(1, ("looking for ipv6 address")); existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */, &sin_server6.sin6_addr, - volume_info.username, &srvTcp); + volume_info.username, &srvTcp, 0); } else { rc = -EINVAL; goto out; @@ -1938,7 +1978,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } else rc = ipv4_connect(&sin_server, &csocket, 
 						volume_info.source_rfc1001_name,
-						volume_info.target_rfc1001_name);
+						volume_info.target_rfc1001_name,
+						volume_info.local_ip);
 		if (rc < 0) {
 			cERROR(1, ("Error connecting to IPv4 socket. "
 				   "Aborting operation"));
@@ -1972,6 +2013,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			   to the struct since the kernel thread not created yet
 			   so no need to spinlock this init of tcpStatus */
 			srvTcp->tcpStatus = CifsNew;
+			srvTcp->ip4_local_ip = volume_info.local_ip;
 			init_MUTEX(&srvTcp->tcpSem);
 			srvTcp->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, srvTcp, "cifsd");
 			if (IS_ERR(srvTcp->tsk)) {
@@ -2125,7 +2167,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			tcon =
 			    find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
-				     volume_info.username);
+				     volume_info.username, volume_info.local_ip);
 			if (tcon) {
 				cFYI(1, ("Found match on UNC path"));
 				/* we can have only one retry value for a connection
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c6e72ae..357f2d7 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2101,6 +2101,11 @@
 COMPATIBLE_IOCTL(SIOCGMIIREG)
 COMPATIBLE_IOCTL(SIOCSMIIREG)
 COMPATIBLE_IOCTL(SIOCGIFVLAN)
 COMPATIBLE_IOCTL(SIOCSIFVLAN)
+COMPATIBLE_IOCTL(SIOCSIFMACVLAN)
+COMPATIBLE_IOCTL(SIOCGIFMACVLAN)
+COMPATIBLE_IOCTL(SIOCGIFREDIRDEV)
+COMPATIBLE_IOCTL(SIOCSIFREDIRDEV)
+COMPATIBLE_IOCTL(0x7450 /* GET_PKTGEN_INTERFACE_INFO */)
 COMPATIBLE_IOCTL(SIOCBRADDBR)
 COMPATIBLE_IOCTL(SIOCBRDELBR)
 /* SG stuff */
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 66648dd..7907e3e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -175,7 +175,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
 
 	/* Don't talk to strangers */
-	clp = nfs_find_client(svc_addr(rqstp), 4);
+	clp = nfs_find_client(svc_daddr(rqstp), svc_addr(rqstp), 4);
 	if (clp == NULL)
 		return SVC_DROP;
 
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d21..08cbf17 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -39,6 +39,7 @@ struct cb_compound_hdr_res {
 
 struct cb_getattrargs {
 	struct sockaddr *addr;
+	struct sockaddr *clientaddr;
 	struct nfs_fh fh;
 	uint32_t bitmap[2];
 };
@@ -54,6 +55,7 @@ struct cb_getattrres {
 
 struct cb_recallargs {
 	struct sockaddr *addr;
+	struct sockaddr *clientaddr;
 	struct nfs_fh fh;
 	nfs4_stateid stateid;
 	uint32_t truncate;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 15f7785..f82afb5 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -25,7 +25,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 	res->bitmap[0] = res->bitmap[1] = 0;
 	res->status = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs_find_client(args->clientaddr, args->addr, 4);
+	clp = nfs_find_client(args->clientaddr, args->addr, 4);
 	if (clp == NULL)
 		goto out;
@@ -68,7 +68,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 	__be32 res;
 
 	res = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs_find_client(args->addr, 4);
+	clp = nfs_find_client(args->clientaddr, args->addr, 4);
 	if (clp == NULL)
 		goto out;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 13619d2..fc4fa64 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -177,6 +177,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
 	if (unlikely(status != 0))
 		goto out;
 	args->addr = svc_addr(rqstp);
+	args->clientaddr = svc_daddr(rqstp);
 	status = decode_bitmap(xdr, args->bitmap);
 out:
 	dprintk("%s: exit with status = %d\n", __FUNCTION__,
ntohl(status)); @@ -189,6 +190,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, __be32 status; args->addr = svc_addr(rqstp); + args->clientaddr = svc_daddr(rqstp); status = decode_stateid(xdr, &args->stateid); if (unlikely(status != 0)) goto out; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index c5c0175..1603ecb 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -98,6 +98,7 @@ struct rpc_program nfsacl_program = { struct nfs_client_initdata { const char *hostname; const struct sockaddr *addr; + const struct sockaddr *clientaddr; size_t addrlen; const struct nfs_rpc_ops *rpc_ops; int proto; @@ -129,6 +130,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); clp->cl_addrlen = cl_init->addrlen; + memcpy(&clp->cl_ipaddr, cl_init->clientaddr, cl_init->addrlen); if (cl_init->hostname) { clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); @@ -245,7 +247,8 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, * Find a client by IP address and protocol version * - returns NULL if no such client */ -struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) +struct nfs_client *nfs_find_client(const struct sockaddr *clientaddr, + const struct sockaddr *addr, u32 nfsversion) { struct nfs_client *clp; @@ -264,6 +267,8 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) if (addr->sa_family != clap->sa_family) continue; /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(clientaddr, &clp->cl_ipaddr)) + continue; if (!nfs_sockaddr_match_ipaddr(addr, clap)) continue; @@ -331,6 +336,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Match the full socket address */ + if (memcmp(&clp->cl_ipaddr, data->clientaddr, + sizeof(clp->cl_ipaddr)) != 0) + continue; if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0) continue; @@ -463,6 +471,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp, struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { .protocol = clp->cl_proto, + .saddress = (struct sockaddr *)&clp->cl_ipaddr, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, .timeout = timeparms, @@ -633,6 +642,7 @@ static int nfs_init_server(struct nfs_server *server, .hostname = data->nfs_server.hostname, .addr = (const struct sockaddr *)&data->nfs_server.address, .addrlen = data->nfs_server.addrlen, + .clientaddr = (const struct sockaddr *)&data->client.address, .rpc_ops = &nfs_v2_clientops, .proto = data->nfs_server.protocol, }; @@ -941,7 +951,6 @@ error: */ static int nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour) { int error; @@ -959,7 +968,6 @@ static int nfs4_init_client(struct nfs_client *clp, RPC_CLNT_CREATE_DISCRTRY); if (error < 0) goto error; - memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); error = nfs_idmap_new(clp); if (error < 0) { @@ -985,7 +993,7 @@ static int nfs4_set_client(struct nfs_server *server, const char *hostname, const struct sockaddr *addr, const size_t addrlen, - const char *ip_addr, + const struct sockaddr *ip_addr, rpc_authflavor_t authflavour, int proto, const struct rpc_timeout *timeparms) { @@ -993,6 +1001,7 @@ static int nfs4_set_client(struct nfs_server *server, .hostname = hostname, .addr = addr, .addrlen = addrlen, + .clientaddr = ip_addr, .rpc_ops = 
&nfs_v4_clientops, .proto = proto, }; @@ -1007,7 +1016,7 @@ static int nfs4_set_client(struct nfs_server *server, error = PTR_ERR(clp); goto error; } - error = nfs4_init_client(clp, timeparms, ip_addr, authflavour); + error = nfs4_init_client(clp, timeparms, authflavour); if (error < 0) goto error_put; @@ -1041,7 +1050,7 @@ static int nfs4_init_server(struct nfs_server *server, data->nfs_server.hostname, (const struct sockaddr *)&data->nfs_server.address, data->nfs_server.addrlen, - data->client_address, + (const struct sockaddr *)&data->client.address, data->auth_flavors[0], data->nfs_server.protocol, &timeparms); @@ -1157,7 +1166,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, error = nfs4_set_client(server, data->hostname, data->addr, data->addrlen, - parent_client->cl_ipaddr, + &parent_client->cl_ipaddr, data->authflavor, parent_server->client->cl_xprt->prot, parent_server->client->cl_timeout); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4c62be1..2048eb4 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -40,7 +40,12 @@ struct nfs_parsed_mount_data { unsigned int bsize; unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; - char *client_address; + + struct { + struct sockaddr_storage address; + size_t addrlen; + char *hostname; + } client; struct { struct sockaddr_storage address; @@ -66,7 +71,8 @@ struct nfs_parsed_mount_data { extern struct rpc_program nfs_program; extern void nfs_put_client(struct nfs_client *); -extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); +extern struct nfs_client *nfs_find_client(const struct sockaddr *, + const struct sockaddr *, u32); extern struct nfs_client *nfs_find_client_next(struct nfs_client *); extern struct nfs_server *nfs_create_server( const struct nfs_parsed_mount_data *, diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0..1270ff9 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -38,7 +38,8 @@ struct mnt_fhstatus { * * Uses default timeout parameters specified by underlying transport. 
*/ -int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, +int nfs_mount(struct sockaddr *clientaddr, struct sockaddr *addr, size_t len, + char *hostname, char *path, int version, int protocol, struct nfs_fh *fh) { struct mnt_fhstatus result = { @@ -50,6 +51,7 @@ int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, }; struct rpc_create_args args = { .protocol = protocol, + .saddress = clientaddr, .address = addr, .addrsize = len, .servername = hostname, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3a2ff77..30a4ae7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2882,8 +2882,8 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po for(;;) { setclientid.sc_name_len = scnprintf(setclientid.sc_name, - sizeof(setclientid.sc_name), "%s/%s %s %s %u", - clp->cl_ipaddr, + sizeof(setclientid.sc_name), "%u.%u.%u.%u/%s %s %s %u", + NIPQUAD(((struct sockaddr_in *)&clp->cl_ipaddr)->sin_addr), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, @@ -2895,8 +2895,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_NETID)); setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, - sizeof(setclientid.sc_uaddr), "%s.%u.%u", - clp->cl_ipaddr, port >> 8, port & 255); + sizeof(setclientid.sc_uaddr), "%u.%u.%u.%u.%u.%u", + NIPQUAD(((struct sockaddr_in *)&clp->cl_ipaddr)->sin_addr), + port >> 8, port & 255); status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); if (status != -NFS4ERR_CLID_INUSE) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 531379d..d5f88eb 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -494,7 +494,7 @@ static int __init root_nfs_get_handle(void) NFS_MNT3_VERSION : NFS_MNT_VERSION; set_sockaddr(&sin, servaddr, htons(mount_port)); - status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, + status = nfs_mount(NULL, (struct sockaddr *) &sin, sizeof(sin), NULL, nfs_path, version, protocol, &fh); if (status < 0) printk(KERN_ERR "Root-NFS: Server returned error %d " diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f921902..d51b8cb 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1019,8 +1019,11 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - kfree(mnt->client_address); - mnt->client_address = string; + nfs_parse_server_address(string, (struct sockaddr *) + &mnt->client.address, + &mnt->client.addrlen); + kfree(mnt->client.hostname); + mnt->client.hostname = string; break; case Opt_mounthost: string = match_strdup(args); @@ -1117,7 +1120,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, * Now ask the mount server to map our export path * to a file handle. */ - status = nfs_mount(sap, + status = nfs_mount(NULL, sap, args->mount_server.addrlen, hostname, args->nfs_server.export_path, @@ -1231,6 +1234,13 @@ static int nfs_validate_mount_data(void *options, args->bsize = data->bsize; args->auth_flavors[0] = data->pseudoflavor; + /* FIXME: Should be version 7 - why do old API versions fall + * through to new ones??? 
+	 */
+	memcpy(&args->client.address, &data->clientaddr,
+	       sizeof(data->clientaddr));
+	args->client.addrlen = sizeof(data->clientaddr);
+
 	/*
 	 * The legacy version 6 binary mount data from userspace has a
 	 * field used only to transport selinux information into the
@@ -1798,7 +1808,10 @@ static int nfs4_validate_mount_data(void *options,
 		c = strndup_user(data->client_addr.data, 16);
 		if (IS_ERR(c))
 			return PTR_ERR(c);
-		args->client_address = c;
+		args->client.hostname = c;
+		args->client.addrlen = 4;
+		((struct sockaddr_in *)&args->client.address)->sin_family = AF_INET;
+		((struct sockaddr_in *)&args->client.address)->sin_addr.s_addr = in_aton(c);
 
 		/*
 		 * Translate to nfs_parsed_mount_data, which nfs4_fill_super
@@ -1861,7 +1874,7 @@ static int nfs4_validate_mount_data(void *options,
 		dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
 
-		if (args->client_address == NULL)
+		if (args->client.hostname == NULL)
 			goto out_no_client_address;
 
 		break;
@@ -1952,7 +1965,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	error = 0;
 
 out:
-	kfree(data.client_address);
+	kfree(data.client.hostname);
 	kfree(data.nfs_server.export_path);
 	kfree(data.nfs_server.hostname);
 	security_free_mnt_opts(&data.lsm_opts);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 82b3a1b..342aac7 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -429,7 +429,8 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 			}
 		}
 		unlock_new_inode(inode);
-	}
+	} else
+		module_put(de->owner);
 	return inode;
 
 out_ino:
diff --git a/include/asm-x86/socket.h b/include/asm-x86/socket.h
index 80af9c4..0b19fb2 100644
--- a/include/asm-x86/socket.h
+++ b/include/asm-x86/socket.h
@@ -54,4 +54,9 @@
 
 #define SO_MARK			36
 
+/* Instruct lower device to not calculate the frame
+ * checksum.  Useful only for testing, afaik. --Ben */
+#define SO_NOFCS		50
+
+
 #endif /* _ASM_SOCKET_H */
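Putting SO_NOFCS together with the CONFIG_SUPPORT_SEND_BAD_CRC pieces elsewhere
in this patch, a user-space sender would look roughly like the sketch below.
It assumes the sockopt is wired through to the new skb->use_specified_ether_crc
bit (the net/core/sock.c plumbing is not part of this section) and that the NIC
driver honours the bit, as the e100/e1000 changes above do:

    #include <unistd.h>
    #include <arpa/inet.h>
    #include <sys/socket.h>
    #include <linux/if_ether.h>

    #ifndef SO_NOFCS
    #define SO_NOFCS 50	/* value from the asm-x86/socket.h hunk above */
    #endif

    int main(void)
    {
    	int one = 1;
    	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

    	if (fd < 0)
    		return 1;
    	/* From here on, the last 4 bytes of each frame we send are
    	 * transmitted verbatim as the ethernet CRC. */
    	if (setsockopt(fd, SOL_SOCKET, SO_NOFCS, &one, sizeof(one)) < 0)
    		return 1;
    	/* ... build a frame whose final 4 bytes hold the (deliberately
    	 * bogus) FCS, then bind()/sendto() as usual for AF_PACKET ... */
    	close(fd);
    	return 0;
    }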
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c8d2163..f84863f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -1,4 +1,4 @@
-/*
+/* -*-linux-c-*-
 * ethtool.h: Defines for Linux ethtool.
 *
 * Copyright (C) 1998 David S. Miller (davem@redhat.com)
@@ -324,7 +324,11 @@
 * self_test: Run specified self-tests
 * get_strings: Return a set of strings that describe the requested objects
 * phys_id: Identify the device
 * get_stats: Return statistics about the device
+ * set_rx_all: Set or clear IFF_ACCEPT_ALL_FRAMES, see if.h
+ * get_rx_all: Return 1 if set, 0 if not.
+ * set_save_fcs: Set or clear IFF_SAVE_FCS, see if.h
+ * get_save_fcs: Return 1 if set, 0 if not.
 * get_flags: get 32-bit flags bitmap
 * set_flags: set 32-bit flags bitmap
 *
@@ -383,6 +387,10 @@ struct ethtool_ops {
	void	(*get_strings)(struct net_device *, u32 stringset, u8 *);
	int	(*phys_id)(struct net_device *, u32);
	void	(*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, u64 *);
+	int	(*set_rx_all)(struct net_device *, u32);
+	int	(*get_rx_all)(struct net_device *, u32 *);
+	int	(*set_save_fcs)(struct net_device *, u32);
+	int	(*get_save_fcs)(struct net_device *, u32 *);
	int	(*begin)(struct net_device *);
	void	(*complete)(struct net_device *);
	u32	(*get_ufo)(struct net_device *);
@@ -399,6 +407,13 @@ struct ethtool_ops {
 };
 #endif /* __KERNEL__ */
 
+/* for dumping net-device statistics */
+struct ethtool_ndstats {
+	u32	cmd;		/* ETHTOOL_GNDSTATS */
+	u8	data[0];	/* sizeof(struct net_device_stats) */
+};
+
+
 /* CMDs currently supported */
 #define ETHTOOL_GSET		0x00000001 /* Get settings. */
 #define ETHTOOL_SSET		0x00000002 /* Set settings. */
@@ -442,6 +457,15 @@
 #define ETHTOOL_GPFLAGS		0x00000027 /* Get driver-private flags bitmap */
 #define ETHTOOL_SPFLAGS		0x00000028 /* Set driver-private flags bitmap */
 
+
+#define ETHTOOL_GNDSTATS	0x00000070 /* get standard net-device statistics */
+#define ETHTOOL_GETRXALL	0x00000071 /* Retrieve whether or not
+					    * IFF_ACCEPT_ALL_FRAMES is set. */
+#define ETHTOOL_SETRXALL	0x00000072 /* Set IFF_ACCEPT_ALL_FRAMES */
+#define ETHTOOL_GETRXFCS	0x00000073 /* Get IFF_SAVE_FCS */
+#define ETHTOOL_SETRXFCS	0x00000074 /* Set IFF_SAVE_FCS */
+
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
diff --git a/include/linux/if.h b/include/linux/if.h
index 5c9d1fa..63de1a7 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -65,6 +65,14 @@
 #define IFF_SLAVE_NEEDARP 0x40		/* need ARPs for validation	*/
 #define IFF_ISATAP	0x80		/* ISATAP interface (RFC4214)	*/
 
+#define IFF_ACCEPT_ALL_FRAMES 0x0400	/* Accept all frames, even ones with bad CRCs.
+					 * Should only be used in debugging/testing situations.
+					 * Do NOT enable this unless you understand the
+					 * consequences! */
+#define IFF_SAVE_FCS	0x0800		/* Save the Frame Check Sequence (FCS) on
+					 * receive, if possible. */
+#define IFF_MAC_VLAN	0x1000		/* MAC VLAN device. */
+
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
 
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 0d9d7ea..1a0f9ef 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -5,5 +5,13 @@
 
 extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *);
 
+struct macvlan_dev {
+	struct net_device	*dev;
+	struct list_head	list;
+	struct hlist_node	hlist;
+	struct macvlan_port	*port;
+	struct net_device	*lowerdev;
+};
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_IF_MACVLAN_H */
diff --git a/include/linux/if_redirdev.h b/include/linux/if_redirdev.h
new file mode 100644
index 0000000..cf8055c
--- /dev/null
+++ b/include/linux/if_redirdev.h
@@ -0,0 +1,35 @@
+/* -*- linux-c -*- */
+#ifndef _LINUX_IF_REDIRDEV_H
+#define _LINUX_IF_REDIRDEV_H
+
+/* the ioctl commands */
+
+#define REDIRDEV_ADD 2090
+#define REDIRDEV_DEL 2091
+/* If this IOCTL succeeds, we are a Redirect-Device
+   interface, otherwise, we are not.
+ */
+#define REDIRDEV_IS_REDIRDEV 2092
+#define REDIRDEV_GET_BY_IDX 2093
+#define REDIRDEV_GET_BY_NAME 2094
+#define REDIRDEV_SET_QUOTA 2095
+
+#ifdef __KERNEL__
+#include
+#include
+extern int (*redirdev_ioctl_hook)(void*);
+
+#endif
+
+/* Request and response */
+struct redirdev_ioctl {
+	u32 cmd;
+	u32 ifidx;	/* when getting info by idx */
+
+#define RDD_ASSOCIATED (1<<0)
+	u32 flags;	/* 1<<0: Is the interface associated with tx-dev or not */
+	u32 not_used;	/* explicitly align 64-bit */
+	char ifname[IFNAMSIZ];
+	char txifname[IFNAMSIZ];
+};
+
+#endif
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index fc4e3db..1ab92bd 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -107,7 +107,8 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 #define IN_DEV_ARPFILTER(in_dev)	IN_DEV_ORCONF((in_dev), ARPFILTER)
 #define IN_DEV_ARP_ANNOUNCE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
 #define IN_DEV_ARP_IGNORE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_IGNORE)
-
+#define IN_DEV_ACCEPT_STS(in_dev)	IN_DEV_MAXCONF((in_dev), ACCEPT_STS)
+
 struct in_ifaddr {
 	struct in_ifaddr	*ifa_next;
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 2584306..0d66649 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -158,6 +158,11 @@ struct ipv6_devconf {
 #endif
 	__s32		proxy_ndp;
 	__s32		accept_source_route;
+	__s32		nlnotify_on_addr_add; /* Always notify netlink on addr add, even if it is tentative.
+					       * As currently implemented, this will often cause multiple netlink
+					       * RTM_NEWADDR messages, as a new notification will be sent when
+					       * the address becomes un-tentative.
+					       */
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	__s32		optimistic_dad;
 #endif
@@ -192,6 +197,7 @@ enum {
 	DEVCONF_PROXY_NDP,
 	DEVCONF_OPTIMISTIC_DAD,
 	DEVCONF_ACCEPT_SOURCE_ROUTE,
+	DEVCONF_NLNOTIFY_ON_ADDR_ADD,
 	DEVCONF_MAX
 };
 
diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 35a8277..9bca1b8 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -30,11 +30,16 @@
 #define SIOCGETSGCNT	(SIOCPROTOPRIVATE+1)
 #define SIOCGETRPF	(SIOCPROTOPRIVATE+2)
 
+#define SIOCGETVIFCNT_NG	(SIOCPROTOPRIVATE+3)
+#define SIOCGETSGCNT_NG		(SIOCPROTOPRIVATE+4)
+
 #define MAXVIFS		32
 typedef unsigned long vifbitmap_t;	/* User mode code depends on this lot */
 typedef unsigned short vifi_t;
 #define ALL_VIFS	((vifi_t)(-1))
 
+#define DFLT_MROUTE_TBL	RT_TABLE_MAIN
+
 /*
 *	Same idea as select
 */
@@ -60,6 +65,11 @@ struct vifctl {
 	struct in_addr vifc_rmt_addr;	/* IPIP tunnel addr */
 };
 
+struct vifctl_ng {
+	struct vifctl vif;
+	unsigned table_id;
+} __attribute__ ((packed));
+
 #define VIFF_TUNNEL	0x1	/* IPIP tunnel */
 #define VIFF_SRCRT	0x2	/* NI */
 #define VIFF_REGISTER	0x4	/* register vif */
@@ -80,6 +90,18 @@ struct mfcctl
 	int mfcc_expire;
 };
 
+struct mfcctl_ng
+{
+	struct mfcctl mfc;
+	unsigned int table_id;
+} __attribute__ ((packed));
+
+struct mrt_sockopt_simple
+{
+	unsigned int optval;
+	unsigned int table_id;
+};
+
 /*
 *	Group count retrieval for mrouted
 */
@@ -93,6 +115,12 @@ struct sioc_sg_req
 	unsigned long wrong_if;
 };
 
+struct sioc_sg_req_ng
+{
+	struct sioc_sg_req req;
+	unsigned int table_id;
+} __attribute__ ((packed));
+
 /*
 *	To get vif packet counts
 */
@@ -106,6 +134,12 @@ struct sioc_vif_req
 	unsigned long obytes;	/* Out bytes */
 };
 
+struct sioc_vif_req_ng
+{
+	struct sioc_vif_req vif;
+	unsigned int table_id;
+} __attribute__ ((packed));
+
 /*
 *	This is the format the mroute daemon expects to see IGMP control
 *	data. Magically happens to be like an IP packet as per the original
@@ -156,6 +190,8 @@ struct vif_device
 	unsigned short	flags;		/* Control flags		*/
 	__be32		local,remote;	/* Addresses(remote for tunnels)*/
 	int		link;		/* Physical interface index	*/
+	int		vif_index;	/* Index in vif_table		*/
+	unsigned int	table_id;	/* table-id that this vif belongs to */
 };
 
 #define VIFF_STATIC 0x8000
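The _NG ("new generation") structures above simply append a table_id to the
classic mroute requests.  Assuming the ipmr ioctl handlers are extended to
parse them (that kernel-side handling is not shown in this section), a
multicast routing daemon built against these patched headers could query
per-table (S,G) counters roughly as follows; igmp_fd is the usual
mrouted-style control socket, socket(AF_INET, SOCK_RAW, IPPROTO_IGMP):

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <linux/mroute.h>	/* patched header: sioc_sg_req_ng etc. */

    /* Query packet/byte counters for (src, grp) in a specific mroute table. */
    static int get_sg_counts(int igmp_fd, const char *src, const char *grp,
    			 unsigned int table_id)
    {
    	struct sioc_sg_req_ng req;

    	memset(&req, 0, sizeof(req));
    	inet_pton(AF_INET, src, &req.req.src);
    	inet_pton(AF_INET, grp, &req.req.grp);
    	req.table_id = table_id;	/* e.g. DFLT_MROUTE_TBL */

    	if (ioctl(igmp_fd, SIOCGETSGCNT_NG, &req) < 0)
    		return -1;
    	printf("pkts=%lu bytes=%lu wrong_if=%lu\n",
    	       req.req.pktcnt, req.req.bytecnt, req.req.wrong_if);
    	return 0;
    }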
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ee81906..6dbbf1b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -43,6 +43,10 @@
 
 #include
 
+#ifdef CONFIG_IP_MROUTE
+struct ipmr_table;
+#endif
+
 struct vlan_group;
 struct ethtool_ops;
 struct netpoll_info;
@@ -178,6 +182,7 @@ enum {
 struct neighbour;
 struct neigh_parms;
 struct sk_buff;
+struct pktgen_dev;
 
 struct netif_rx_stats
 {
@@ -507,10 +512,15 @@ struct net_device
 #define NETIF_F_NETNS_LOCAL	8192	/* Does not change network namespaces */
 #define NETIF_F_MULTI_QUEUE	16384	/* Has multiple TX/RX queues */
 #define NETIF_F_LRO		32768	/* large receive offload */
-
+#define NETIF_F_SAVE_CRC	(1<<16)	/* Can save FCS in skb, last 4 bytes for ethernet */
+#define NETIF_F_RX_ALL		(1<<17)	/* Can be configured to receive all packets, even
+					 * ones with busted CRC.  May disable VLAN filtering
+					 * in the NIC, users should NOT enable this feature
+					 * unless they understand the consequences. */
+
 /* Segmentation offload features */
-#define NETIF_F_GSO_SHIFT	16
-#define NETIF_F_GSO_MASK	0xffff0000
+#define NETIF_F_GSO_SHIFT	24
+#define NETIF_F_GSO_MASK	0xff000000
 #define NETIF_F_TSO		(SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
 #define NETIF_F_UFO		(SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
 #define NETIF_F_GSO_ROBUST	(SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
@@ -713,6 +723,20 @@ struct net_device
 	/* Network namespace this network device is inside */
 	struct net		*nd_net;
 
+	/* Callback for when the queue is woken, used by pktgen currently */
+	int (*notify_queue_woken)(struct net_device *dev);
+	void* nqw_data; /* To be used by the method above as needed */
+
+	struct pktgen_dev* pkt_dev; /* to quickly find the pkt-gen dev registered with this
+				     * interface, if any.
+				     */
+	long dflt_skb_mark; /* Specify skb->mark for pkts received on this interface. */
+
+#ifdef CONFIG_IP_MROUTE
+	/* IPv4 Multicast Routing Table for this device. */
+	struct ipmr_table* mrt_entry;
+#endif
+
 	/* bridge stuff */
 	struct net_bridge_port	*br_port;
 	/* macvlan */
@@ -936,8 +960,13 @@ static inline void netif_wake_queue(struct net_device *dev)
 			return;
 	}
 #endif
-	if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state))
+	if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state)) {
 		__netif_schedule(dev);
+
+		if (dev->notify_queue_woken) {
+			dev->notify_queue_woken(dev);
+		}
+	}
 }
 
 /**
@@ -1109,6 +1138,14 @@ extern int		netdev_budget;
 
 /* Called by rtnetlink.c:rtnl_unlock() */
 extern void netdev_run_todo(void);
 
+#ifdef CONFIG_DEBUG_NETDEV
+extern int netdev_debug;
+extern void __dev_hold(struct net_device *, const char *);
+extern void __dev_put(struct net_device *, const char *);
+
+#define dev_hold(dev) __dev_hold(dev, __FUNCTION__)
+#define dev_put(dev) __dev_put(dev, __FUNCTION__)
+#else
 /**
 *	dev_put - release reference to device
 *	@dev: network device
@@ -1130,6 +1167,8 @@ static inline void dev_hold(struct net_device *dev)
 {
 	atomic_inc(&dev->refcnt);
 }
+#endif
+
 
 /* Carrier loss detection, dial on demand.
The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f4a0e4c..7ec5360 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -528,7 +528,7 @@ static inline void nfs3_forget_cached_acls(struct inode *inode) /* * linux/fs/mount_clnt.c */ -extern int nfs_mount(struct sockaddr *, size_t, char *, char *, +extern int nfs_mount(struct sockaddr *, struct sockaddr *, size_t, char *, char *, int, int, struct nfs_fh *); /* diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 3423c67..8e5f377 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -62,10 +62,10 @@ struct nfs_client { /* idmapper */ struct idmap * cl_idmap; - /* Our own IP address, as a null-terminated string. + /* Our own IP address. * This is used to generate the clientid, and the callback address. */ - char cl_ipaddr[48]; + struct sockaddr cl_ipaddr; unsigned char cl_id_uniquifier; #endif }; diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h index df7c6b7..0c6c320 100644 --- a/include/linux/nfs_mount.h +++ b/include/linux/nfs_mount.h @@ -20,7 +20,7 @@ * mount-to-kernel version compatibility. Some of these aren't used yet * but here they are anyway. */ -#define NFS_MOUNT_VERSION 6 +#define NFS_MOUNT_VERSION 7 #define NFS_MAX_CONTEXT_LEN 256 struct nfs_mount_data { @@ -43,6 +43,7 @@ struct nfs_mount_data { struct nfs3_fh root; /* 4 */ int pseudoflavor; /* 5 */ char context[NFS_MAX_CONTEXT_LEN + 1]; /* 6 */ + struct sockaddr clientaddr; /* 7 */ }; /* bits in the flags field */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bbd8d00..446f47f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -288,7 +288,8 @@ struct sk_buff { fclone:2, ipvs_property:1, peeked:1, - nf_trace:1; + nf_trace:1, + use_specified_ether_crc:1; __be16 protocol; void (*destructor)(struct sk_buff *skb); diff --git a/include/linux/sockios.h b/include/linux/sockios.h index abef759..367287c 100644 --- a/include/linux/sockios.h +++ b/include/linux/sockios.h @@ -94,6 +94,13 @@ #define SIOCGRARP 0x8961 /* get RARP table entry */ #define SIOCSRARP 0x8962 /* set RARP table entry */ +/* MAC address based VLAN control calls */ +#define SIOCGIFMACVLAN 0x8965 /* Mac address multiplex/demultiplex support */ +#define SIOCSIFMACVLAN 0x8966 /* Set macvlan options */ + +#define SIOCGIFREDIRDEV 0x8967 /* Redirect device get ioctl */ +#define SIOCSIFREDIRDEV 0x8968 /* Set redirect dev options */ + /* Driver configuration calls */ #define SIOCGIFMAP 0x8970 /* Get device parameters */ @@ -122,6 +129,7 @@ #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ #define SIOCBRDELIF 0x89a3 /* remove interface from bridge */ + /* Device private ioctl calls */ /* diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 64c9755..781d767 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -281,6 +281,11 @@ static inline struct sockaddr *svc_addr(struct svc_rqst *rqst) return (struct sockaddr *) &rqst->rq_addr; } +static inline struct sockaddr *svc_daddr(struct svc_rqst *rqst) +{ + return (struct sockaddr *) &rqst->rq_daddr; +} + /* * Check buffer bounds after decoding arguments */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 571f01d..a45b5df 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -490,6 +490,7 @@ enum NET_IPV4_CONF_ARP_IGNORE=19, NET_IPV4_CONF_PROMOTE_SECONDARIES=20, NET_IPV4_CONF_ARP_ACCEPT=21, + 
NET_IPV4_CONF_ACCEPT_STS=22, __NET_IPV4_CONF_MAX }; @@ -578,6 +579,7 @@ enum { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN=22, NET_IPV6_PROXY_NDP=23, NET_IPV6_ACCEPT_SOURCE_ROUTE=25, + NET_IPV6_NLNOTIFY_ON_ADDR_ADD=26, __NET_IPV6_MAX }; @@ -606,6 +608,7 @@ enum { NET_NEIGH_GC_THRESH3=16, NET_NEIGH_RETRANS_TIME_MS=17, NET_NEIGH_REACHABLE_TIME_MS=18, + NET_NEIGH_RETRANS_RAND_BACKOFF=19, __NET_NEIGH_MAX }; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 496503c..a5c56b9 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -136,12 +136,25 @@ extern int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex); extern int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr); extern int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr); extern int ipv6_chk_acast_addr(struct net_device *dev, struct in6_addr *addr); - +/*extern void ipv6_ac_destroy_dev(struct inet6_dev *idev);*/ /* Device notifier */ extern int register_inet6addr_notifier(struct notifier_block *nb); extern int unregister_inet6addr_notifier(struct notifier_block *nb); +#ifdef CONFIG_IPV6_REF_DEBUG +extern void in6_dev_atomic_inc(struct inet6_dev *idev, const char *func, int line); +extern void in6_dev_atomic_dec(struct inet6_dev *idev, const char *func, int line); +extern int in6_dev_atomic_dec_and_test(struct inet6_dev *idev, const char *func, int line); +#else +#define in6_dev_atomic_inc(idev, func, line) atomic_inc(&(idev)->refcnt) +#define in6_dev_atomic_dec(idev, func, line) atomic_dec(&(idev)->refcnt) +#define in6_dev_atomic_dec_and_test(idev, func, line) atomic_dec_and_test(&(idev)->refcnt) +#endif + +#define __in6_dev_put(idev) in6_dev_atomic_dec((idev), __func__, __LINE__) +#define in6_dev_hold(idev) in6_dev_atomic_inc((idev), __func__, __LINE__) + static inline struct inet6_dev * __in6_dev_get(struct net_device *dev) { @@ -149,40 +162,48 @@ __in6_dev_get(struct net_device *dev) } static inline struct inet6_dev * -in6_dev_get(struct net_device *dev) +____in6_dev_get(struct net_device *dev, const char *func, int line) { struct inet6_dev *idev = NULL; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) - atomic_inc(&idev->refcnt); + in6_dev_atomic_inc(idev, func, line); rcu_read_unlock(); return idev; } +#define in6_dev_get(dev) ____in6_dev_get((dev), __func__, __LINE__) extern void in6_dev_finish_destroy(struct inet6_dev *idev); static inline void -in6_dev_put(struct inet6_dev *idev) +____in6_dev_put(struct inet6_dev *idev, const char *func, int line) { - if (atomic_dec_and_test(&idev->refcnt)) + if (in6_dev_atomic_dec_and_test(idev, func, line)) in6_dev_finish_destroy(idev); } +#define in6_dev_put(idev) ____in6_dev_put((idev), __func__, __LINE__) -#define __in6_dev_put(idev) atomic_dec(&(idev)->refcnt) -#define in6_dev_hold(idev) atomic_inc(&(idev)->refcnt) - +#ifdef CONFIG_IPV6_REF_DEBUG +extern void in6_ifa_atomic_inc(struct inet6_ifaddr *ifa, const char *func, int line); +extern void in6_ifa_atomic_dec(struct inet6_ifaddr *ifa, const char *func, int line); +extern int in6_ifa_atomic_dec_and_test(struct inet6_ifaddr *ifa, const char *func, int line); +#else +#define in6_ifa_atomic_inc(ifa, func, line) atomic_inc(&(ifa)->refcnt) +#define in6_ifa_atomic_dec(ifa, func, line) atomic_dec(&(ifa)->refcnt) +#define in6_ifa_atomic_dec_and_test(ifa, func, line) atomic_dec_and_test(&(ifa)->refcnt) +#endif extern void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); -static inline void in6_ifa_put(struct inet6_ifaddr *ifp) +static inline void ____in6_ifa_put(struct 
inet6_ifaddr *ifp, const char *func, int line) { - if (atomic_dec_and_test(&ifp->refcnt)) + if (in6_ifa_atomic_dec_and_test(ifp, func, line)) inet6_ifa_finish_destroy(ifp); } - -#define __in6_ifa_put(ifp) atomic_dec(&(ifp)->refcnt) -#define in6_ifa_hold(ifp) atomic_inc(&(ifp)->refcnt) +#define in6_ifa_put(ifp) ____in6_ifa_put((ifp), __func__, __LINE__) +#define __in6_ifa_put(ifp) in6_ifa_atomic_dec((ifp), __func__, __LINE__) +#define in6_ifa_hold(ifp) in6_ifa_atomic_inc((ifp), __func__, __LINE__) extern void addrconf_forwarding_on(void); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 64a5f01..4947976 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -65,6 +65,7 @@ struct neigh_parms int proxy_delay; int proxy_qlen; int locktime; + int retrans_rand_backoff; }; struct neigh_statistics diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h index d9a4f7f..83b9a6e 100644 --- a/include/net/netfilter/nf_conntrack_tuple.h +++ b/include/net/netfilter/nf_conntrack_tuple.h @@ -91,6 +91,8 @@ struct nf_conntrack_tuple /* The direction (for tuplehash) */ u_int8_t dir; } dst; + + u_int32_t mark; }; struct nf_conntrack_tuple_mask @@ -130,7 +132,8 @@ static inline int __nf_ct_tuple_src_equal(const struct nf_conntrack_tuple *t1, t1->src.u3.all[2] == t2->src.u3.all[2] && t1->src.u3.all[3] == t2->src.u3.all[3] && t1->src.u.all == t2->src.u.all && - t1->src.l3num == t2->src.l3num); + t1->src.l3num == t2->src.l3num && + t1->mark == t2->mark); } static inline int __nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1, @@ -141,7 +144,8 @@ static inline int __nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1, t1->dst.u3.all[2] == t2->dst.u3.all[2] && t1->dst.u3.all[3] == t2->dst.u3.all[3] && t1->dst.u.all == t2->dst.u.all && - t1->dst.protonum == t2->dst.protonum); + t1->dst.protonum == t2->dst.protonum && + t1->mark == t2->mark); } static inline int nf_ct_tuple_equal(const struct nf_conntrack_tuple *t1, @@ -177,7 +181,8 @@ static inline int nf_ct_tuple_src_mask_cmp(const struct nf_conntrack_tuple *t1, return 0; if (t1->src.l3num != t2->src.l3num || - t1->dst.protonum != t2->dst.protonum) + t1->dst.protonum != t2->dst.protonum || + t1->mark != t2->mark) return 0; return 1; diff --git a/include/net/sock.h b/include/net/sock.h index fd98760..e97029f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -406,6 +406,10 @@ enum sock_flags { SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ + SOCK_DONT_DO_LL_FCS, /* Tell NIC not to do the ethernet FCS. Will use + * last 4 bytes of packet sent from user-space + * instead. 
+ */ }; static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) diff --git a/kernel/panic.c b/kernel/panic.c index 24af9f8..57847ff 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -179,7 +179,7 @@ const char *print_tainted(void) void add_taint(unsigned flag) { - debug_locks = 0; /* can't trust the integrity of the kernel anymore */ + /* debug_locks = 0; --Ben */ /* can't trust the integrity of the kernel anymore */ tainted |= flag; } EXPORT_SYMBOL(add_taint); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index c09350d..6f3467c 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, + { NET_IPV4_CONF_ACCEPT_STS, "accept_sts" }, {} }; @@ -248,6 +249,7 @@ static const struct trans_ctl_table trans_net_neigh_vars_table[] = { { NET_NEIGH_GC_THRESH3, "gc_thresh3" }, { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, + { NET_NEIGH_RETRANS_RAND_BACKOFF, "retrans_rand_backoff_ms"}, {} }; @@ -494,6 +496,7 @@ static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, { NET_IPV6_PROXY_NDP, "proxy_ndp" }, { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { NET_IPV6_NLNOTIFY_ON_ADDR_ADD, "nlnotify_on_addr_add" }, {} }; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0796c1a..6854057 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -397,6 +397,13 @@ config DEBUG_KOBJECT If you say Y here, some extra kobject debugging messages will be sent to the syslog. +config DEBUG_NETDEV + bool "network device debugging" + depends on DEBUG_KERNEL + help + This option enables extra checking on usage and reference counting + of network devices. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index c0bac6d..91d95a2 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -435,6 +435,11 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, } break; +#if 0 + /* Don't propagate management state from base dev to VLANs. If you do this, + * then if you 'ifconfig eth0 down; ifconfig eth0 up', you also lose all the + * routes for eth0.* VLANs. --Ben + */ case NETDEV_DOWN: /* Put all VLANs for this dev in the down state too. */ for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { @@ -464,7 +469,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev_change_flags(vlandev, flgs | IFF_UP); } break; - +#endif + case NETDEV_UNREGISTER: /* Delete all VLANs for this dev. */ for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 41a76a0..af27671 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -367,6 +367,11 @@ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) struct net_device_stats *stats = &dev->stats; struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); + /* Please note, dev_queue_xmit consumes the pkt regardless of the + * return value. So, will copy the skb first and free if successful. + */ + struct sk_buff* skb2 = skb_get(skb); + /* Handle non-VLAN frames if they are sent to us, for example by DHCP. 
* * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING @@ -395,6 +400,10 @@ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) skb = __vlan_put_tag(skb, veth_TCI); if (!skb) { stats->tx_dropped++; + /* Free the extra copy, assuming this is a non-recoverable + * issue and we don't want calling code to retry. + */ + kfree_skb(skb2); return 0; } @@ -412,13 +421,21 @@ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) veth->h_vlan_proto, veth->h_vlan_TCI, veth->h_vlan_encapsulated_proto); - stats->tx_packets++; /* for statics only */ - stats->tx_bytes += skb->len; - skb->dev = vlan_dev_info(dev)->real_dev; - dev_queue_xmit(skb); - - return 0; + { + int rv = dev_queue_xmit(skb); + if (rv == 0) { + /* Was success, need to free the skb reference since + * we bumped up the user count above. If there was an + * error instead, then the skb2 will not be freed, and so + * the calling code will be able to re-send it. + */ + stats->tx_packets++; /* for statics only */ + stats->tx_bytes += skb2->len; + kfree_skb(skb2); + } + return rv; + } } static int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, diff --git a/net/Kconfig b/net/Kconfig index 6627c6a..fee9b8c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -183,6 +183,7 @@ source "net/tipc/Kconfig" source "net/atm/Kconfig" source "net/bridge/Kconfig" source "net/8021q/Kconfig" +source "net/redir/Kconfig" source "net/decnet/Kconfig" source "net/llc/Kconfig" source "net/ipx/Kconfig" @@ -225,6 +226,14 @@ config NET_TCPPROBE To compile this code as a module, choose M here: the module will be called tcp_probe. +config SUPPORT_SEND_BAD_CRC + bool "Support Send Bad CRC (USE WITH CAUTION)" + ---help--- + When enabled, one can send a specially crafted packet to the ethernet + device via a raw socket and it will be sent with the last 4 bytes of + the packet as the ethernet CRC. Requires driver support. Current driver + support is limited to e100 and e1000. + endmenu endmenu diff --git a/net/Makefile b/net/Makefile index b7a1364..2b3cb93 100644 --- a/net/Makefile +++ b/net/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_MAC80211) += mac80211/ obj-$(CONFIG_IEEE80211) += ieee80211/ obj-$(CONFIG_TIPC) += tipc/ obj-$(CONFIG_NETLABEL) += netlabel/ +obj-$(CONFIG_REDIRDEV) += redir/ obj-$(CONFIG_IUCV) += iucv/ obj-$(CONFIG_RFKILL) += rfkill/ obj-$(CONFIG_NET_9P) += 9p/ diff --git a/net/core/dev.c b/net/core/dev.c index bd08aa7..ea07825 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include @@ -119,9 +120,26 @@ #include #include #include +#include #include "net-sysfs.h" +#if defined(CONFIG_NET_PKTGEN) || defined(CONFIG_NET_PKTGEN_MODULE) +#include "pktgen.h" + +#warning "Compiling dev.c for pktgen."; + +int (*handle_pktgen_hook)(struct sk_buff *skb) = NULL; +EXPORT_SYMBOL(handle_pktgen_hook); + +static __inline__ int handle_pktgen_rcv(struct sk_buff* skb) { + if (handle_pktgen_hook) { + return handle_pktgen_hook(skb); + } + return -1; +} +#endif + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -2047,6 +2065,11 @@ int netif_receive_skb(struct sk_buff *skb) skb_reset_transport_header(skb); skb->mac_len = skb->network_header - skb->mac_header; + /* Set the default 'mark' for this skb. dflt_skb_mark may be set through + * the /sys/class/net/[dev-name]/dflt_skb_mark file. 
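For reference, the new dflt_skb_mark attribute (registered in net-sysfs.c further down) can be driven from user space like this; a small sketch with error handling kept minimal:

#include <stdio.h>

/* Every frame subsequently received on 'ifname' starts out with
 * skb->mark == mark, per the skb->mark assignment just below. */
static int set_dflt_skb_mark(const char *ifname, unsigned long mark)
{
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/class/net/%s/dflt_skb_mark", ifname);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%lu\n", mark);
        return fclose(f);
}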
+ */ + skb->mark = skb->dev->dflt_skb_mark; + pt_prev = NULL; rcu_read_lock(); @@ -2080,6 +2103,16 @@ ncls: if (!skb) goto out; +#if defined(CONFIG_NET_PKTGEN) || defined(CONFIG_NET_PKTGEN_MODULE) + if ((skb->dev->pkt_dev) && + (handle_pktgen_rcv(skb) >= 0)) { + /* Pktgen may consume the packet, no need to send + * to further protocols. + */ + goto out; + } +#endif + type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { @@ -3855,6 +3888,7 @@ EXPORT_SYMBOL(register_netdev); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + int n = 0; rebroadcast_time = warning_time = jiffies; while (atomic_read(&dev->refcnt) != 0) { @@ -3888,10 +3922,37 @@ static void netdev_wait_allrefs(struct net_device *dev) "count = %d\n", dev->name, atomic_read(&dev->refcnt)); warning_time = jiffies; + n++; } + if (n >= 3) + break; } } +#ifdef CONFIG_DEBUG_NETDEV +/* This is for debugging reference counting of devices */ +int netdev_debug __read_mostly; + +void __dev_hold(struct net_device *dev, const char *func) +{ + atomic_inc(&dev->refcnt); + if (unlikely(netdev_debug) && (strcmp(dev->name, "rddVR0") == 0)) + printk(KERN_DEBUG "%s: dev_hold %d %s\n", + dev->name, atomic_read(&dev->refcnt), func); +} +EXPORT_SYMBOL(__dev_hold); + +void __dev_put(struct net_device *dev, const char *func) +{ + BUG_ON(atomic_read(&dev->refcnt) == 0); + if (unlikely(netdev_debug) && (strcmp(dev->name, "rddVR0") == 0)) + printk(KERN_DEBUG "%s: dev_put %d %s\n", + dev->name, atomic_read(&dev->refcnt), func); + atomic_dec(&dev->refcnt); +} +EXPORT_SYMBOL(__dev_put); +#endif + /* The sequence is: * * rtnl_lock(); @@ -3942,7 +4003,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); + WARN_ON(atomic_read(&dev->refcnt)); BUG_TRAP(!dev->ip_ptr); BUG_TRAP(!dev->ip6_ptr); BUG_TRAP(!dev->dn_ptr); @@ -4569,6 +4630,10 @@ EXPORT_SYMBOL(net_enable_timestamp); EXPORT_SYMBOL(net_disable_timestamp); EXPORT_SYMBOL(dev_get_flags); +#if defined(CONFIG_NET_PKTGEN) || defined(CONFIG_NET_PKTGEN_MODULE) +EXPORT_SYMBOL(handle_pktgen_rcv); +#endif + #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) EXPORT_SYMBOL(br_handle_frame_hook); EXPORT_SYMBOL(br_fdb_get_hook); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 1163eb2..ab7cc33 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1,4 +1,4 @@ -/* +/* -*- linux-c -*- * net/core/ethtool.c - Ethtool ioctl handler * Copyright (c) 2003 Matthew Wilcox * @@ -35,6 +35,12 @@ u32 ethtool_op_get_tx_csum(struct net_device *dev) return (dev->features & NETIF_F_ALL_CSUM) != 0; } +u32 ethtool_op_get_rx_all(struct net_device *dev, u32* retval) +{ + *retval = ((dev->priv_flags & IFF_ACCEPT_ALL_FRAMES) != 0); + return 0; +} + int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) { if (data) @@ -777,6 +783,88 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr, return actor(dev, edata.data); } + +static int ethtool_get_rx_all(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GSG }; + int rv = 0; + + if (!dev->ethtool_ops->get_rx_all) + return -EOPNOTSUPP; + + if ((rv = dev->ethtool_ops->get_rx_all(dev, &edata.data)) < 0) { + return rv; + } + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + + +static int ethtool_set_rx_all(struct net_device *dev, void *useraddr) +{ + struct ethtool_value id; + + if (!dev->ethtool_ops->set_rx_all) + return 
-EOPNOTSUPP; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + return dev->ethtool_ops->set_rx_all(dev, id.data); +} + +static int ethtool_get_rx_fcs(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GSG }; + int rv = 0; + + if (!dev->ethtool_ops->get_save_fcs) + return -EOPNOTSUPP; + + if ((rv = dev->ethtool_ops->get_save_fcs(dev, &edata.data)) < 0) { + return rv; + } + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + + +static int ethtool_set_rx_fcs(struct net_device *dev, void *useraddr) +{ + struct ethtool_value id; + + if (!dev->ethtool_ops->set_save_fcs) + return -EOPNOTSUPP; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + return dev->ethtool_ops->set_save_fcs(dev, id.data); +} + + +/* Handle some generic ethtool commands here */ +static int ethtool_get_netdev_stats(struct net_device *dev, void *useraddr) { + + struct ethtool_ndstats* nds = (struct ethtool_ndstats*)(useraddr); + + struct net_device_stats *stats = dev->get_stats(dev); + if (stats) { + if (copy_to_user(nds->data, stats, sizeof(*stats))) { + return -EFAULT; + } + } + else { + return -EOPNOTSUPP; + } + return 0; +} + + /* The main entry point in this file. Called from net/core/dev.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -790,9 +878,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (!dev || !netif_device_present(dev)) return -ENODEV; - if (!dev->ethtool_ops) - return -EOPNOTSUPP; - if (copy_from_user(ðcmd, useraddr, sizeof (ethcmd))) return -EFAULT; @@ -819,12 +904,25 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) return -EPERM; } - if (dev->ethtool_ops->begin) + if (dev->ethtool_ops && dev->ethtool_ops->begin) if ((rc = dev->ethtool_ops->begin(dev)) < 0) return rc; old_features = dev->features; + /* Handle some generic operations that do not require specific + * ethtool handlers. + */ + switch (ethcmd) { + case ETHTOOL_GNDSTATS: + return ethtool_get_netdev_stats(dev, useraddr); + default: + break; + } + + if (!dev->ethtool_ops) + return -EOPNOTSUPP; + switch (ethcmd) { case ETHTOOL_GSET: rc = ethtool_get_settings(dev, useraddr); @@ -927,6 +1025,18 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_PHYS_ID: rc = ethtool_phys_id(dev, useraddr); break; + case ETHTOOL_SETRXALL: + rc = ethtool_set_rx_all(dev, useraddr); + break; + case ETHTOOL_GETRXALL: + rc = ethtool_get_rx_all(dev, useraddr); + break; + case ETHTOOL_SETRXFCS: + rc = ethtool_set_rx_fcs(dev, useraddr); + break; + case ETHTOOL_GETRXFCS: + rc = ethtool_get_rx_fcs(dev, useraddr); + break; case ETHTOOL_GSTATS: rc = ethtool_get_stats(dev, useraddr); break; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 19b8e00..ec1f048 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -765,6 +765,13 @@ static __inline__ int neigh_max_probes(struct neighbour *n) p->ucast_probes + p->app_probes + p->mcast_probes); } +static unsigned long neigh_rand_retry(struct neighbour* neigh) { + if (neigh->parms->retrans_rand_backoff) { + return net_random() % neigh->parms->retrans_rand_backoff; + } + return 0; +} + /* Called when a timer expires for a neighbour entry. 
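To make the jitter concrete, this is the schedule the timer changes below produce; a sketch re-using the patch's own fields (net_random() is the kernel's uniform PRNG):

/* With retrans_time equivalent to 1000ms and retrans_rand_backoff_ms set
 * to 500, successive probe timers fire 1000..1499ms apart, so hosts on a
 * large broadcast domain stop retrying ARP/ND in lockstep. */
static unsigned long next_probe_time(struct neighbour *neigh)
{
        unsigned long backoff = 0;

        if (neigh->parms->retrans_rand_backoff)
                backoff = net_random() % neigh->parms->retrans_rand_backoff;
        return jiffies + neigh->parms->retrans_time + backoff;
}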
*/ static void neigh_timer_handler(unsigned long arg) @@ -820,11 +827,11 @@ static void neigh_timer_handler(unsigned long arg) neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); - next = now + neigh->parms->retrans_time; + next = now + neigh->parms->retrans_time + neigh_rand_retry(neigh); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ - next = now + neigh->parms->retrans_time; + next = now + neigh->parms->retrans_time + neigh_rand_retry(neigh); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -2642,6 +2649,14 @@ static struct neigh_sysctl_table { .strategy = &sysctl_ms_jiffies, }, { + .ctl_name = NET_NEIGH_RETRANS_RAND_BACKOFF, + .procname = "retrans_rand_backoff_ms", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + { .ctl_name = NET_NEIGH_GC_INTERVAL, .procname = "gc_interval", .maxlen = sizeof(int), @@ -2712,18 +2727,19 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, t->neigh_vars[11].data = &p->locktime; t->neigh_vars[12].data = &p->retrans_time; t->neigh_vars[13].data = &p->base_reachable_time; + t->neigh_vars[14].data = &p->retrans_rand_backoff; if (dev) { dev_name_source = dev->name; neigh_path[NEIGH_CTL_PATH_DEV].ctl_name = dev->ifindex; /* Terminate the table early */ - memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); + memset(&t->neigh_vars[15], 0, sizeof(t->neigh_vars[14])); } else { dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; - t->neigh_vars[14].data = (int *)(p + 1); - t->neigh_vars[15].data = (int *)(p + 1) + 1; - t->neigh_vars[16].data = (int *)(p + 1) + 2; - t->neigh_vars[17].data = (int *)(p + 1) + 3; + t->neigh_vars[15].data = (int *)(p + 1); + t->neigh_vars[16].data = (int *)(p + 1) + 1; + t->neigh_vars[17].data = (int *)(p + 1) + 2; + t->neigh_vars[18].data = (int *)(p + 1) + 3; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7635d3f..f24b2ac 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -208,6 +208,20 @@ static ssize_t store_tx_queue_len(struct device *dev, return netdev_store(dev, attr, buf, len, change_tx_queue_len); } +NETDEVICE_SHOW(dflt_skb_mark, fmt_ulong); + +static int change_dflt_skb_mark(struct net_device *net, unsigned long new_val) +{ + net->dflt_skb_mark = new_val; + return 0; +} + +static ssize_t store_dflt_skb_mark(struct device *dev, struct device_attribute* attr, + const char* buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_dflt_skb_mark); +} + static struct device_attribute net_class_attributes[] = { __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), __ATTR(iflink, S_IRUGO, show_iflink, NULL), @@ -224,6 +238,8 @@ static struct device_attribute net_class_attributes[] = { __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, store_tx_queue_len), + __ATTR(dflt_skb_mark, S_IRUGO | S_IWUSR, show_dflt_skb_mark, + store_dflt_skb_mark), {} }; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 20e63b3..3ceb0f4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -6,7 +6,7 @@ * * Alexey Kuznetsov * Ben Greear - * Jens Låås + * Jens Låås * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -167,233 +167,110 @@ #include #include /* do_div */ #include - +#include /* sched_clock() */ +#include "pktgen.h" + +#define USE_NQW_CALLBACK +#ifdef USE_NQW_CALLBACK +# include +# if 
defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) +# include +# endif +#endif #define VERSION "pktgen v2.69: Packet Generator for packet performance testing.\n" -#define IP_NAME_SZ 32 -#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ -#define MPLS_STACK_BOTTOM htonl(0x00000100) - -/* Device flag bits */ -#define F_IPSRC_RND (1<<0) /* IP-Src Random */ -#define F_IPDST_RND (1<<1) /* IP-Dst Random */ -#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ -#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ -#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ -#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ -#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ -#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ -#define F_MPLS_RND (1<<8) /* Random MPLS labels */ -#define F_VID_RND (1<<9) /* Random VLAN ID */ -#define F_SVID_RND (1<<10) /* Random SVLAN ID */ -#define F_FLOW_SEQ (1<<11) /* Sequential flows */ -#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ -#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ - -/* Thread control flag bits */ -#define T_TERMINATE (1<<0) -#define T_STOP (1<<1) /* Stop run */ -#define T_RUN (1<<2) /* Start run */ -#define T_REMDEVALL (1<<3) /* Remove all devs */ -#define T_REMDEV (1<<4) /* Remove one dev */ - -/* If lock -- can be removed after some work */ -#define if_lock(t) spin_lock(&(t->if_lock)); -#define if_unlock(t) spin_unlock(&(t->if_lock)); - -/* Used to help with determining the pkts on receive */ -#define PKTGEN_MAGIC 0xbe9be955 -#define PG_PROC_DIR "pktgen" -#define PGCTRL "pgctrl" static struct proc_dir_entry *pg_proc_dir = NULL; -#define MAX_CFLOWS 65536 - -#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) -#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4) - -struct flow_state { - __be32 cur_daddr; - int count; -#ifdef CONFIG_XFRM - struct xfrm_state *x; -#endif - __u32 flags; -}; - -/* flow flag bits */ -#define F_INIT (1<<0) /* flow has been initialized */ - -struct pktgen_dev { - /* - * Try to keep frequent/infrequent used vars. separated. - */ - struct proc_dir_entry *entry; /* proc file */ - struct pktgen_thread *pg_thread;/* the owner */ - struct list_head list; /* Used for chaining in the thread's run-queue */ - - int running; /* if this changes to false, the test will stop */ - - /* If min != max, then we will either do a linear iteration, or - * we will do a random selection from within the range. - */ - __u32 flags; - int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ - - int min_pkt_size; /* = ETH_ZLEN; */ - int max_pkt_size; /* = ETH_ZLEN; */ - int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ - int nfrags; - __u32 delay_us; /* Default delay */ - __u32 delay_ns; - __u64 count; /* Default No packets to send */ - __u64 sofar; /* How many pkts we've sent so far */ - __u64 tx_bytes; /* How many bytes we've transmitted */ - __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */ - - /* runtime counters relating to clone_skb */ - __u64 next_tx_us; /* timestamp of when to tx next */ - __u32 next_tx_ns; - - __u64 allocated_skbs; - __u32 clone_count; - int last_ok; /* Was last skb sent? - * Or a failed transmit of some sort? This will keep - * sequence numbers in order, for example. - */ - __u64 started_at; /* micro-seconds */ - __u64 stopped_at; /* micro-seconds */ - __u64 idle_acc; /* micro-seconds */ - __u32 seq_num; - - int clone_skb; /* Use multiple SKBs during packet gen. 
If this number - * is greater than 1, then that many copies of the same - * packet will be sent before a new packet is allocated. - * For instance, if you want to send 1024 identical packets - * before creating a new packet, set clone_skb to 1024. - */ - - char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ - char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ - char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ - char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ - - struct in6_addr in6_saddr; - struct in6_addr in6_daddr; - struct in6_addr cur_in6_daddr; - struct in6_addr cur_in6_saddr; - /* For ranges */ - struct in6_addr min_in6_daddr; - struct in6_addr max_in6_daddr; - struct in6_addr min_in6_saddr; - struct in6_addr max_in6_saddr; - - /* If we're doing ranges, random or incremental, then this - * defines the min/max for those ranges. - */ - __be32 saddr_min; /* inclusive, source IP address */ - __be32 saddr_max; /* exclusive, source IP address */ - __be32 daddr_min; /* inclusive, dest IP address */ - __be32 daddr_max; /* exclusive, dest IP address */ - - __u16 udp_src_min; /* inclusive, source UDP port */ - __u16 udp_src_max; /* exclusive, source UDP port */ - __u16 udp_dst_min; /* inclusive, dest UDP port */ - __u16 udp_dst_max; /* exclusive, dest UDP port */ - - /* DSCP + ECN */ - __u8 tos; /* six most significant bits of (former) IPv4 TOS are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 4) */ - - /* MPLS */ - unsigned nr_labels; /* Depth of stack, 0 = no MPLS */ - __be32 labels[MAX_MPLS_LABELS]; - - /* VLAN/SVLAN (802.1Q/Q-in-Q) */ - __u8 vlan_p; - __u8 vlan_cfi; - __u16 vlan_id; /* 0xffff means no vlan tag */ - - __u8 svlan_p; - __u8 svlan_cfi; - __u16 svlan_id; /* 0xffff means no svlan tag */ - - __u32 src_mac_count; /* How many MACs to iterate through */ - __u32 dst_mac_count; /* How many MACs to iterate through */ - - unsigned char dst_mac[ETH_ALEN]; - unsigned char src_mac[ETH_ALEN]; - - __u32 cur_dst_mac_offset; - __u32 cur_src_mac_offset; - __be32 cur_saddr; - __be32 cur_daddr; - __u16 cur_udp_dst; - __u16 cur_udp_src; - __u16 cur_queue_map; - __u32 cur_pkt_size; - - __u8 hh[14]; - /* = { - 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, - - We fill in SRC address later - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x08, 0x00 - }; - */ - __u16 pad; /* pad out the hh struct to an even 16 bytes */ +#define REMOVE 1 +#define FIND 0 - struct sk_buff *skb; /* skb we are to transmit next, mainly used for when we - * are transmitting the same one multiple times - */ - struct net_device *odev; /* The out-going device. Note that the device should - * have it's pg_info pointer pointing back to this - * device. This will be set when the user specifies - * the out-going device name (not when the inject is - * started as it used to do.) 
- */ - struct flow_state *flows; - unsigned cflows; /* Concurrent flows (config) */ - unsigned lflow; /* Flow length (config) */ - unsigned nflows; /* accumulated flows (stats) */ - unsigned curfl; /* current sequenced flow (state)*/ - - u16 queue_map_min; - u16 queue_map_max; +static char* version = VERSION; -#ifdef CONFIG_XFRM - __u8 ipsmode; /* IPSEC mode (config) */ - __u8 ipsproto; /* IPSEC type (config) */ -#endif - char result[512]; +static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove); +static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); +static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char *ifname); +static int pktgen_device_event(struct notifier_block *, unsigned long, void *); +static void pktgen_run_all_threads(int background); +static void pktgen_stop_all_threads_ifs(void); +static int pktgen_stop_device(struct pktgen_dev *pkt_dev); +static void pktgen_stop(struct pktgen_thread *t); +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev, int seq_too); +static void pktgen_mark_device(const char *ifname); +static unsigned int scan_ip6(const char *s, char ip[16]); +static unsigned int fmt_ip6(char *s, const char ip[16]); +static void clear_nqw_hook(struct pktgen_thread* t, struct net_device* dev); +static int set_nqw_hook(struct pktgen_thread* t, struct net_device* dev, int gfp); + +/* Module parameters, defaults. */ +static int pg_count_d = 1000; /* 1000 pkts by default */ +static int pg_delay_d = 0x7FFFFFFF; /* Don't run until someone sets a different delay. */ + +static int pg_clone_skb_d; +static int debug; + +static DEFINE_MUTEX(pktgen_thread_lock); +static LIST_HEAD(pktgen_threads); + +static struct notifier_block pktgen_notifier_block = { + .notifier_call = pktgen_device_event, }; + +/* This code works around the fact that do_div cannot handle two 64-bit + numbers, and regular 64-bit division doesn't work on x86 kernels. + --Ben +*/ -struct pktgen_hdr { - __be32 pgh_magic; - __be32 seq_num; - __be32 tv_sec; - __be32 tv_usec; -}; +#define PG_DIV 0 -struct pktgen_thread { - spinlock_t if_lock; - struct list_head if_list; /* All device here */ - struct list_head th_list; - struct task_struct *tsk; - char result[512]; +/* This was emailed to LMKL by: Chris Caputo + * Function copied/adapted/optimized from: + * + * nemesis.sourceforge.net/browse/lib/static/intmath/ix86/intmath.c.html + * + * Copyright 1994, University of Cambridge Computer Laboratory + * All Rights Reserved. + * + */ +static inline s64 divremdi3(s64 x, s64 y, int type) +{ + u64 a = (x < 0) ? -x : x; + u64 b = (y < 0) ? -y : y; + u64 res = 0, d = 1; + + if (b > 0) { + while (b < a) { + b <<= 1; + d <<= 1; + } + } - /* Field for thread to receive "posted" events terminate, stop ifs etc. */ + do { + if (a >= b) { + a -= b; + res += d; + } + b >>= 1; + d >>= 1; + } + while (d); - u32 control; - int cpu; + if (PG_DIV == type) { + return (((x ^ y) & (1ll << 63)) == 0) ? res : -(s64) res; + } else { + return ((x & (1ll << 63)) == 0) ? 
a : -(s64) a; + } +} - wait_queue_head_t queue; -}; +/* End of hacks to deal with 64-bit math on x86 */ -#define REMOVE 1 -#define FIND 0 +/** Convert to milliseconds */ +static inline __u64 tv_to_ms(const struct timeval *tv) +{ + __u64 ms = tv->tv_usec / 1000; + ms += (__u64) tv->tv_sec * (__u64) 1000; + return ms; +} /** Convert to micro-seconds */ static inline __u64 tv_to_us(const struct timeval *tv) @@ -403,43 +280,65 @@ static inline __u64 tv_to_us(const struct timeval *tv) return us; } -static __u64 getCurUs(void) +/** Convert to micro-seconds */ +static inline __u64 ts_to_us(const struct timespec *ts) { - struct timeval tv; - do_gettimeofday(&tv); - return tv_to_us(&tv); + __u64 us = ts->tv_nsec / NSEC_PER_USEC; + us += ((__u64) ts->tv_sec) * 1000000ULL; + return us; } -/* old include end */ - -static char version[] __initdata = VERSION; +#if 0 +static inline __u64 pg_div(__u64 n, __u32 base) +{ + __u64 tmp = n; + do_div(tmp, base); + /* printk("pktgen: pg_div, n: %llu base: %d rv: %llu\n", + n, base, tmp); */ + return tmp; +} -static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); -static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); -static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, - const char *ifname); -static int pktgen_device_event(struct notifier_block *, unsigned long, void *); -static void pktgen_run_all_threads(void); -static void pktgen_stop_all_threads_ifs(void); -static int pktgen_stop_device(struct pktgen_dev *pkt_dev); -static void pktgen_stop(struct pktgen_thread *t); -static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); +static inline __u64 pg_div64(__u64 n, __u64 base) +{ + __u64 tmp = n; +/* + * How do we know if the architecture we are running on + * supports division with 64 bit base? + * + */ +#if defined(__sparc_v9__) || defined(__powerpc64__) || defined(__alpha__) || defined(__x86_64__) || defined(__ia64__) -static unsigned int scan_ip6(const char *s, char ip[16]); -static unsigned int fmt_ip6(char *s, const char ip[16]); + do_div(tmp, base); +#else + tmp = divremdi3(n, base, PG_DIV); +#endif + return tmp; +} +#endif -/* Module parameters, defaults. */ -static int pg_count_d = 1000; /* 1000 pkts by default */ -static int pg_delay_d; -static int pg_clone_skb_d; -static int debug; +static inline __u64 getCurUs(void) +{ + struct timespec ts; + getnstimeofday(&ts); + return ts_to_us(&ts); +} -static DEFINE_MUTEX(pktgen_thread_lock); -static LIST_HEAD(pktgen_threads); +static inline __u64 tv_diff(const struct timeval *a, const struct timeval *b) +{ + return tv_to_us(a) - tv_to_us(b); +} + +/* old include end */ +static __u64 getRelativeCurNs(void) { + /* Seems you must disable pre-empt to call sched_clock. --Ben */ + unsigned long flags; + __u64 rv; + local_irq_save(flags); + rv = sched_clock(); + local_irq_restore(flags); + return rv; +} -static struct notifier_block pktgen_notifier_block = { - .notifier_call = pktgen_device_event, -}; /* * /proc handling functions @@ -448,7 +347,7 @@ static struct notifier_block pktgen_notifier_block = { static int pgctrl_show(struct seq_file *seq, void *v) { - seq_puts(seq, VERSION); + seq_puts(seq, version); return 0; } @@ -476,8 +375,10 @@ static ssize_t pgctrl_write(struct file *file, const char __user * buf, pktgen_stop_all_threads_ifs(); else if (!strcmp(data, "start")) - pktgen_run_all_threads(); - + pktgen_run_all_threads(0); + /* Run in the background. 
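For completeness, a user-space sketch of exercising the control file, including the non-blocking command handled just below (the path is pktgen's standard proc location):

#include <stdio.h>

/* Write "start", "bg_start" or "stop" to pktgen's control file.
 * "bg_start" returns immediately instead of blocking the writer. */
static int pgctrl_cmd(const char *cmd)
{
        FILE *f = fopen("/proc/net/pktgen/pgctrl", "w");

        if (!f)
                return -1;
        fputs(cmd, f);
        return fclose(f);
}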
*/ + else if (!strcmp(data, "bg_start")) + pktgen_run_all_threads(1); else printk(KERN_WARNING "pktgen: Unknown command: %s\n", data); @@ -492,6 +393,137 @@ static int pgctrl_open(struct inode *inode, struct file *file) return single_open(file, pgctrl_show, PDE(inode)->data); } +static int pg_populate_report(struct pktgen_dev_report* rpt, struct pktgen_dev* pkt_dev) { + int i; + + memset(rpt, 0, sizeof(*rpt)); + rpt->api_version = 1; + rpt->flags = pkt_dev->flags; + strncpy(rpt->thread_name, pkt_dev->pg_thread->tsk->comm, 32); + strncpy(rpt->interface_name, pkt_dev->ifname, 32); + rpt->min_pkt_size = pkt_dev->min_pkt_size; + rpt->max_pkt_size = pkt_dev->max_pkt_size; + rpt->clone_skb = pkt_dev->clone_skb; + rpt->peer_clone_skb = pkt_dev->peer_clone_skb; + rpt->nfrags = pkt_dev->nfrags; + + strncpy(rpt->dst_min, pkt_dev->dst_min, IP_NAME_SZ); + strncpy(rpt->dst_max, pkt_dev->dst_max, IP_NAME_SZ); + strncpy(rpt->src_min, pkt_dev->src_min, IP_NAME_SZ); + strncpy(rpt->src_max, pkt_dev->src_max, IP_NAME_SZ); + + memcpy(&rpt->in6_saddr, &pkt_dev->in6_saddr, sizeof(struct in6_addr)); + memcpy(&rpt->in6_daddr, &pkt_dev->in6_daddr, sizeof(struct in6_addr)); + + /* For ranges */ + memcpy(&rpt->min_in6_daddr, &pkt_dev->min_in6_daddr, sizeof(struct in6_addr)); + memcpy(&rpt->max_in6_daddr, &pkt_dev->max_in6_daddr, sizeof(struct in6_addr)); + memcpy(&rpt->min_in6_saddr, &pkt_dev->min_in6_saddr, sizeof(struct in6_addr)); + memcpy(&rpt->max_in6_saddr, &pkt_dev->max_in6_saddr, sizeof(struct in6_addr)); + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. + */ + rpt->saddr_min = pkt_dev->saddr_min; + rpt->saddr_max = pkt_dev->saddr_max; + rpt->daddr_min = pkt_dev->daddr_min; + rpt->daddr_max = pkt_dev->daddr_max; + + rpt->udp_src_min = pkt_dev->udp_src_min; + rpt->udp_src_max = pkt_dev->udp_src_max; + rpt->udp_dst_min = pkt_dev->udp_dst_min; + rpt->udp_dst_max = pkt_dev->udp_dst_max; + + /* MPLS */ + rpt->nr_labels = pkt_dev->nr_labels; /* Depth of stack, 0 = no MPLS */ + for (i = 0; i < MAX_MPLS_LABELS; i++) { + rpt->labels[i] = pkt_dev->labels[i]; + } + + rpt->src_mac_count = pkt_dev->src_mac_count; + rpt->dst_mac_count = pkt_dev->dst_mac_count; + + memcpy(&rpt->dst_mac, &pkt_dev->dst_mac, ETH_ALEN); + memcpy(&rpt->src_mac, &pkt_dev->src_mac, ETH_ALEN); + + rpt->nflows = pkt_dev->nflows; + rpt->cflows = pkt_dev->cflows; + rpt->lflow = pkt_dev->lflow; + + rpt->delay_ns = pkt_dev->delay_ns; + rpt->count = pkt_dev->count; /* Default No packets to send */ + rpt->sofar = pkt_dev->sofar; /* How many pkts we've sent so far */ + rpt->tx_bytes = pkt_dev->tx_bytes; /* How many bytes we've transmitted */ + rpt->errors = pkt_dev->errors; /* Errors when trying to transmit, pkts will be re-sent */ + + /* Fields relating to receiving pkts */ + rpt->avg_latency = pkt_dev->avg_latency; /* in micro-seconds */ + rpt->min_latency = pkt_dev->min_latency; + rpt->max_latency = pkt_dev->max_latency; + for (i = 0; i < LAT_BUCKETS_MAX; i++) { + rpt->latency_bkts[i] = pkt_dev->latency_bkts[i]; + } + rpt->pkts_rcvd_since_clear = pkt_dev->pkts_rcvd_since_clear; + + rpt->ooo_rcvd = pkt_dev->ooo_rcvd; + rpt->pkts_rcvd = pkt_dev->pkts_rcvd; + rpt->dup_rcvd = pkt_dev->dup_rcvd; + rpt->bytes_rcvd = pkt_dev->bytes_rcvd; + rpt->seq_gap_rcvd = pkt_dev->seq_gap_rcvd; + rpt->non_pg_pkts_rcvd = pkt_dev->non_pg_pkts_rcvd; + return 0; +}; /* populate report */ + + +int pktgen_proc_ioctl(struct inode* inode, struct file* file, unsigned int cmd, + unsigned long arg) { + int err = 0; + struct pktgen_ioctl_info args; + struct pktgen_dev* pkt_dev = NULL; + + if
(copy_from_user(&args, (void*)arg, sizeof(args))) { + return -EFAULT; + } + + /* Null terminate the names */ + args.thread_name[31] = 0; + args.interface_name[31] = 0; + + /* printk("pktgen: thread_name: %s interface_name: %s\n", + * args.thread_name, args.interface_name); + */ + + switch (cmd) { + case GET_PKTGEN_INTERFACE_INFO: { + mutex_lock(&pktgen_thread_lock); + pkt_dev = __pktgen_NN_threads(args.interface_name, FIND); + if (pkt_dev) { + pg_populate_report(&(args.report), pkt_dev); + if (copy_to_user((void*)(arg), &args, sizeof(args))) { + printk("ERROR: pktgen: copy_to_user failed.\n"); + err = -EFAULT; + } + else { + err = 0; + } + } + else { + printk("ERROR: pktgen: Could not find interface -:%s:-\n", + args.interface_name); + err = -ENODEV; + } + mutex_unlock(&pktgen_thread_lock); + break; + } + default: + printk("%s: Unknown pktgen IOCTL: %x \n", __FUNCTION__, + cmd); + return -EINVAL; + } + + return err; +}/* pktgen_proc_ioctl */ + static const struct file_operations pktgen_fops = { .owner = THIS_MODULE, .open = pgctrl_open, @@ -499,6 +531,7 @@ static const struct file_operations pktgen_fops = { .llseek = seq_lseek, .write = pgctrl_write, .release = single_release, + .ioctl = pktgen_proc_ioctl, }; static int pktgen_if_show(struct seq_file *seq, void *v) @@ -514,11 +547,12 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); - seq_printf(seq, - " frags: %d delay: %u clone_skb: %d ifname: %s\n", - pkt_dev->nfrags, - 1000 * pkt_dev->delay_us + pkt_dev->delay_ns, - pkt_dev->clone_skb, pkt_dev->odev->name); + seq_printf(seq, + " frags: %d delay: %lluns clone_skb: %d peer_clone_skb: %d ifname: %s\n", + pkt_dev->nfrags, + (unsigned long long)pkt_dev->delay_ns, + pkt_dev->clone_skb, pkt_dev->peer_clone_skb, + pkt_dev->ifname); seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); @@ -528,6 +562,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->queue_map_min, pkt_dev->queue_map_max); + if (pkt_dev->flags & F_IPV6) { char b1[128], b2[128], b3[128]; fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr); @@ -652,11 +687,32 @@ static int pktgen_if_show(struct seq_file *seq, void *v) stopped = now; /* not really stopped, more like last-running-at */ seq_printf(seq, - "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n", + "Current:\n tx-pkts: %llu tx-errors: %llu tx-bytes: %llu\n", (unsigned long long)pkt_dev->sofar, - (unsigned long long)pkt_dev->errors, (unsigned long long)sa, + (unsigned long long)pkt_dev->errors, + (unsigned long long)pkt_dev->tx_bytes); + seq_printf(seq, + " rx-pkts: %llu rx-bytes: %llu alloc_skbs: %llu oom_alloc_skbs: %llu\n", + (unsigned long long)pkt_dev->pkts_rcvd, + (unsigned long long)pkt_dev->bytes_rcvd, + (unsigned long long)pkt_dev->allocated_skbs, + (unsigned long long)pkt_dev->oom_on_alloc_skb); + + + seq_printf(seq, + " blocked: %s next-tx-ns: %llu (%lli) started: %lluus stopped: %lluus idle: %lluns\n", + pkt_dev->tx_blocked ? 
"TRUE" : "false", + (unsigned long long)pkt_dev->next_tx_ns, + (long long)(pkt_dev->next_tx_ns - getRelativeCurNs()), + (unsigned long long)sa, (unsigned long long)stopped, - (unsigned long long)pkt_dev->idle_acc); + (unsigned long long)pkt_dev->idle_acc_ns); + seq_printf(seq, + " nanodelays: %llu sleeps: %llu queue_stopped: %llu tx-early: %llu\n", + (unsigned long long)pkt_dev->nanodelays, + (unsigned long long)pkt_dev->sleeps, + (unsigned long long)pkt_dev->queue_stopped, + (unsigned long long)pkt_dev->req_tx_early); seq_printf(seq, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", @@ -676,7 +732,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map); - + seq_printf(seq, " flows: %u\n", pkt_dev->nflows); if (pkt_dev->result[0]) @@ -942,15 +998,11 @@ static ssize_t pktgen_if_write(struct file *file, return len; } i += len; - if (value == 0x7FFFFFFF) { - pkt_dev->delay_us = 0x7FFFFFFF; - pkt_dev->delay_ns = 0; - } else { - pkt_dev->delay_us = value / 1000; - pkt_dev->delay_ns = value % 1000; + pkt_dev->delay_ns = value; + if ((getRelativeCurNs() + pkt_dev->delay_ns) > pkt_dev->next_tx_ns) { + pkt_dev->next_tx_ns = getRelativeCurNs() + pkt_dev->delay_ns; } - sprintf(pg_result, "OK: delay=%u", - 1000 * pkt_dev->delay_us + pkt_dev->delay_ns); + sprintf(pg_result, "OK: delay=%lluns", (unsigned long long)pkt_dev->delay_ns); return count; } if (!strcmp(name, "udp_src_min")) { @@ -1016,6 +1068,17 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); return count; } + if (!strcmp(name, "peer_clone_skb")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { + return len; + } + i += len; + pkt_dev->peer_clone_skb = value; + + sprintf(pg_result, "OK: peer_clone_skb=%d", pkt_dev->peer_clone_skb); + return count; + } if (!strcmp(name, "count")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) { @@ -1127,12 +1190,13 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "FLOW_SEQ") == 0) pkt_dev->flags |= F_FLOW_SEQ; - + else if (strcmp(f, "QUEUE_MAP_RND") == 0) pkt_dev->flags |= F_QUEUE_MAP_RND; else if (strcmp(f, "!QUEUE_MAP_RND") == 0) pkt_dev->flags &= ~F_QUEUE_MAP_RND; + #ifdef CONFIG_XFRM else if (strcmp(f, "IPSEC") == 0) pkt_dev->flags |= F_IPSEC_ON; @@ -1142,6 +1206,7 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->flags &= ~F_IPV6; else { + printk("pktgen: Flag -:%s:- unknown\n", f); sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", f, @@ -1408,13 +1473,12 @@ static ssize_t pktgen_if_write(struct file *file, /* Set up Src MAC */ if (compare_ether_addr(old_smac, pkt_dev->src_mac)) memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN); - sprintf(pg_result, "OK: srcmac"); return count; } if (!strcmp(name, "clear_counters")) { - pktgen_clear_counters(pkt_dev); + pktgen_clear_counters(pkt_dev, 0); sprintf(pg_result, "OK: Clearing counters.\n"); return count; } @@ -1468,10 +1532,10 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "mpls")) { unsigned n, cnt; - len = get_labels(&user_buffer[i], pkt_dev); if (len < 0) return len; + i += len; cnt = sprintf(pg_result, "OK: mpls="); for (n = 0; n < pkt_dev->nr_labels; n++) @@ -1635,6 +1699,7 @@ static ssize_t pktgen_if_write(struct file *file, return count; } + printk("pktgen: No such parameter \"%s\"\n", name); sprintf(pkt_dev->result, "No such parameter \"%s\"", name); return -EINVAL; } @@ -1651,6 +1716,7 @@ static const struct file_operations pktgen_if_fops = { .llseek = seq_lseek, .write = pktgen_if_write, .release = single_release, + .ioctl = pktgen_proc_ioctl, }; static int pktgen_thread_show(struct seq_file *seq, void *v) @@ -1660,9 +1726,15 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) BUG_ON(!t); + mutex_lock(&pktgen_thread_lock); + /* versioning info. CFG_RT means we do not busy-spin, so can be configured for + * real-time scheduling if user-space so desires. */ + seq_printf(seq, "VERSION-2 CFG_RT\n"); + seq_printf(seq, "PID: %d Name: %s\n", + t->pid, t->tsk->comm); + seq_printf(seq, "Running: "); - if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) if (pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odev->name); @@ -1678,8 +1750,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) else seq_printf(seq, "\nResult: NA\n"); - if_unlock(t); - + mutex_unlock(&pktgen_thread_lock); return 0; } @@ -1747,29 +1818,55 @@ static ssize_t pktgen_thread_write(struct file *file, return -EFAULT; i += len; mutex_lock(&pktgen_thread_lock); - pktgen_add_device(t, f); + t->control_arg = f; + t->control |= T_ADD_DEV; + while (t->control & T_ADD_DEV) { + schedule_timeout_interruptible(msecs_to_jiffies(10)); + } + t->control_arg = 0; mutex_unlock(&pktgen_thread_lock); ret = count; sprintf(pg_result, "OK: add_device=%s", f); goto out; } + if (!strcmp(name, "rem_device")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) { + ret = len; + goto out; + } + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + pktgen_mark_device(f); + ret = count; + sprintf(pg_result, "OK: rem_device=%s", f); + goto out; + } + if (!strcmp(name, "rem_device_all")) { mutex_lock(&pktgen_thread_lock); t->control |= T_REMDEVALL; mutex_unlock(&pktgen_thread_lock); - schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */ + while (t->control & T_REMDEVALL) { + schedule_timeout_interruptible(msecs_to_jiffies(10)); + } ret = count; sprintf(pg_result, "OK: rem_device_all"); goto out; } if (!strcmp(name, "max_before_softirq")) { - sprintf(pg_result, "OK: Note! 
max_before_softirq is obsoleted -- Do not use"); - ret = count; - goto out; - } + ret = count; + sprintf(pg_result, "ERROR: max_before_softirq no longer supported"); + goto out; + } + printk("pktgen: un-known command to pktgen_thread: -:%s:-\n", name); + ret = -EINVAL; out: return ret; @@ -1787,8 +1884,10 @@ static const struct file_operations pktgen_thread_fops = { .llseek = seq_lseek, .write = pktgen_thread_write, .release = single_release, + .ioctl = pktgen_proc_ioctl, }; + /* Think find or remove for NN */ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) { @@ -1799,10 +1898,8 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) pkt_dev = pktgen_find_dev(t, ifname); if (pkt_dev) { if (remove) { - if_lock(t); pkt_dev->removal_mark = 1; t->control |= T_REMDEV; - if_unlock(t); } break; } @@ -1896,31 +1993,45 @@ static int pktgen_device_event(struct notifier_block *unused, /* Associate pktgen_dev with a device. */ -static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) +static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, struct pktgen_thread* t) { struct net_device *odev; int err; /* Clean old setups */ if (pkt_dev->odev) { +#ifdef USE_NQW_CALLBACK + /* Set the nqw callback hooks */ + rtnl_lock(); + clear_nqw_hook(t, pkt_dev->odev); + rtnl_unlock(); +#endif + pkt_dev->odev->pkt_dev = NULL; dev_put(pkt_dev->odev); pkt_dev->odev = NULL; } - odev = dev_get_by_name(&init_net, ifname); + odev = dev_get_by_name(&init_net, pkt_dev->ifname); if (!odev) { - printk(KERN_ERR "pktgen: no such netdevice: \"%s\"\n", ifname); + printk(KERN_ERR "pktgen: no such netdevice: \"%s\"\n", pkt_dev->ifname); return -ENODEV; } if (odev->type != ARPHRD_ETHER) { - printk(KERN_ERR "pktgen: not an ethernet device: \"%s\"\n", ifname); + printk(KERN_ERR "pktgen: not an ethernet device: \"%s\"\n", pkt_dev->ifname); err = -EINVAL; } else if (!netif_running(odev)) { - printk(KERN_ERR "pktgen: device is down: \"%s\"\n", ifname); + printk(KERN_ERR "pktgen: device is down: \"%s\"\n", pkt_dev->ifname); err = -ENETDOWN; } else { pkt_dev->odev = odev; +#ifdef USE_NQW_CALLBACK + /* Set the nqw callback hooks */ + rtnl_lock(); + set_nqw_hook(t, pkt_dev->odev, GFP_ATOMIC); + rtnl_unlock(); +#endif + pkt_dev->odev->pkt_dev = pkt_dev; return 0; } @@ -1933,6 +2044,10 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) */ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) { + /* Try once more, just in case it works now. 
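A brief note on why the retry below is useful, inferred from the failure paths in pktgen_setup_dev() above:

/* pktgen_setup_dev() fails at add_device time if the interface does not
 * exist yet or is still down; retrying at start lets a pre-configured
 * pkt_dev bind to its device once it has come up. */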
*/ + if (!pkt_dev->odev) + pktgen_setup_dev(pkt_dev, pkt_dev->pg_thread); + if (!pkt_dev->odev) { printk(KERN_ERR "pktgen: ERROR: pkt_dev->odev == NULL in " "setup_inject.\n"); @@ -1945,6 +2060,9 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) if (is_zero_ether_addr(pkt_dev->src_mac)) memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN); + else + memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN); + /* Set up Dest MAC */ memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); @@ -2036,28 +2154,192 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) pkt_dev->nflows = 0; } -static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) -{ - __u64 start; - __u64 now; - - start = now = getCurUs(); - while (now < spin_until_us) { - /* TODO: optimize sleeping behavior */ - if (spin_until_us - now > jiffies_to_usecs(1) + 1) - schedule_timeout_interruptible(1); - else if (spin_until_us - now > 100) { - if (!pkt_dev->running) - return; - if (need_resched()) - schedule(); + +#ifdef USE_NQW_CALLBACK +/* Runs from interrupt */ +int pg_notify_queue_woken(struct net_device* dev) { + /* Find the thread that needs waking. */ + struct pktgen_thread* t = ((struct pg_nqw_data*)(dev->nqw_data))->pg_thread; + t->control |= T_WAKE_BLOCKED; + wake_up_interruptible(&(t->queue)); + return 0; +} + +/* Must hold RTNL lock while calling this. */ +static int set_nqw_hook(struct pktgen_thread* t, struct net_device* dev, int gfp) { + /* The notify-queue-woken magic only works for physical + * devices at this time. So, apply hook to underlying + * device. + */ + struct pg_nqw_data* nqwd; + ASSERT_RTNL(); + BUG_ON(!t); + + if (!dev) { + WARN_ON(!dev); + return -ENODEV; + } +#if 0 +#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) + if (dev->macvlan_port) { + struct macvlan_dev *vlan = netdev_priv(dev); + printk("pktgen: setting nqw_hook on lower mac-vlan dev: %p\n", vlan->lowerdev); + return set_nqw_hook(t, vlan->lowerdev, gfp); + } +#endif +#endif + + if (dev->priv_flags & IFF_802_1Q_VLAN) { + printk("pktgen: setting nqw_hook on real-dev of .1q vlan: %s\n", dev->name); + return set_nqw_hook(t, vlan_dev_info(dev)->real_dev, gfp); + } + + nqwd = (struct pg_nqw_data*)(dev->nqw_data); + + if (nqwd) { + if (nqwd->magic == PG_NQW_MAGIC) { + if (nqwd->pg_thread == t) { + atomic_inc(&(nqwd->nqw_ref_count)); + + printk("pktgen: Incremented nqw_ref_count: %d device: %s\n", + (int)(atomic_read(&(nqwd->nqw_ref_count))), dev->name); + return 0; + } + else { + printk("pktgen: ERROR: set_nqw_hook: nqwd thread does not match, dev: %s", + dev->name); + return -EINVAL; + } + } + else { + printk("wanlink: WARNING: set_nqw_hook: nqwd magic is NOT WanLink, dev: %s magic: 0x%x", + dev->name, nqwd->magic); + return 0; + } + } + else { + nqwd = kmalloc(sizeof(*nqwd), gfp); + if (nqwd) { + memset(nqwd, 0, sizeof(*nqwd)); + nqwd->magic = PG_NQW_MAGIC; + atomic_inc(&(nqwd->nqw_ref_count)); + nqwd->pg_thread = t; + dev->nqw_data = nqwd; + dev->notify_queue_woken = pg_notify_queue_woken; + printk("pktgen: Added nqw callback to device: %s\n", + dev->name); + return 0; + } + else { + printk("pktgen: ERROR: could not allocate nqwd for dev: %s\n", dev->name); + return -ENOBUFS; } + } +}//set_nqw_hook - now = getCurUs(); + +/* Must hold RTNL lock while calling this. */ +static void clear_nqw_hook(struct pktgen_thread* t, struct net_device* dev) { + /* The notify-queue-woken magic only works for physical + * devices at this time. So, apply hook to underlying + * device. 
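Nothing in this hunk shows the producer side of notify_queue_woken; the following is an assumed sketch (example_tx_clean_done is hypothetical) of how a driver's tx-completion path would trigger pg_notify_queue_woken() above:

/* Assumed driver-side usage, not part of this patch: after reclaiming
 * tx descriptors, restart the queue and poke the registered hook so a
 * blocked pktgen thread sees T_WAKE_BLOCKED and wakes up promptly. */
static void example_tx_clean_done(struct net_device *dev)
{
        if (netif_queue_stopped(dev)) {
                netif_wake_queue(dev);
                if (dev->notify_queue_woken)
                        dev->notify_queue_woken(dev);
        }
}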
+ */ + ASSERT_RTNL(); + BUG_ON(!t); + +#if 0 +#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) + if (dev->macvlan_port) { + struct macvlan_vlan *vlan = dev->priv; + clear_nqw_hook(t, vlan->lowerdev); + return; + } +#endif +#endif + + if (dev->priv_flags & IFF_802_1Q_VLAN) { + clear_nqw_hook(t, vlan_dev_info(dev)->real_dev); + return; } - pkt_dev->idle_acc += now - start; -} + if (dev->nqw_data) { + struct pg_nqw_data* nqwd = (struct pg_nqw_data*)(dev->nqw_data); + if (nqwd->magic == PG_NQW_MAGIC) { + if (t != nqwd->pg_thread) { + printk("pktgen ERROR: t != nqwd->pg_thread\n"); + } + atomic_dec(&(nqwd->nqw_ref_count)); + + printk("pktgen: Decremented nqw_ref_count: %d device: %s\n", + (int)(atomic_read(&(nqwd->nqw_ref_count))), + dev->name); + + BUG_ON(atomic_read(&(nqwd->nqw_ref_count)) < 0); + + if (atomic_read(&(nqwd->nqw_ref_count)) == 0) { + printk("pktgen: Removing nqw reference from device: %s\n", + dev->name); + dev->notify_queue_woken = NULL; + dev->nqw_data = NULL; + kfree(nqwd); + } + } + else { + printk("pktgen: WARNING: clear_nqw_hook: nqwd magic is NOT PKT-GEN, dev: %s magic: 0x%x", + dev->name, nqwd->magic); + } + } + else { + printk("pktgen: Warning: nqw_data is null in clear_nqw_hook, dev: %s\n", + dev->name); + } +}//clear_nqw_hook + +#endif + + +/* delay_ns is in nano-seconds */ +static void pg_nanodelay(u64 delay_ns, struct pktgen_dev* info) { + u64 idle_start = getRelativeCurNs(); + u64 last_time; + u64 _diff; + u64 itmp = idle_start; + struct pktgen_dev *p = NULL; + struct pktgen_thread* t = info->pg_thread; + + info->nanodelays++; + info->accum_delay_ns += delay_ns; + while (info->accum_delay_ns > PG_MAX_ACCUM_DELAY_NS) { + info->sleeps++; + interruptible_sleep_on_timeout(&(t->queue), 1); + /* will wake after one tick */ + last_time = itmp; + + /* Subtract delay from all interfaces for this thread, since all are blocked when + * any are blocked. + */ + itmp = getRelativeCurNs(); + _diff = (itmp - last_time); + list_for_each_entry(p, &t->if_list, list) { + p->accum_delay_ns -= _diff; + /* Limit saving up too much time... */ + if (p->accum_delay_ns < -10000000) { + p->accum_delay_ns = -10000000; + } + } + + /* For accounting, only charge this guy for the idle though...*/ + info->idle_acc_ns += _diff; + + /* break out if we are stopped or if we should transmit (maybe our ipg changed?) 
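A worked example of the batching this loop implements (PG_MAX_ACCUM_DELAY_NS itself is defined in the new pktgen.h, which is not part of this hunk):

/* Example: delay_ns = 100000 (100us between packets) with HZ = 1000.
 * Each packet adds 100us of debt to accum_delay_ns; one tick slept
 * repays ~1ms across every device on the thread.  In steady state that
 * is roughly ten packets sent per actual sleep, instead of one
 * schedule_timeout() per packet. */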
*/ + if (info->removal_mark || (itmp >= info->next_tx_ns) || + (t->control & T_WAKE_BLOCKED) || + (t->control & T_STOP)) { + break; + } + }/* while */ +}//pg_nanodelay + static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { @@ -2236,8 +2518,14 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) ntohl(pkt_dev-> saddr_max))) { __u32 t; - if (pkt_dev->flags & F_IPSRC_RND) - t = random32() % (imx - imn) + imn; + if (pkt_dev->flags & F_IPSRC_RND) { + if (imx - imn) { + t = (random32() % (imx - imn)) + imn; + } + else { + t = imn; + } + } else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2257,16 +2545,23 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 t; __be32 s; if (pkt_dev->flags & F_IPDST_RND) { - - t = random32() % (imx - imn) + imn; + if (imx - imn) { + t = (random32() % (imx - imn)) + imn; + } + else { + t = imn; + } s = htonl(t); - while (ipv4_is_loopback(s) || - ipv4_is_multicast(s) || - ipv4_is_lbcast(s) || - ipv4_is_zeronet(s) || - ipv4_is_local_multicast(s)) { - t = random32() % (imx - imn) + imn; + while (ipv4_is_loopback(s) || ipv4_is_multicast(s) + || ipv4_is_lbcast(s) || ipv4_is_zeronet(s) + || ipv4_is_local_multicast(s)) { + if (imx - imn) { + t = (random32() % (imx - imn)) + imn; + } + else { + t = imn; + } s = htonl(t); } pkt_dev->cur_daddr = s; @@ -2337,12 +2632,12 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) } pkt_dev->cur_queue_map = t; } - + pkt_dev->flows[flow].count++; } -#ifdef CONFIG_XFRM +//#ifdef CONFIG_XFRM static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; @@ -2368,6 +2663,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) x->curlft.bytes +=skb->len; x->curlft.packets++; + error: spin_unlock(&x->lock); return err; @@ -2424,7 +2720,6 @@ static inline int process_ipsec(struct pktgen_dev *pkt_dev, } return 1; } -#endif static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { @@ -2457,7 +2752,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ - + int cur_pkt_size; if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2471,12 +2766,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, mod_cur_headers(pkt_dev); datalen = (odev->hard_header_len + 16) & ~0xf; - skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + datalen + + cur_pkt_size = pkt_dev->cur_pkt_size; /* protect against race */ + skb = alloc_skb(cur_pkt_size + 64 + datalen + pkt_dev->pkt_overhead, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } + pkt_dev->seq_num++; /* Increase the pktgen sequence number for the next packet.
*/ skb_reserve(skb, datalen); @@ -2506,6 +2803,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->network_header = skb->tail; skb->transport_header = skb->network_header + sizeof(struct iphdr); skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); + skb_set_queue_mapping(skb, pkt_dev->cur_queue_map); iph = ip_hdr(skb); udph = udp_hdr(skb); @@ -2514,7 +2812,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, *(__be16 *) & eth[12] = protocol; /* Eth + IPh + UDPh + mpls */ - datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - + datalen = cur_pkt_size - 14 - 20 - 8 - pkt_dev->pkt_overhead; if (datalen < sizeof(struct pktgen_hdr)) datalen = sizeof(struct pktgen_hdr); @@ -2608,11 +2906,12 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, pgh->tv_sec = htonl(timestamp.tv_sec); pgh->tv_usec = htonl(timestamp.tv_usec); } - +#if 0 #ifdef CONFIG_XFRM if (!process_ipsec(pkt_dev, skb, protocol)) return NULL; #endif +#endif return skb; } @@ -2650,9 +2949,10 @@ static unsigned int scan_ip6(const char *s, char ip[16]) } s++; } - + u = simple_strtoul(s, &pos, 16); i = pos - s; + if (!i) return 0; if (prefixlen == 12 && s[i] == '.') { @@ -2796,7 +3096,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ - + int cur_pkt_size; + if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2808,7 +3109,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, */ mod_cur_headers(pkt_dev); - skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16 + + cur_pkt_size = pkt_dev->cur_pkt_size; + skb = alloc_skb(cur_pkt_size + 64 + 16 + pkt_dev->pkt_overhead, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev->result, "No memory"); @@ -2843,6 +3145,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->network_header = skb->tail; skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); + skb_set_queue_mapping(skb, pkt_dev->cur_queue_map); iph = ipv6_hdr(skb); udph = udp_hdr(skb); @@ -2851,7 +3154,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, *(__be16 *) & eth[12] = protocol; /* Eth + IPh + UDPh + mpls */ - datalen = pkt_dev->cur_pkt_size - 14 - + datalen = cur_pkt_size - 14 - sizeof(struct ipv6hdr) - sizeof(struct udphdr) - pkt_dev->pkt_overhead; @@ -2954,7 +3257,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, pgh->tv_sec = htonl(timestamp.tv_sec); pgh->tv_usec = htonl(timestamp.tv_usec); } - /* pkt_dev->seq_num++; FF: you really mean this? */ + pkt_dev->seq_num++; /* Increase the pktgen seq number for the next packet. 
*/ return skb; } @@ -2968,13 +3271,206 @@ static inline struct sk_buff *fill_packet(struct net_device *odev, return fill_packet_ipv4(odev, pkt_dev); } -static void pktgen_clear_counters(struct pktgen_dev *pkt_dev) -{ - pkt_dev->seq_num = 1; - pkt_dev->idle_acc = 0; + +static void record_latency(struct pktgen_dev* pkt_dev, int latency) { + /* NOTE: Latency can be negative */ + int div = 100; + int diff; + int vl; + int i; + + pkt_dev->pkts_rcvd_since_clear++; + + if (pkt_dev->pkts_rcvd_since_clear < 100) { + div = pkt_dev->pkts_rcvd; + if (pkt_dev->pkts_rcvd_since_clear == 1) { + pkt_dev->avg_latency = latency; + } + } + + if ((div + 1) == 0) { + pkt_dev->avg_latency = 0; + } + else { + pkt_dev->avg_latency = ((pkt_dev->avg_latency * div + latency) / (div + 1)); + } + + if (latency < pkt_dev->min_latency) { + pkt_dev->min_latency = latency; + } + if (latency > pkt_dev->max_latency) { + pkt_dev->max_latency = latency; + } + + /* Place the latency in the right 'bucket' */ + diff = (latency - pkt_dev->min_latency); + for (i = 0; ilatency_bkts[i]++; + break; + } + } +}/* record latency */ + + +/* Returns < 0 if the skb is not a pktgen buffer. */ +int pktgen_receive(struct sk_buff* skb) { + /* See if we have a pktgen packet */ + /* TODO: Add support for detecting IPv6, TCP packets too. This will only + * catch UDP at the moment. --Ben + */ + /* printk("pktgen-rcv, skb->len: %d\n", skb->len); */ + + /* If this is a paged skb, make sure we pull up + * whatever data we need to look at. */ + if (!pskb_may_pull(skb, 20 + 8 + sizeof(struct pktgen_hdr))) { + return -1; + } + + if ((skb->len >= (20 + 8 + sizeof(struct pktgen_hdr))) && + (skb->protocol == __constant_htons(ETH_P_IP))) { + struct pktgen_hdr* pgh; + + /* It's IP, and long enough, lets check the magic number. + * TODO: This is a hack not always guaranteed to catch the right + * packets. + */ + + /* printk("Length & protocol passed, skb->data: %p, raw: %p\n", + skb->data, skb->h.raw); */ + + pgh = (struct pktgen_hdr*)(skb->data + 20 + 8); + + /* + tmp = (char*)(skb->data); + for (i = 0; i<90; i++) { + printk("%02hx ", tmp[i]); + if (((i + 1) % 15) == 0) { + printk("\n"); + } + } + printk("\n"); + */ + + if (pgh->pgh_magic == __constant_ntohl(PKTGEN_MAGIC)) { + struct net_device* dev = skb->dev; + struct pktgen_dev* pkt_dev; + __u32 seq = ntohl(pgh->seq_num); + + // TODO: Need lock..maybe + pkt_dev = dev->pkt_dev; + + if (!pkt_dev) { + return -1; + } + + pkt_dev->pkts_rcvd++; + pkt_dev->bytes_rcvd += skb->len; + + /* Check for out-of-sequence packets */ + if (pkt_dev->last_seq_rcvd == seq) { + pkt_dev->dup_rcvd++; + pkt_dev->dup_since_incr++; + } + else { + __s64 rx; + __s64 tx; + struct timeval txtv; + if (! skb->tstamp.tv64) { + __net_timestamp(skb); + } + skb_get_timestamp(skb, &txtv); + rx = tv_to_us(&txtv); + + txtv.tv_usec = ntohl(pgh->tv_usec); + txtv.tv_sec = ntohl(pgh->tv_sec); + tx = tv_to_us(&txtv); + record_latency(pkt_dev, rx - tx); + + if ((pkt_dev->last_seq_rcvd + 1) == seq) { + if ((pkt_dev->peer_clone_skb > 1) && + (pkt_dev->peer_clone_skb > (pkt_dev->dup_since_incr + 1))) { + + pkt_dev->seq_gap_rcvd += (pkt_dev->peer_clone_skb - + pkt_dev->dup_since_incr - 1); + } + /* Great, in order...all is well */ + } + else if (pkt_dev->last_seq_rcvd < seq) { + /* sequence gap, means we dropped a pkt most likely */ + if (pkt_dev->peer_clone_skb > 1) { + /* We dropped more than one sequence number's worth, + * and if we're using clone_skb, then this is quite + * a few. This number still will not be exact, but + * it will be closer. 
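A worked example of the scaled gap accounting described above:

/* Example: peer_clone_skb = 4, last_seq_rcvd = 10, and one duplicate of
 * seq 10 has already arrived (dup_since_incr = 1).  If seq 12 arrives
 * next, the computation below records ((12 - 10) * 4) - 1 = 7 frames as
 * dropped, since each sequence number should have been seen 4 times. */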
+						 */
+						pkt_dev->seq_gap_rcvd += (((seq - pkt_dev->last_seq_rcvd) *
+									   pkt_dev->peer_clone_skb) -
+									  pkt_dev->dup_since_incr);
+					}
+					else {
+						pkt_dev->seq_gap_rcvd += (seq - pkt_dev->last_seq_rcvd - 1);
+					}
+				}
+				else {
+					pkt_dev->ooo_rcvd++; /* out-of-order */
+				}
+
+				pkt_dev->dup_since_incr = 0;
+			}
+			pkt_dev->last_seq_rcvd = seq;
+			kfree_skb(skb);
+			if (debug > 1) {
+				printk("done with pktgen_receive, free'd pkt\n");
+			}
+			return 0;
+		}
+	}
+	return -1; /* Let another protocol handle it, it's not for us! */
+}/* pktgen_receive */
+
+static void pg_reset_latency_counters(struct pktgen_dev* pkt_dev) {
+	int i;
+	pkt_dev->avg_latency = 0;
+	pkt_dev->min_latency = 0x7fffffff; /* largest integer */
+	pkt_dev->max_latency = 0x80000000; /* smallest integer */
+	pkt_dev->pkts_rcvd_since_clear = 0;
+	for (i = 0; i < LAT_BUCKETS_MAX; i++) {
+		pkt_dev->latency_bkts[i] = 0;
+	}
+}
+
+
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev, int seq_too) {
+	pkt_dev->idle_acc_ns = 0;
	pkt_dev->sofar = 0;
	pkt_dev->tx_bytes = 0;
	pkt_dev->errors = 0;
+	pkt_dev->pkts_rcvd_since_clear = 0;
+
+	pkt_dev->ooo_rcvd = 0;
+	pkt_dev->dup_rcvd = 0;
+	pkt_dev->pkts_rcvd = 0;
+	pkt_dev->bytes_rcvd = 0;
+	pkt_dev->non_pg_pkts_rcvd = 0;
+	pkt_dev->seq_gap_rcvd = 0; /* dropped */
+
+	/* Clear some transient state */
+	pkt_dev->accum_delay_ns = 0;
+	pkt_dev->sleeps = 0;
+	pkt_dev->nanodelays = 0;
+
+	/* This is a bit of a hack, but it gets the dup counters
+	 * in line so we don't have false alarms on dropped pkts.
+	 */
+	if (seq_too) {
+		pkt_dev->dup_since_incr = pkt_dev->peer_clone_skb - 1;
+		pkt_dev->seq_num = 0;
+		pkt_dev->last_seq_rcvd = 0;
+	}
+
+	pg_reset_latency_counters(pkt_dev);
 }

 /* Set up structure for sending pkts, clear counters */
@@ -2986,31 +3482,31 @@ static void pktgen_run(struct pktgen_thread *t)

 	pr_debug("pktgen: entering pktgen_run. %p\n", t);

-	if_lock(t);
 	list_for_each_entry(pkt_dev, &t->if_list, list) {
+		/* If already running, then ignore. */
+		if (! pkt_dev->running) {
+
+			/** Clear counters before we set up the first inject. */
+			pktgen_clear_counters(pkt_dev, 1);

-		/*
-		 * setup odev and create initial packet.
-		 */
-		pktgen_setup_inject(pkt_dev);
-
-		if (pkt_dev->odev) {
-			pktgen_clear_counters(pkt_dev);
-			pkt_dev->running = 1;	/* Cranke yeself! */
-			pkt_dev->skb = NULL;
-			pkt_dev->started_at = getCurUs();
-			pkt_dev->next_tx_us = getCurUs();	/* Transmit immediately */
-			pkt_dev->next_tx_ns = 0;
-			set_pkt_overhead(pkt_dev);
-
-			strcpy(pkt_dev->result, "Starting");
-			started++;
-		} else
-			strcpy(pkt_dev->result, "Error starting");
+			/*
+			 * setup odev and create initial packet.
+			 */
+			pktgen_setup_inject(pkt_dev);
+
+			if (pkt_dev->odev) {
+				pkt_dev->running = 1;	/* Cranke yeself! */
+				pkt_dev->skb = NULL;
+				pkt_dev->started_at = getCurUs();
+				/* Transmit first pkt after 20ms to let listener get started. */
+				pkt_dev->next_tx_ns = getRelativeCurNs() + 20 * 1000000;
+
+				strcpy(pkt_dev->result, "Starting");
+				started++;
+			} else
+				strcpy(pkt_dev->result, "Error starting");
+		}
 	}
-	if_unlock(t);
-	if (started)
-		t->control &= ~(T_STOP);
 }

 static void pktgen_stop_all_threads_ifs(void)
@@ -3026,66 +3522,11 @@ static void pktgen_stop_all_threads_ifs(void)
 	mutex_unlock(&pktgen_thread_lock);
 }

-
-static int thread_is_running(struct pktgen_thread *t)
-{
-	struct pktgen_dev *pkt_dev;
-	int res = 0;
-
-	list_for_each_entry(pkt_dev, &t->if_list, list)
-		if (pkt_dev->running) {
-			res = 1;
-			break;
-		}
-	return res;
-}
-
-static int pktgen_wait_thread_run(struct pktgen_thread *t)
-{
-	if_lock(t);
-
-	while (thread_is_running(t)) {
-
-		if_unlock(t);
-
-		msleep_interruptible(100);
-
-		if (signal_pending(current))
-			goto signal;
-		if_lock(t);
-	}
-	if_unlock(t);
-	return 1;
-signal:
-	return 0;
-}
-
-static int pktgen_wait_all_threads_run(void)
-{
+static void pktgen_run_all_threads(int background) {
 	struct pktgen_thread *t;
-	int sig = 1;
-
-	mutex_lock(&pktgen_thread_lock);
-
-	list_for_each_entry(t, &pktgen_threads, th_list) {
-		sig = pktgen_wait_thread_run(t);
-		if (sig == 0)
-			break;
-	}
-	if (sig == 0)
-		list_for_each_entry(t, &pktgen_threads, th_list)
-			t->control |= (T_STOP);
-
-	mutex_unlock(&pktgen_thread_lock);
-	return sig;
-}
-
-static void pktgen_run_all_threads(void)
-{
-	struct pktgen_thread *t;
-
-	pr_debug("pktgen: entering pktgen_run_all_threads.\n");
+	pr_debug("pktgen: entering pktgen_run_all_threads, background: %d\n",
+		 background);

 	mutex_lock(&pktgen_thread_lock);

@@ -3094,9 +3535,14 @@ static void pktgen_run_all_threads(void)

 	mutex_unlock(&pktgen_thread_lock);

-	schedule_timeout_interruptible(msecs_to_jiffies(125));	/* Propagate thread->control */
+	/* This is a hack at best...disabling, we should not have to depend on this. */
+	/*schedule_timeout_interruptible(msecs_to_jiffies(125));*/	/* Propagate thread->control */

-	pktgen_wait_all_threads_run();
+	// Much harder to get rid of the if_lock if we allow this to block...
+	if (!background) {
+		printk("ERROR: non-background mode no longer supported.\n");
+		//pktgen_wait_all_threads_run();
+	}
 }

 static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
@@ -3106,7 +3552,7 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)

 	total_us = pkt_dev->stopped_at - pkt_dev->started_at;

-	idle = pkt_dev->idle_acc;
+	do_div(pkt_dev->idle_acc_ns, 1000);	/* ns -> us; note do_div() returns the remainder, not the quotient */
+	idle = pkt_dev->idle_acc_ns;

 	p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
 		     (unsigned long long)total_us,
@@ -3122,6 +3568,11 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
 		total_us >>= 1;
 	}

+	if (total_us == 0) {
+		//printk("pktgen: WARNING: %s Ran for zero time, bumping to 1ms to avoid div by zero.\n",
+		//       pkt_dev->ifname);
+		total_us = 1;
+	}
 	do_div(pps, total_us);

 	bps = pps * 8 * pkt_dev->cur_pkt_size;
@@ -3155,22 +3606,90 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
 	return 0;
 }

-static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
-{
-	struct pktgen_dev *pkt_dev, *best = NULL;
-
-	if_lock(t);
+/** Find the adapter that needs to tx next.
+ * We need to take the blocked adapters into account, but can't ignore
+ * them forever just in case we missed the tx-queue-wake event for some
+ * reason.
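+ *
+ * Roughly, the selection policy coded below is:
+ *  - prefer the non-blocked device with the earliest next_tx_ns;
+ *  - only pick a blocked device over it if that device has been waiting
+ *    longer than PG_TRY_TX_ANYWAY_NS (1ms) and was scheduled to go first;
+ *  - if nothing is ready, *next_running_delay tells the caller how long
+ *    it may sleep.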
+ */ +static struct pktgen_dev *next_to_run(struct pktgen_thread *t, u64 now, u64* next_running_delay) { + struct pktgen_dev *pkt_dev = NULL; + struct pktgen_dev *best = NULL; + struct pktgen_dev *best_blocked = NULL; + struct pktgen_dev *rv = NULL; list_for_each_entry(pkt_dev, &t->if_list, list) { if (!pkt_dev->running) continue; - if (best == NULL) - best = pkt_dev; - else if (pkt_dev->next_tx_us < best->next_tx_us) - best = pkt_dev; + if (pkt_dev->tx_blocked) { + if (best_blocked == NULL) + best_blocked = pkt_dev; + else { + if (pkt_dev->next_tx_ns < best_blocked->next_tx_ns) { + best_blocked = pkt_dev; + } + } + //pkt_dev->tx_blocked = 0; /* give it another try next time */ + } + else { + if (best == NULL) + best = pkt_dev; + else { + if (pkt_dev->next_tx_ns < best->next_tx_ns) { + best = pkt_dev; + } + } + } } - if_unlock(t); - return best; + + /** If we have both blocked and non-blocked, and non-blocked wants to transmit now, then + * choose it. Otherwise, just choose whoever wants to run next. + */ + if (best_blocked && best) { + if (((best_blocked->next_tx_ns + PG_TRY_TX_ANYWAY_NS) < now) && + (best_blocked->next_tx_ns < best->next_tx_ns)) { + rv = best_blocked; + } + else if (best->next_tx_ns <= now) { + rv = best; + } + else if (best->next_tx_ns < best_blocked->next_tx_ns) { + rv = best; + } + else { + rv = best_blocked; + } + } + + if (!rv) { + if (best_blocked && (best_blocked->next_tx_ns < (now - PG_TRY_TX_ANYWAY_NS))) { + rv = best_blocked; + } + } + if (!rv) { + rv = best; + } + if (!rv) { + rv = best_blocked; + } + + if (rv) { + /* If best is blocked, we should delay a bit */ + if (rv->tx_blocked) { + *next_running_delay = PG_TRY_TX_ANYWAY_NS; //1ms + } + else { + if (rv->next_tx_ns <= now) { + *next_running_delay = 0; + } + else { + *next_running_delay = rv->next_tx_ns - now; + } + } + } + else { + *next_running_delay = 10000000; /* 10ms */ + } + return rv; } static void pktgen_stop(struct pktgen_thread *t) @@ -3179,8 +3698,6 @@ static void pktgen_stop(struct pktgen_thread *t) pr_debug("pktgen: entering pktgen_stop\n"); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); if (pkt_dev->skb) @@ -3188,8 +3705,6 @@ static void pktgen_stop(struct pktgen_thread *t) pkt_dev->skb = NULL; } - - if_unlock(t); } /* @@ -3203,8 +3718,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) pr_debug("pktgen: entering pktgen_rem_one_if\n"); - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3219,10 +3732,15 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) break; } - - if_unlock(t); } +static void pktgen_unblock_all_ifs(struct pktgen_thread *t) { + struct pktgen_dev *p = NULL;; + list_for_each_entry(p, &t->if_list, list) + p->tx_blocked = 0; +}/* wake all writers */ + + static void pktgen_rem_all_ifs(struct pktgen_thread *t) { struct list_head *q, *n; @@ -3231,8 +3749,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) /* Remove all devices, free mem */ pr_debug("pktgen: entering pktgen_rem_all_ifs\n"); - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3242,8 +3758,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) pktgen_remove_device(t, cur); } - - if_unlock(t); } static void pktgen_rem_thread(struct pktgen_thread *t) @@ -3259,55 +3773,45 @@ static void pktgen_rem_thread(struct pktgen_thread *t) mutex_unlock(&pktgen_thread_lock); } -static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) +static 
void pktgen_xmit(struct pktgen_dev *pkt_dev, u64 now) { - struct net_device *odev = NULL; + struct net_device *odev; __u64 idle_start = 0; int ret; odev = pkt_dev->odev; - if (pkt_dev->delay_us || pkt_dev->delay_ns) { - u64 now; - - now = getCurUs(); - if (now < pkt_dev->next_tx_us) - spin(pkt_dev, pkt_dev->next_tx_us); + if (pkt_dev->delay_ns || (pkt_dev->accum_delay_ns > 0)) { + if (now < pkt_dev->next_tx_ns) { + /* Don't tx early..*/ + pkt_dev->req_tx_early++; + goto out; + } /* This is max DELAY, this has special meaning of * "never transmit" */ - if (pkt_dev->delay_us == 0x7FFFFFFF) { - pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us; - pkt_dev->next_tx_ns = pkt_dev->delay_ns; + if (pkt_dev->delay_ns == 0x7FFFFFFF) { + pkt_dev->next_tx_ns = getRelativeCurNs() + pkt_dev->delay_ns; goto out; } } - if ((netif_queue_stopped(odev) || - (pkt_dev->skb && - netif_subqueue_stopped(odev, pkt_dev->skb))) || + if (netif_queue_stopped(odev) || + (!netif_carrier_ok(odev)) || + (pkt_dev->skb && netif_subqueue_stopped(odev, pkt_dev->skb)) || need_resched()) { - idle_start = getCurUs(); - + pkt_dev->queue_stopped++; + pkt_dev->tx_blocked = 1; + /* change tx time to now to show work was at least attempted. */ + pkt_dev->next_tx_ns = now; if (!netif_running(odev)) { pktgen_stop_device(pkt_dev); if (pkt_dev->skb) kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; - goto out; - } - if (need_resched()) - schedule(); - - pkt_dev->idle_acc += getCurUs() - idle_start; - - if (netif_queue_stopped(odev) || - netif_subqueue_stopped(odev, pkt_dev->skb)) { - pkt_dev->next_tx_us = getCurUs(); /* TODO */ - pkt_dev->next_tx_ns = 0; - goto out; /* Try the next interface */ } + goto out; /* try next interface */ } if (pkt_dev->last_ok || !pkt_dev->skb) { @@ -3319,10 +3823,11 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->skb = fill_packet(odev, pkt_dev); if (pkt_dev->skb == NULL) { - printk(KERN_ERR "pktgen: ERROR: couldn't " - "allocate skb in fill_packet.\n"); + //printk(KERN_ERR "pktgen: ERROR: couldn't " + // "allocate skb in fill_packet.\n"); schedule(); pkt_dev->clone_count--; /* back out increment, OOM */ + pkt_dev->oom_on_alloc_skb++; goto out; } pkt_dev->allocated_skbs++; @@ -3340,40 +3845,43 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) if (likely(ret == NETDEV_TX_OK)) { pkt_dev->last_ok = 1; pkt_dev->sofar++; - pkt_dev->seq_num++; pkt_dev->tx_bytes += pkt_dev->cur_pkt_size; + pkt_dev->next_tx_ns = getRelativeCurNs() + pkt_dev->delay_ns; + pkt_dev->tx_blocked = 0; } else if (ret == NETDEV_TX_LOCKED && (odev->features & NETIF_F_LLTX)) { cpu_relax(); goto retry_now; } else { /* Retry it next time */ - + static int do_once_hsx_wrn = 1; + if (do_once_hsx_wrn) { + printk(KERN_INFO "pktgen: Hard xmit error, driver for %s doesn't do queue-stopped quite right.\n", odev->name); + printk(KERN_INFO "pktgen: Transmit request will be retried, and this error msg will not be printed again..\n"); + do_once_hsx_wrn = 0; + } + atomic_dec(&(pkt_dev->skb->users)); - if (debug && net_ratelimit()) - printk(KERN_INFO "pktgen: Hard xmit error\n"); - + pkt_dev->queue_stopped++; pkt_dev->errors++; pkt_dev->last_ok = 0; - } - - pkt_dev->next_tx_us = getCurUs(); - pkt_dev->next_tx_ns = 0; - - pkt_dev->next_tx_us += pkt_dev->delay_us; - pkt_dev->next_tx_ns += pkt_dev->delay_ns; - if (pkt_dev->next_tx_ns > 1000) { - pkt_dev->next_tx_us++; - pkt_dev->next_tx_ns -= 1000; + /* Try a little later..flag us as wanting to tx, but unable. Will try again shortly. 
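+		 * (Devices marked tx_blocked are retried by next_to_run() after
+		 * PG_TRY_TX_ANYWAY_NS, so a driver that never wakes its queue
+		 * cannot stall the thread forever.)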
+ */ + pkt_dev->tx_blocked = 1; + /* change tx time to now to show work was at least attempted. */ + pkt_dev->next_tx_ns = now; } } - else { /* Retry it next time */ + pkt_dev->queue_stopped++; pkt_dev->last_ok = 0; - pkt_dev->next_tx_us = getCurUs(); /* TODO */ - pkt_dev->next_tx_ns = 0; + /* Try a little later..flag us as wanting to tx, but unable. Will try again shortly. + */ + pkt_dev->tx_blocked = 1; + /* change tx time to now to show work was at least attempted. */ + pkt_dev->next_tx_ns = now; } netif_tx_unlock_bh(odev); @@ -3381,14 +3889,14 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { if (atomic_read(&(pkt_dev->skb->users)) != 1) { - idle_start = getCurUs(); + idle_start = getRelativeCurNs(); while (atomic_read(&(pkt_dev->skb->users)) != 1) { if (signal_pending(current)) { break; } schedule(); } - pkt_dev->idle_acc += getCurUs() - idle_start; + pkt_dev->idle_acc_ns += getRelativeCurNs() - idle_start; } /* Done with this */ @@ -3410,7 +3918,9 @@ static int pktgen_thread_worker(void *arg) struct pktgen_thread *t = arg; struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - + u64 now; + u64 next_running_delay; + BUG_ON(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); @@ -3422,7 +3932,15 @@ static int pktgen_thread_worker(void *arg) set_freezable(); while (!kthread_should_stop()) { - pkt_dev = next_to_run(t); + find_best: + + if (t->control & T_WAKE_BLOCKED) { + pktgen_unblock_all_ifs(t); + t->control &= ~(T_WAKE_BLOCKED); + } + + now = getRelativeCurNs(); + pkt_dev = next_to_run(t, now, &next_running_delay); if (!pkt_dev && (t->control & (T_STOP | T_RUN | T_REMDEVALL | T_REMDEV)) @@ -3435,8 +3953,41 @@ static int pktgen_thread_worker(void *arg) __set_current_state(TASK_RUNNING); - if (pkt_dev) - pktgen_xmit(pkt_dev); + if (pkt_dev) { + if (pkt_dev->tx_blocked) { + /* If blocked for less than 1ms, then sleep for up to 1ms. If the + * device un-blocks, then we will be woken by the wait-queue callback. + */ + u64 tx_anyway_ns = (now - PG_TRY_TX_ANYWAY_NS); + if (pkt_dev->next_tx_ns > tx_anyway_ns) { + pg_nanodelay(min(next_running_delay, (u64)(PG_TRY_TX_ANYWAY_NS)), + pkt_dev); + /* Maybe things have changed since we went to sleep. */ + goto find_best; + } + } + + /* If the best to run should not run yet, then sleep (or accumulate sleep) */ + if (now < pkt_dev->next_tx_ns) { + /* spin(pkt_dev, pkt_dev->next_tx_us); */ + u64 next_ipg = pkt_dev->next_tx_ns - now; + + /* These will not actually busy-spin now. Will run as + * much as 1ms fast, and will sleep in 1ms units, assuming + * our tick is 1ms. 
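+			 * A rough sketch of what the pg_nanodelay() helper is
+			 * expected to do with the accumulator (field names from
+			 * pktgen.h; the helper itself is not part of this hunk):
+			 *
+			 *   pkt_dev->nanodelays++;
+			 *   pkt_dev->accum_delay_ns += next_ipg;
+			 *   if (pkt_dev->accum_delay_ns > PG_MAX_ACCUM_DELAY_NS) {
+			 *           pkt_dev->sleeps++;
+			 *           ...sleep ~1ms, then subtract the time actually
+			 *           slept from accum_delay_ns...
+			 *   }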
+			 */
+			pg_nanodelay(next_ipg, pkt_dev);
+			now = getRelativeCurNs();
+			if (pkt_dev->removal_mark ||
+			    (pkt_dev->pg_thread->control & T_STOP)) {
+				goto skip_tx;
+			}
+		}
+
+
+			pktgen_xmit(pkt_dev, now);
+		}
+	skip_tx:

 		if (t->control & T_STOP) {
 			pktgen_stop(t);
@@ -3448,6 +3999,11 @@
 			t->control &= ~(T_RUN);
 		}

+		if (t->control & T_ADD_DEV) {
+			pktgen_add_device(t, (char*)(t->control_arg));
+			t->control &= ~(T_ADD_DEV);
+		}
+
 		if (t->control & T_REMDEVALL) {
 			pktgen_rem_all_ifs(t);
 			t->control &= ~(T_REMDEVALL);
 		}

@@ -3479,16 +4035,11 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
 					  const char *ifname)
 {
 	struct pktgen_dev *p, *pkt_dev = NULL;
-	if_lock(t);
-
 	list_for_each_entry(p, &t->if_list, list)
 		if (strncmp(p->odev->name, ifname, IFNAMSIZ) == 0) {
 			pkt_dev = p;
 			break;
 		}
-
-	if_unlock(t);
-
 	pr_debug("pktgen: find_dev(%s) returning %p\n", ifname, pkt_dev);
 	return pkt_dev;
 }

@@ -3501,8 +4052,6 @@ static int add_dev_to_thread(struct pktgen_thread *t,
 {
 	int rv = 0;

-	if_lock(t);
-
 	if (pkt_dev->pg_thread) {
 		printk(KERN_ERR "pktgen: ERROR: already assigned "
 		       "to a thread.\n");
@@ -3515,12 +4064,9 @@
 	pkt_dev->running = 0;

 out:
-	if_unlock(t);
 	return rv;
 }

-/* Called under thread lock */
-
 static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 {
 	struct pktgen_dev *pkt_dev;
@@ -3533,7 +4079,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 		printk(KERN_ERR "pktgen: ERROR: interface already used.\n");
 		return -EBUSY;
 	}
-
+	else {
+		printk(KERN_INFO "pktgen: Attempting to add device: %s\n", ifname);
+	}
+
 	pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL);
 	if (!pkt_dev)
 		return -ENOMEM;
@@ -3550,8 +4099,7 @@
 	pkt_dev->max_pkt_size = ETH_ZLEN;
 	pkt_dev->nfrags = 0;
 	pkt_dev->clone_skb = pg_clone_skb_d;
-	pkt_dev->delay_us = pg_delay_d / 1000;
-	pkt_dev->delay_ns = pg_delay_d % 1000;
+	pkt_dev->delay_ns = pg_delay_d;
 	pkt_dev->count = pg_count_d;
 	pkt_dev->sofar = 0;
 	pkt_dev->udp_src_min = 9;	/* sink port */
@@ -3565,13 +4113,15 @@
 	pkt_dev->svlan_p = 0;
 	pkt_dev->svlan_cfi = 0;
 	pkt_dev->svlan_id = 0xffff;
-
-	err = pktgen_setup_dev(pkt_dev, ifname);
+	strncpy(pkt_dev->ifname, ifname, sizeof(pkt_dev->ifname));
+
+	err = pktgen_setup_dev(pkt_dev, t);
 	if (err)
 		goto out1;

 	pkt_dev->entry = proc_create(ifname, 0600, pg_proc_dir,
 				     &pktgen_if_fops);
+
 	if (!pkt_dev->entry) {
 		printk(KERN_ERR "pktgen: cannot create %s/%s procfs entry.\n",
 		       PG_PROC_DIR, ifname);
@@ -3588,9 +4138,11 @@
 out2:
 	dev_put(pkt_dev->odev);
 out1:
+#if 0
 #ifdef CONFIG_XFRM
 	free_SAs(pkt_dev);
 #endif
+#endif
 	if (pkt_dev->flows)
 		vfree(pkt_dev->flows);
 	kfree(pkt_dev);
 	return err;
 }

@@ -3610,7 +4162,6 @@ static int __init pktgen_create_thread(int cpu)
 		return -ENOMEM;
 	}

-	spin_lock_init(&t->if_lock);
 	t->cpu = cpu;

 	INIT_LIST_HEAD(&t->if_list);
@@ -3676,6 +4227,14 @@ static int pktgen_remove_device(struct pktgen_thread *t,

 	/* Dis-associate from the interface */
 	if (pkt_dev->odev) {
+
+#ifdef USE_NQW_CALLBACK
+		/* Clear the nqw callback hooks */
+		rtnl_lock();
+		clear_nqw_hook(t, pkt_dev->odev);
+		rtnl_unlock();
+#endif
+		pkt_dev->odev->pkt_dev = NULL;
 		dev_put(pkt_dev->odev);
 		pkt_dev->odev = NULL;
 	}

@@ -3686,10 +4245,11 @@
 	if (pkt_dev->entry)
remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); - +#if 0 #ifdef CONFIG_XFRM free_SAs(pkt_dev); #endif +#endif if (pkt_dev->flows) vfree(pkt_dev->flows); kfree(pkt_dev); @@ -3702,6 +4262,15 @@ static int __init pg_init(void) struct proc_dir_entry *pe; printk(KERN_INFO "%s", version); + printk(KERN_INFO "sizeof report: %d, sizeof in6_addr: %d\n", + (int)(sizeof(struct pktgen_dev_report)), + (int)(sizeof(struct in6_addr))); + + if (handle_pktgen_hook) { + printk(KERN_ERR "pktgen: ERROR: pktgen is already loaded it seems..\n"); + /* Already loaded */ + return -EEXIST; + } pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); if (!pg_proc_dir) @@ -3739,6 +4308,9 @@ static int __init pg_init(void) return -ENODEV; } + handle_pktgen_hook = pktgen_receive; + pr_debug("pktgen initialization complete.\n"); + return 0; } @@ -3761,8 +4333,11 @@ static void __exit pg_cleanup(void) unregister_netdevice_notifier(&pktgen_notifier_block); /* Clean up proc file system */ + pr_debug("pktgen: removing proc entry: %s (0x%p)\n", PGCTRL, pg_proc_dir); remove_proc_entry(PGCTRL, pg_proc_dir); proc_net_remove(&init_net, PG_PROC_DIR); + + handle_pktgen_hook = NULL; } module_init(pg_init); diff --git a/net/core/pktgen.h b/net/core/pktgen.h new file mode 100644 index 0000000..2d4cfe8 --- /dev/null +++ b/net/core/pktgen.h @@ -0,0 +1,383 @@ +/* -*-linux-c-*- + * $Id: candela_2.6.13.patch,v 1.3 2005/09/30 04:45:31 greear Exp $ + * pktgen.c: Packet Generator for performance evaluation. + * + * See pktgen.c for details of changes, etc. +*/ + + +#ifndef PKTGEN_H_INCLUDE_KERNEL__ +#define PKTGEN_H_INCLUDE_KERNEL__ + +#include +#include + +/* The buckets are exponential in 'width' */ +#define LAT_BUCKETS_MAX 32 +#define PG_MAX_ACCUM_DELAY_NS 1000000 /* one ms */ +#define PG_TRY_TX_ANYWAY_NS 1000000 /* try a blocked tx queue after 1 ms. */ + +#define IP_NAME_SZ 32 +#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ +#define MPLS_STACK_BOTTOM __constant_htonl(0x00000100) + +/* Device flag bits */ +#define F_IPSRC_RND (1<<0) /* IP-Src Random */ +#define F_IPDST_RND (1<<1) /* IP-Dst Random */ +#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ +#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ +#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ +#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ +#define F_TXSIZE_RND (1<<6) /* Transmit packet size is random */ +#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ +#define F_MPLS_RND (1<<8) /* Random MPLS labels */ +#define F_VID_RND (1<<9) /* Random VLAN ID */ +#define F_SVID_RND (1<<10) /* Random SVLAN ID */ +#define F_FLOW_SEQ (1<<11) /* Sequential flows */ +#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ +#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ + +/* Thread control flag bits */ +#define T_TERMINATE (1<<0) +#define T_STOP (1<<1) /* Stop run */ +#define T_RUN (1<<2) /* Start run */ +#define T_REMDEVALL (1<<3) /* Remove all devs */ +#define T_REMDEV (1<<4) /* Remove one dev */ +#define T_WAKE_BLOCKED (1<<5) /* Wake up all blocked net-devices. */ +#define T_ADD_DEV (1<<6) /* Add a device. */ + +/* Used to help with determining the pkts on receive */ +#define PKTGEN_MAGIC 0xbe9be955 +#define PG_PROC_DIR "pktgen" +#define PGCTRL "pgctrl" + +#define MAX_CFLOWS 65536 + +#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) +#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 
0 : 4) + +struct flow_state { + __be32 cur_daddr; + int count; +#ifdef CONFIG_XFRM + struct xfrm_state *x; +#endif + __u32 flags; +}; + +/* flow flag bits */ +#define F_INIT (1<<0) /* flow has been initialized */ + +struct pktgen_dev { + + /* + * Try to keep frequent/infrequent used vars. separated. + */ + char ifname[IFNAMSIZ]; + char result[512]; + + struct proc_dir_entry *entry; /* proc file */ + struct pktgen_thread *pg_thread; /* the owner */ + struct list_head list; /* Used for chaining in the thread's run-queue */ + + int running; /* if this changes to false, the test will stop */ + + /* If min != max, then we will either do a linear iteration, or + * we will do a random selection from within the range. + */ + __u32 flags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread */ + + __u32 min_pkt_size; /* = ETH_ZLEN; */ + __u32 max_pkt_size; /* = ETH_ZLEN; */ + int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ + __u32 nfrags; + __u64 delay_ns; /* Delay this much between sending packets. */ + __u64 count; /* Default No packets to send */ + __u64 sofar; /* How many pkts we've sent so far */ + __u64 tx_bytes; /* How many bytes we've transmitted */ + __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */ + __u64 nanodelays; /* how many times have we called nano-delay on this device? */ + __s64 accum_delay_ns; /* Accumulated delay..when >= 1ms, we'll sleep on a wait queue. */ + __u64 sleeps; /* How many times have we gone to sleep on the wait queue. */ + __u64 queue_stopped; /* How many times was queue stopped when we tried to xmit? */ + /* runtime counters relating to clone_skb */ + __u64 next_tx_ns; /* timestamp of when to tx next */ + __u64 req_tx_early; /* requested to tx, but is too early for us to tx. */ + + __u64 oom_on_alloc_skb; + __u64 allocated_skbs; + __u32 clone_count; + + int tx_blocked; /* Need to tx as soon as able... */ + int last_ok; /* Was last skb sent? + * Or a failed transmit of some sort? This will keep + * sequence numbers in order, for example. + */ + __u64 started_at; /* micro-seconds */ + __u64 stopped_at; /* micro-seconds */ + __u64 idle_acc_ns; /* nano-seconds */ + __u32 seq_num; + + __u32 clone_skb; /* Use multiple SKBs during packet gen. If this number + * is greater than 1, then that many copies of the same + * packet will be sent before a new packet is allocated. + * For instance, if you want to send 1024 identical packets + * before creating a new packet, set clone_skb to 1024. + */ + __u32 peer_clone_skb; /* Peer (transmitter's) clone setting. */ + + char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + + struct in6_addr in6_saddr; + struct in6_addr in6_daddr; + struct in6_addr cur_in6_daddr; + struct in6_addr cur_in6_saddr; + /* For ranges */ + struct in6_addr min_in6_daddr; + struct in6_addr max_in6_daddr; + struct in6_addr min_in6_saddr; + struct in6_addr max_in6_saddr; + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. 
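+	 * For example (hypothetical values), saddr_min = 10.1.1.1 and
+	 * saddr_max = 10.1.1.101 yields 100 distinct source IPs, since the
+	 * min is inclusive and the max is exclusive.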
+	 */
+	__u32 saddr_min; /* inclusive, source IP address */
+	__u32 saddr_max; /* exclusive, source IP address */
+	__u32 daddr_min; /* inclusive, dest IP address */
+	__u32 daddr_max; /* exclusive, dest IP address */
+
+	__u16 udp_src_min; /* inclusive, source UDP port */
+	__u16 udp_src_max; /* exclusive, source UDP port */
+	__u16 udp_dst_min; /* inclusive, dest UDP port */
+	__u16 udp_dst_max; /* exclusive, dest UDP port */
+
+	/* DSCP + ECN */
+	__u8 tos; /* six most significant bits of (former) IPv4 TOS are for dscp codepoint */
+	__u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 4) */
+
+	/* MPLS */
+	unsigned nr_labels; /* Depth of stack, 0 = no MPLS */
+	__be32 labels[MAX_MPLS_LABELS];
+
+
+	/* VLAN/SVLAN (802.1Q/Q-in-Q) */
+	__u8 vlan_p;
+	__u8 vlan_cfi;
+	__u16 vlan_id; /* 0xffff means no vlan tag */
+
+	__u8 svlan_p;
+	__u8 svlan_cfi;
+	__u16 svlan_id; /* 0xffff means no svlan tag */
+
+
+	__u32 src_mac_count; /* How many MACs to iterate through */
+	__u32 dst_mac_count; /* How many MACs to iterate through */
+
+	unsigned char dst_mac[ETH_ALEN];
+	unsigned char src_mac[ETH_ALEN];
+
+	__u32 cur_dst_mac_offset;
+	__u32 cur_src_mac_offset;
+	__u32 cur_saddr;
+	__u32 cur_daddr;
+	__u16 cur_udp_dst;
+	__u16 cur_udp_src;
+	__u16 cur_queue_map;
+	__u32 cur_pkt_size;
+
+	__u8 hh[14];
+	/* = {
+		0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+
+		We fill in SRC address later
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x08, 0x00
+	};
+	*/
+	__u16 pad; /* pad out the hh struct to an even 16 bytes */
+
+	struct sk_buff *skb; /* skb we are to transmit next, mainly used for when we
+			      * are transmitting the same one multiple times
+			      */
+	struct net_device *odev; /* The out-going device. Note that the device should
+				  * have its pg_info pointer pointing back to this
+				  * device. This will be set when the user specifies
+				  * the out-going device name (not when the inject is
+				  * started as it used to do.)
+				  */
+	struct flow_state *flows;
+	unsigned cflows; /* Concurrent flows (config) */
+	unsigned lflow; /* Flow length (config) */
+	unsigned nflows; /* accumulated flows (stats) */
+	unsigned curfl; /* current sequenced flow (state) */
+
+	u16 queue_map_min;
+	u16 queue_map_max;
+
+#ifdef CONFIG_XFRM
+	__u8 ipsmode; /* IPSEC mode (config) */
+	__u8 ipsproto; /* IPSEC type (config) */
+#endif
+
+	int avg_latency; /* in micro-seconds */
+	int min_latency;
+	int max_latency;
+	__u64 latency_bkts[LAT_BUCKETS_MAX];
+	__u64 pkts_rcvd_since_clear; /* with regard to clearing/resetting the latency logic */
+
+
+	/* Fields relating to receiving pkts */
+	__u32 last_seq_rcvd;
+	__u64 ooo_rcvd; /* out-of-order packets received */
+	__u64 pkts_rcvd; /* packets received */
+	__u64 dup_rcvd; /* duplicate packets received */
+	__u64 bytes_rcvd; /* total bytes received, as obtained from the skb */
+	__u64 seq_gap_rcvd; /* how many gaps we received. This correlates to
+			     * dropped pkts, except perhaps in cases where we also
+			     * have re-ordered pkts. In that case, you have to tie-break
+			     * by looking at send v/s received pkt totals for the interfaces
+			     * involved.
+			     */
+	__u64 non_pg_pkts_rcvd; /* Count how many non-pktgen skb's we are sent to check. */
+	__u64 dup_since_incr; /* How many duplicates since the last seq number increment,
+			       * used to detect gaps when clone_skb > 1
+			       */
+};
+
+struct pktgen_hdr {
+	__u32 pgh_magic;
+	__u32 seq_num;
+	__u32 tv_sec;
+	__u32 tv_usec;
+};
+
+struct pktgen_thread {
+	struct list_head if_list; /* All devices here */
+	struct list_head th_list;
+	struct task_struct* tsk;
+	int removed;
+	char result[512];
+
+	/* Field for thread to receive "posted" events: terminate, stop ifs, etc. */
+
+	u32 control;
+	char* control_arg;
+	int pid;
+	int cpu;
+
+	wait_queue_head_t queue;
+};
+
+struct pg_nqw_data {
+	#define PG_NQW_MAGIC 0x82743ab6
+	u32 magic;
+	atomic_t nqw_ref_count;
+	struct pktgen_thread* pg_thread;
+};
+
+struct pktgen_dev_report {
+	__u32 api_version;
+	__u32 flags;
+	__u32 min_pkt_size;
+	__u32 max_pkt_size;
+	__u32 nfrags;
+
+	__u32 clone_skb; /* Use multiple SKBs during packet gen. If this number
+			  * is greater than 1, then that many copies of the same
+			  * packet will be sent before a new packet is allocated.
+			  * For instance, if you want to send 1024 identical packets
+			  * before creating a new packet, set clone_skb to 1024.
+			  */
+	__u32 peer_clone_skb; /* Peer (transmitter's) clone setting. */
+	__s32 avg_latency; /* in micro-seconds */
+	__s32 min_latency;
+	__s32 max_latency;
+
+	char thread_name[32];
+	char interface_name[32];
+	char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+	char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+	char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+	char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+	unsigned char dst_mac[ETH_ALEN];
+	unsigned char src_mac[ETH_ALEN];
+	__u32 pad_32; /* pad to 8-byte boundary */
+
+	/* If we're doing ranges, random or incremental, then this
+	 * defines the min/max for those ranges.
+	 */
+	__u32 saddr_min; /* inclusive, source IP address */
+	__u32 saddr_max; /* exclusive, source IP address */
+	__u32 daddr_min; /* inclusive, dest IP address */
+	__u32 daddr_max; /* exclusive, dest IP address */
+
+	__u16 udp_src_min; /* inclusive, source UDP port */
+	__u16 udp_src_max; /* exclusive, source UDP port */
+	__u16 udp_dst_min; /* inclusive, dest UDP port */
+	__u16 udp_dst_max; /* exclusive, dest UDP port */
+
+	/* MPLS */
+	__u32 nr_labels; /* Depth of stack, 0 = no MPLS */
+	__be32 labels[MAX_MPLS_LABELS];
+
+	__u32 src_mac_count; /* How many MACs to iterate through */
+	__u32 dst_mac_count; /* How many MACs to iterate through */
+
+	__u64 nflows; /* accumulated flows (stats) */
+	__u32 cflows; /* Concurrent flows (config) */
+	__u32 lflow; /* Flow length (config) */
+
+	__u64 delay_ns; /* Delay this much between sending packets. */
+	__u64 count; /* Default No packets to send */
+	__u64 sofar; /* How many pkts we've sent so far */
+	__u64 tx_bytes; /* How many bytes we've transmitted */
+	__u64 errors; /* Errors when trying to transmit, pkts will be re-sent */
+	__u64 latency_bkts[LAT_BUCKETS_MAX];
+	__u64 pkts_rcvd_since_clear; /* with regard to clearing/resetting the latency logic */
+
+	/* Fields relating to receiving pkts */
+	__u64 ooo_rcvd; /* out-of-order packets received */
+	__u64 pkts_rcvd; /* packets received */
+	__u64 dup_rcvd; /* duplicate packets received */
+	__u64 bytes_rcvd; /* total bytes received, as obtained from the skb */
+	__u64 seq_gap_rcvd; /* how many gaps we received. This correlates to
+			     * dropped pkts, except perhaps in cases where we also
+			     * have re-ordered pkts. In that case, you have to tie-break
+			     * by looking at send v/s received pkt totals for the interfaces
+			     * involved.
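+			     * For example (hypothetical numbers): if the
+			     * transmitter reports sofar = 1000 and this side
+			     * shows pkts_rcvd = 990, the true drop count is 10
+			     * even if re-ordering pushed seq_gap_rcvd higher.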
+ */ + __u64 non_pg_pkts_rcvd; /* Count how many non-pktgen skb's we are sent to check. */ + + struct in6_addr in6_saddr; + struct in6_addr in6_daddr; + /* For ranges */ + struct in6_addr min_in6_daddr; + struct in6_addr max_in6_daddr; + struct in6_addr min_in6_saddr; + struct in6_addr max_in6_saddr; + + char future_use[256]; /* Give us some room for growth w/out changing structure size */ +} __attribute__((__packed__)); + +/* Define some IOCTLs. Just picking random numbers, basically. */ +#define GET_PKTGEN_INTERFACE_INFO 0x7450 +struct pktgen_ioctl_info { + char thread_name[32]; + char interface_name[32]; + struct pktgen_dev_report report; +}; + + +/* Defined in dev.c */ +extern int (*handle_pktgen_hook)(struct sk_buff *skb); + +/* Returns < 0 if the skb is not a pktgen buffer. */ +int pktgen_receive(struct sk_buff* skb); + + +#endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6087013..72bdf68 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -424,6 +424,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif + new->use_specified_ether_crc = old->use_specified_ether_crc; new->protocol = old->protocol; new->mark = old->mark; __nf_copy(new, old); @@ -431,6 +432,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) new->nf_trace = old->nf_trace; #endif + new->use_specified_ether_crc = old->use_specified_ether_crc; #ifdef CONFIG_NET_SCHED new->tc_index = old->tc_index; #ifdef CONFIG_NET_CLS_ACT diff --git a/net/core/sock.c b/net/core/sock.c index 7a0567b..419c88b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -608,6 +608,19 @@ set_rcvbuf: sock_warn_obsolete_bsdism("setsockopt"); break; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + case SO_NOFCS: + /* printk("SO_NOFCS, valbool: %d, sk: %p\n", + (int)(valbool), sk); */ + if (valbool) { + sk->sk_flags |= SOCK_DONT_DO_LL_FCS; + } + else { + sk->sk_flags &= ~(SOCK_DONT_DO_LL_FCS); + } + break; +#endif + case SO_PASSCRED: if (valbool) set_bit(SOCK_PASSCRED, &sock->flags); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 130338f..363a0ce 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -148,6 +148,17 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#ifdef CONFIG_DEBUG_NETDEV + { + .ctl_name = CTL_UNNUMBERED, + .procname = "netdev_debug", + .data = &netdev_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + +#endif { .ctl_name = 0 } }; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 8e17f65..dd454b7 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -416,6 +416,28 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) return !inet_confirm_addr(in_dev, sip, tip, scope); } + +static int is_ip_on_dev(struct net_device* dev, __u32 ip) { + int rv = 0; + struct in_device* in_dev = in_dev_get(dev); + if (in_dev) { + struct in_ifaddr *ifa; + + rcu_read_lock(); + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_address == ip) { + /* match */ + rv = 1; + break; + } + } + rcu_read_unlock(); + in_dev_put(in_dev); + } + return rv; +} + + static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip, @@ -427,8 +449,39 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) if 
(ip_route_output_key(&init_net, &rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { - NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); - flag = 1; + struct in_device *in_dev = in_dev_get(dev); + if (in_dev && IN_DEV_ACCEPT_STS(in_dev) && + (rt->u.dst.dev == dev->nd_net->loopback_dev)) { + /* Accept these IFF target-ip == dev's IP */ + /* TODO: Need to force the ARP response back out the interface + * instead of letting it route locally. + */ + + if (is_ip_on_dev(dev, tip)) { + /* OK, we'll let this special case slide, so that we can + * arp from one local interface to another. This seems + * to work, but could use some review. --Ben + */ + /*printk("arp_filter, sip: %x tip: %x dev: %s, STS override (ip on dev)\n", + sip, tip, dev->name);*/ + } + else { + /*printk("arp_filter, sip: %x tip: %x dev: %s, IP is NOT on dev\n", + sip, tip, dev->name);*/ + NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); + flag = 1; + } + } + else { + /*printk("arp_filter, not lpbk sip: %x tip: %x dev: %s flgs: %hx dst.dev: %p lbk: %p\n", + sip, tip, dev->name, dev->priv_flags, rt->u.dst.dev, + dev->nd_net->loopback_dev);*/ + NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); + flag = 1; + } + if (in_dev) { + in_dev_put(in_dev); + } } ip_rt_put(rt); return flag; @@ -756,8 +809,8 @@ static int arp_process(struct sk_buff *skb) break; } - /* Understand only these message types */ + /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out; @@ -819,18 +872,18 @@ static int arp_process(struct sk_buff *skb) addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { - n = neigh_event_ns(&arp_tbl, sha, &sip, dev); - if (n) { - int dont_send = 0; - - if (!dont_send) - dont_send |= arp_ignore(in_dev,sip,tip); - if (!dont_send && IN_DEV_ARPFILTER(in_dev)) - dont_send |= arp_filter(sip,tip,dev); - if (!dont_send) - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + int dont_send = 0; - neigh_release(n); + if (!dont_send) + dont_send |= arp_ignore(in_dev,sip,tip); + if (!dont_send && IN_DEV_ARPFILTER(in_dev)) + dont_send |= arp_filter(sip,tip,dev); + if (!dont_send) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + if (n) { + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + neigh_release(n); + } } goto out; } else if (IN_DEV_FORWARD(in_dev)) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 87490f7..1eb63c3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1454,6 +1454,8 @@ static struct devinet_sysctl_table { "force_igmp_version"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_STS, + "accept_sts"), }, }; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 86ff271..225853c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -260,8 +260,16 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, net = dev->nd_net; if (fib_lookup(net, &fl, &res)) goto last_resort; - if (res.type != RTN_UNICAST) - goto e_inval_res; + if (res.type != RTN_UNICAST) { + if ((res.type == RTN_LOCAL) && + (IN_DEV_ACCEPT_STS(in_dev))) { + /* All is OK */ + } + else { + goto e_inval_res; + } + } + *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 19274d0..73c93c4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -284,7 +284,7 @@ static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; - err = fib_default_rule_add(ops, 
0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); + err = fib_default_rule_add(ops, 0x100, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a94f52c..a1fc6c5 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -69,8 +69,23 @@ #define CONFIG_IP_PIMSM 1 #endif -static struct sock *mroute_socket; +struct ipmr_table { + struct list_head list; + struct sock *mroute_socket; + struct vif_device vif_table[MAXVIFS]; /* Devices */ + int maxvif; + int mroute_do_assert; /* Set in PIM assert */ + int mroute_do_pim; + struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */ + struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */ + atomic_t cache_resolve_queue_len; /* Size of unresolved */ + /* Special spinlock for queue of unresolved entries */ + spinlock_t mfc_unres_lock; + int reg_vif_num; + unsigned int id; /* Table ID */ +}; +static int mroute_pim_cnt; /* Big lock, protecting vif table, mrt cache and mroute socket state. Note that the changes are semaphored via rtnl_lock. @@ -82,21 +97,7 @@ static DEFINE_RWLOCK(mrt_lock); * Multicast router control variables */ -static struct vif_device vif_table[MAXVIFS]; /* Devices */ -static int maxvif; - -#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL) - -static int mroute_do_assert; /* Set in PIM assert */ -static int mroute_do_pim; - -static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */ - -static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */ -static atomic_t cache_resolve_queue_len; /* Size of unresolved */ - -/* Special spinlock for queue of unresolved entries */ -static DEFINE_SPINLOCK(mfc_unres_lock); +#define VIF_EXISTS(table, idx) (table->vif_table[idx].dev != NULL) /* We return to original Alan's scheme. 
Hash table of resolved entries is changed only in process context and protected @@ -108,9 +109,9 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; -static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); -static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); -static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); +static int ip_mr_forward(struct ipmr_table *table, struct sk_buff *skb, struct mfc_cache *cache, int local); +static int ipmr_cache_report(struct ipmr_table *table, struct sk_buff *pkt, vifi_t vifi, int assert); +static int ipmr_fill_mroute(struct ipmr_table *table, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); #ifdef CONFIG_IP_PIMSM_V2 static struct net_protocol pim_protocol; @@ -118,6 +119,48 @@ static struct net_protocol pim_protocol; static struct timer_list ipmr_expire_timer; +#define IPMR_HSIZE 256 +static struct list_head ipmr_table_hash[IPMR_HSIZE]; +static DEFINE_SPINLOCK(ipmr_hash_lock); + +static struct ipmr_table *ipmr_table_create(unsigned int id) +{ + struct ipmr_table *table; + unsigned int i; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return table; + spin_lock_init(&table->mfc_unres_lock); + table->id = id; + table->reg_vif_num = -1; + for (i = 0; i < ARRAY_SIZE(table->vif_table); i++) { + table->vif_table[i].vif_index = i; + table->vif_table[i].table_id = id; + } + + spin_lock(&ipmr_hash_lock); + list_add_tail_rcu(&table->list, &ipmr_table_hash[id & (IPMR_HSIZE -1)]); + spin_unlock(&ipmr_hash_lock); + return table; +} + +static struct ipmr_table *ipmr_table_lookup(unsigned int id) +{ + struct ipmr_table *table; + + /* Tables never get freed, so rcu_read_lock() or refcounting is + * unnecessary here. The _rcu variant is just to protect against + * concurrent additions. 
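+ *
+ * Typical caller pattern (as in ip_mroute_setsockopt() below):
+ *
+ *   table = ipmr_table_lookup(table_id);
+ *   if (!table && optname == MRT_INIT)
+ *           table = ipmr_table_create(table_id);
+ *   if (!table)
+ *           return -ENOENT;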
+ */ + list_for_each_entry_rcu(table, &ipmr_table_hash[id & (IPMR_HSIZE - 1)], + list) { + if (table->id == id) + return table; + } + return NULL; +} + /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ static @@ -176,15 +219,20 @@ failure: #ifdef CONFIG_IP_PIMSM -static int reg_vif_num = -1; - static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ipmr_table *table; + + table = dev->mrt_entry; + if (!table) + goto out; + read_lock(&mrt_lock); ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len; ((struct net_device_stats*)netdev_priv(dev))->tx_packets++; - ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); + ipmr_cache_report(table, skb, table->reg_vif_num, IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); +out: kfree_skb(skb); return 0; } @@ -204,12 +252,14 @@ static void reg_vif_setup(struct net_device *dev) dev->destructor = free_netdev; } -static struct net_device *ipmr_reg_vif(void) +static struct net_device *ipmr_reg_vif(struct ipmr_table *table) { struct net_device *dev; struct in_device *in_dev; + char name[IFNAMSIZ]; - dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg", + snprintf(name, sizeof(name), "pimreg%u", table->id); + dev = alloc_netdev(sizeof(struct net_device_stats), name, reg_vif_setup); if (dev == NULL) @@ -250,38 +300,41 @@ failure: * Delete a VIF entry */ -static int vif_delete(int vifi) +static int vif_delete(struct ipmr_table *table, int vifi) { struct vif_device *v; struct net_device *dev; struct in_device *in_dev; - if (vifi < 0 || vifi >= maxvif) + if (vifi < 0 || vifi >= table->maxvif) return -EADDRNOTAVAIL; - v = &vif_table[vifi]; + v = &table->vif_table[vifi]; write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; - if (!dev) { + if (dev) { + dev->mrt_entry = NULL; + } + else { write_unlock_bh(&mrt_lock); return -EADDRNOTAVAIL; } #ifdef CONFIG_IP_PIMSM - if (vifi == reg_vif_num) - reg_vif_num = -1; + if (vifi == table->reg_vif_num) + table->reg_vif_num = -1; #endif - if (vifi+1 == maxvif) { + if (vifi + 1 == table->maxvif) { int tmp; for (tmp=vifi-1; tmp>=0; tmp--) { - if (VIF_EXISTS(tmp)) + if (VIF_EXISTS(table, tmp)) break; } - maxvif = tmp+1; + table->maxvif = tmp + 1; } write_unlock_bh(&mrt_lock); @@ -304,12 +357,12 @@ static int vif_delete(int vifi) and reporting error to netlink readers. 
 */
-static void ipmr_destroy_unres(struct mfc_cache *c)
+static void ipmr_destroy_unres(struct ipmr_table *table, struct mfc_cache *c)
 {
 	struct sk_buff *skb;
 	struct nlmsgerr *e;

-	atomic_dec(&cache_resolve_queue_len);
+	atomic_dec(&table->cache_resolve_queue_len);

 	while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
 		if (ip_hdr(skb)->version == 0) {
@@ -336,44 +389,56 @@ static void ipmr_expire_process(unsigned long dummy)
 {
 	unsigned long now;
 	unsigned long expires;
+	unsigned long interval;
 	struct mfc_cache *c, **cp;
+	struct ipmr_table *table;
+	unsigned int i;
+	int rearm = 0;

-	if (!spin_trylock(&mfc_unres_lock)) {
-		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
-		return;
-	}
+	expires = 10 * HZ;

-	if (atomic_read(&cache_resolve_queue_len) == 0)
-		goto out;
+	for (i = 0; i < IPMR_HSIZE; i++) {
+		list_for_each_entry_rcu(table, &ipmr_table_hash[i], list) {
+			if (!spin_trylock(&table->mfc_unres_lock))
+				goto next;

-	now = jiffies;
-	expires = 10*HZ;
-	cp = &mfc_unres_queue;
+			if (atomic_read(&table->cache_resolve_queue_len) == 0) {
+				spin_unlock(&table->mfc_unres_lock);
+				continue;
+			}

-	while ((c=*cp) != NULL) {
-		if (time_after(c->mfc_un.unres.expires, now)) {
-			unsigned long interval = c->mfc_un.unres.expires - now;
-			if (interval < expires)
-				expires = interval;
-			cp = &c->next;
-			continue;
-		}
+			now = jiffies;
+			cp = &table->mfc_unres_queue;
+
+			while ((c = *cp) != NULL) {
+				if (time_after(c->mfc_un.unres.expires, now)) {
+					interval = c->mfc_un.unres.expires - now;
+					if (interval < expires)
+						expires = interval;
+					cp = &c->next;
+					continue;
+				}
+
+				*cp = c->next;

-		*cp = c->next;
+				ipmr_destroy_unres(table, c);
+			}

-		ipmr_destroy_unres(c);
+			spin_unlock(&table->mfc_unres_lock);
+next:
+			if (atomic_read(&table->cache_resolve_queue_len))
+				rearm = 1;
+		}
 	}

-	if (atomic_read(&cache_resolve_queue_len))
+	if (rearm)
 		mod_timer(&ipmr_expire_timer, jiffies + expires);
-
-out:
-	spin_unlock(&mfc_unres_lock);
 }

 /* Fill oifs list. It is called under write locked mrt_lock. */

-static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
+static void ipmr_update_thresholds(struct ipmr_table *table,
+				   struct mfc_cache *cache, unsigned char *ttls)
 {
 	int vifi;

@@ -381,8 +446,8 @@
 	cache->mfc_un.res.maxvif = 0;
 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

-	for (vifi=0; vifi<MAXVIFS; vifi++) {
-		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
+	for (vifi=0; vifi<table->maxvif; vifi++) {
+		if (VIF_EXISTS(table, vifi) && ttls[vifi] && ttls[vifi] < 255) {
 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
 			if (cache->mfc_un.res.minvif > vifi)
 				cache->mfc_un.res.minvif = vifi;
@@ -392,15 +457,15 @@
 	}
 }

-static int vif_add(struct vifctl *vifc, int mrtsock)
+static int vif_add(struct ipmr_table *table, struct vifctl *vifc, int mrtsock)
 {
 	int vifi = vifc->vifc_vifi;
-	struct vif_device *v = &vif_table[vifi];
+	struct vif_device *v = &table->vif_table[vifi];
 	struct net_device *dev;
 	struct in_device *in_dev;

 	/* Is vif busy ?
*/ - if (VIF_EXISTS(vifi)) + if (VIF_EXISTS(table, vifi)) return -EADDRINUSE; switch (vifc->vifc_flags) { @@ -410,9 +475,9 @@ static int vif_add(struct vifctl *vifc, int mrtsock) * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ - if (reg_vif_num >= 0) + if (table->reg_vif_num >= 0) return -EADDRINUSE; - dev = ipmr_reg_vif(); + dev = ipmr_reg_vif(table); if (!dev) return -ENOBUFS; break; @@ -426,6 +491,12 @@ static int vif_add(struct vifctl *vifc, int mrtsock) dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); if (!dev) return -EADDRNOTAVAIL; + if (dev->mrt_entry && (dev->mrt_entry != table)) { + printk("ERROR: Device: %s is already in multicast routing table: %d\n", + dev->name, dev->mrt_entry->id); + return -EADDRNOTAVAIL; + } + dev_put(dev); break; default: @@ -460,22 +531,24 @@ static int vif_add(struct vifctl *vifc, int mrtsock) write_lock_bh(&mrt_lock); dev_hold(dev); v->dev=dev; + dev->mrt_entry = table; #ifdef CONFIG_IP_PIMSM if (v->flags&VIFF_REGISTER) - reg_vif_num = vifi; + table->reg_vif_num = vifi; #endif - if (vifi+1 > maxvif) - maxvif = vifi+1; + if (vifi+1 > table->maxvif) + table->maxvif = vifi+1; write_unlock_bh(&mrt_lock); return 0; } -static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp) +static struct mfc_cache *ipmr_cache_find(struct ipmr_table *table, + __be32 origin, __be32 mcastgrp) { int line=MFC_HASH(mcastgrp,origin); struct mfc_cache *c; - for (c=mfc_cache_array[line]; c; c = c->next) { + for (c = table->mfc_cache_array[line]; c; c = c->next) { if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp) break; } @@ -508,7 +581,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void) * A cache entry has gone into a resolved state from queued */ -static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) +static void ipmr_cache_resolve(struct ipmr_table *table, struct mfc_cache *uc, + struct mfc_cache *c) { struct sk_buff *skb; struct nlmsgerr *e; @@ -521,7 +595,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); - if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { + if (ipmr_fill_mroute(table, skb, c, NLMSG_DATA(nlh)) > 0) { nlh->nlmsg_len = (skb_tail_pointer(skb) - (u8 *)nlh); } else { @@ -535,7 +609,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); } else - ip_mr_forward(skb, c, 0); + ip_mr_forward(table, skb, c, 0); } } @@ -546,7 +620,8 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) * Called under mrt_lock. 
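+ *	(reg_vif_xmit() above is one such caller: it takes read_lock(&mrt_lock)
+ *	around the call.)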
*/ -static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) +static int ipmr_cache_report(struct ipmr_table *table, struct sk_buff *pkt, + vifi_t vifi, int assert) { struct sk_buff *skb; const int ihl = ip_hdrlen(pkt); @@ -578,7 +653,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = IGMPMSG_WHOLEPKT; msg->im_mbz = 0; - msg->im_vif = reg_vif_num; + msg->im_vif = table->reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -610,7 +685,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) skb->transport_header = skb->network_header; } - if (mroute_socket == NULL) { + if (table->mroute_socket == NULL) { kfree_skb(skb); return -EINVAL; } @@ -618,7 +693,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) /* * Deliver to mrouted */ - if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) { + if ((ret = sock_queue_rcv_skb(table->mroute_socket, skb)) < 0) { if (net_ratelimit()) printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); kfree_skb(skb); @@ -632,14 +707,14 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) */ static int -ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) +ipmr_cache_unresolved(struct ipmr_table *table, vifi_t vifi, struct sk_buff *skb) { int err; struct mfc_cache *c; const struct iphdr *iph = ip_hdr(skb); - spin_lock_bh(&mfc_unres_lock); - for (c=mfc_unres_queue; c; c=c->next) { + spin_lock_bh(&table->mfc_unres_lock); + for (c = table->mfc_unres_queue; c; c = c->next) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) break; @@ -650,9 +725,9 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) * Create a new entry if allowable */ - if (atomic_read(&cache_resolve_queue_len)>=10 || + if (atomic_read(&table->cache_resolve_queue_len) >= 10 || (c=ipmr_cache_alloc_unres())==NULL) { - spin_unlock_bh(&mfc_unres_lock); + spin_unlock_bh(&table->mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; @@ -668,20 +743,21 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) /* * Reflect first query at mrouted. 
*/ - if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) { + if ((err = ipmr_cache_report(table, skb, vifi, + IGMPMSG_NOCACHE)) < 0) { /* If the report failed throw the cache entry out - Brad Parker */ - spin_unlock_bh(&mfc_unres_lock); + spin_unlock_bh(&table->mfc_unres_lock); kmem_cache_free(mrt_cachep, c); kfree_skb(skb); return err; } - atomic_inc(&cache_resolve_queue_len); - c->next = mfc_unres_queue; - mfc_unres_queue = c; + atomic_inc(&table->cache_resolve_queue_len); + c->next = table->mfc_unres_queue; + table->mfc_unres_queue = c; mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires); } @@ -697,7 +773,7 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) err = 0; } - spin_unlock_bh(&mfc_unres_lock); + spin_unlock_bh(&table->mfc_unres_lock); return err; } @@ -705,14 +781,15 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) * MFC cache manipulation by user space mroute daemon */ -static int ipmr_mfc_delete(struct mfcctl *mfc) +static int ipmr_mfc_delete(struct ipmr_table *table, struct mfcctl *mfc) { int line; struct mfc_cache *c, **cp; line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { + for (cp = &table->mfc_cache_array[line]; (c = *cp) != NULL; + cp = &c->next) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { write_lock_bh(&mrt_lock); @@ -726,14 +803,15 @@ static int ipmr_mfc_delete(struct mfcctl *mfc) return -ENOENT; } -static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) +static int ipmr_mfc_add(struct ipmr_table *table, struct mfcctl *mfc, int mrtsock) { int line; struct mfc_cache *uc, *c, **cp; line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { + for (cp = &table->mfc_cache_array[line]; (c = *cp) != NULL; + cp = &c->next) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) break; @@ -742,7 +820,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) if (c != NULL) { write_lock_bh(&mrt_lock); c->mfc_parent = mfc->mfcc_parent; - ipmr_update_thresholds(c, mfc->mfcc_ttls); + ipmr_update_thresholds(table, c, mfc->mfcc_ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); @@ -759,34 +837,34 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) c->mfc_origin=mfc->mfcc_origin.s_addr; c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; c->mfc_parent=mfc->mfcc_parent; - ipmr_update_thresholds(c, mfc->mfcc_ttls); + ipmr_update_thresholds(table, c, mfc->mfcc_ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_lock_bh(&mrt_lock); - c->next = mfc_cache_array[line]; - mfc_cache_array[line] = c; + c->next = table->mfc_cache_array[line]; + table->mfc_cache_array[line] = c; write_unlock_bh(&mrt_lock); /* * Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. 
*/ - spin_lock_bh(&mfc_unres_lock); - for (cp = &mfc_unres_queue; (uc=*cp) != NULL; + spin_lock_bh(&table->mfc_unres_lock); + for (cp = &table->mfc_unres_queue; (uc = *cp) != NULL; cp = &uc->next) { if (uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { *cp = uc->next; - if (atomic_dec_and_test(&cache_resolve_queue_len)) + if (atomic_dec_and_test(&table->cache_resolve_queue_len)) del_timer(&ipmr_expire_timer); break; } } - spin_unlock_bh(&mfc_unres_lock); + spin_unlock_bh(&table->mfc_unres_lock); if (uc) { - ipmr_cache_resolve(uc, c); + ipmr_cache_resolve(table, uc, c); kmem_cache_free(mrt_cachep, uc); } return 0; @@ -796,16 +874,16 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct sock *sk) +static void mroute_clean_tables(struct ipmr_table *table, struct sock *sk) { int i; /* * Shut down all active vif entries */ - for (i=0; i<maxvif; i++) { - if (!(vif_table[i].flags&VIFF_STATIC)) - vif_delete(i); + for (i = 0; i < table->maxvif; i++) { + if (!(table->vif_table[i].flags&VIFF_STATIC)) + vif_delete(table, i); } /* @@ -814,7 +892,7 @@ static void mroute_clean_tables(struct sock *sk) for (i=0;i<MFC_LINES;i++) { struct mfc_cache *c, **cp; - cp = &mfc_cache_array[i]; + cp = &table->mfc_cache_array[i]; while ((c = *cp) != NULL) { if (c->mfc_flags&MFC_STATIC) { cp = &c->next; @@ -828,34 +906,41 @@ } } - if (atomic_read(&cache_resolve_queue_len) != 0) { + if (atomic_read(&table->cache_resolve_queue_len) != 0) { struct mfc_cache *c; - spin_lock_bh(&mfc_unres_lock); - while (mfc_unres_queue != NULL) { - c = mfc_unres_queue; - mfc_unres_queue = c->next; - spin_unlock_bh(&mfc_unres_lock); + spin_lock_bh(&table->mfc_unres_lock); + while (table->mfc_unres_queue != NULL) { + c = table->mfc_unres_queue; + table->mfc_unres_queue = c->next; + spin_unlock_bh(&table->mfc_unres_lock); - ipmr_destroy_unres(c); + ipmr_destroy_unres(table, c); - spin_lock_bh(&mfc_unres_lock); + spin_lock_bh(&table->mfc_unres_lock); } - spin_unlock_bh(&mfc_unres_lock); + spin_unlock_bh(&table->mfc_unres_lock); } } static void mrtsock_destruct(struct sock *sk) { + struct ipmr_table *table; + unsigned int i; + rtnl_lock(); - if (sk == mroute_socket) { - IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)--; + for (i = 0; i < IPMR_HSIZE; i++) { + list_for_each_entry_rcu(table, &ipmr_table_hash[i], list) { + if (sk == table->mroute_socket) { + IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)--; - write_lock_bh(&mrt_lock); - mroute_socket=NULL; - write_unlock_bh(&mrt_lock); + write_lock_bh(&mrt_lock); + table->mroute_socket = NULL; + write_unlock_bh(&mrt_lock); - mroute_clean_tables(sk); + mroute_clean_tables(table, sk); + } + } } rtnl_unlock(); } @@ -872,9 +957,57 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt int ret; struct vifctl vif; struct mfcctl mfc; + struct ipmr_table *table; + unsigned int table_id = DFLT_MROUTE_TBL; + + switch (optname) { + case MRT_INIT: + case MRT_DONE: + case MRT_ASSERT: +#ifdef CONFIG_IP_PIMSM + case MRT_PIM: +#endif + if (optlen == sizeof(struct mrt_sockopt_simple)) { + struct mrt_sockopt_simple tmp; + if (copy_from_user(&tmp, optval, sizeof(tmp))) + return -EFAULT; + table_id = tmp.table_id; + optlen = sizeof(int); + } + break; + case MRT_ADD_VIF: + case MRT_DEL_VIF: + if (optlen == sizeof(struct vifctl_ng)) { + struct vifctl_ng tmp; + if (copy_from_user(&tmp, optval, sizeof(tmp))) + return -EFAULT; + table_id = tmp.table_id; + optlen = sizeof(vif); + } + break; + case MRT_ADD_MFC: + case MRT_DEL_MFC: + if (optlen == sizeof(struct mfcctl_ng)) { + struct mfcctl_ng tmp; + if
(copy_from_user(&tmp, optval, sizeof(tmp))) + return -EFAULT; + table_id = tmp.table_id; + optlen = sizeof(mfc); + } + } + + table = ipmr_table_lookup(table_id); + if (!table) { + if (optname == MRT_INIT) { + table = ipmr_table_create(table_id); + } + } + + if (!table) + return -ENOENT; if (optname != MRT_INIT) { - if (sk != mroute_socket && !capable(CAP_NET_ADMIN)) + if (sk != table->mroute_socket && !capable(CAP_NET_ADMIN)) return -EACCES; } @@ -887,7 +1020,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt return -ENOPROTOOPT; rtnl_lock(); - if (mroute_socket) { + if (table->mroute_socket) { rtnl_unlock(); return -EADDRINUSE; } @@ -895,7 +1028,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { write_lock_bh(&mrt_lock); - mroute_socket=sk; + table->mroute_socket = sk; write_unlock_bh(&mrt_lock); IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)++; @@ -903,7 +1036,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt rtnl_unlock(); return ret; case MRT_DONE: - if (sk!=mroute_socket) + if (sk != table->mroute_socket) return -EACCES; return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: @@ -916,9 +1049,9 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt return -ENFILE; rtnl_lock(); if (optname==MRT_ADD_VIF) { - ret = vif_add(&vif, sk==mroute_socket); + ret = vif_add(table, &vif, sk == table->mroute_socket); } else { - ret = vif_delete(vif.vifc_vifi); + ret = vif_delete(table, vif.vifc_vifi); } rtnl_unlock(); return ret; @@ -935,9 +1068,10 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt return -EFAULT; rtnl_lock(); if (optname==MRT_DEL_MFC) - ret = ipmr_mfc_delete(&mfc); + ret = ipmr_mfc_delete(table, &mfc); else - ret = ipmr_mfc_add(&mfc, sk==mroute_socket); + ret = ipmr_mfc_add(table, &mfc, + sk == table->mroute_socket); rtnl_unlock(); return ret; /* @@ -948,7 +1082,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt int v; if (get_user(v,(int __user *)optval)) return -EFAULT; - mroute_do_assert=(v)?1:0; + table->mroute_do_assert = v ? 
1 : 0; return 0; } #ifdef CONFIG_IP_PIMSM @@ -962,19 +1096,25 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt rtnl_lock(); ret = 0; - if (v != mroute_do_pim) { - mroute_do_pim = v; - mroute_do_assert = v; + if (v != table->mroute_do_pim) { + if (v != table->mroute_do_pim) { + if (v) + mroute_pim_cnt++; + else + mroute_pim_cnt--; #ifdef CONFIG_IP_PIMSM_V2 - if (mroute_do_pim) - ret = inet_add_protocol(&pim_protocol, - IPPROTO_PIM); - else - ret = inet_del_protocol(&pim_protocol, - IPPROTO_PIM); - if (ret < 0) - ret = -EAGAIN; + if (mroute_pim_cnt == 1) + ret = inet_add_protocol(&pim_protocol, + IPPROTO_PIM); + else if (mroute_pim_cnt == 0) + ret = inet_del_protocol(&pim_protocol, + IPPROTO_PIM); + if (ret < 0) + ret = -EAGAIN; #endif + } + table->mroute_do_pim = v; + table->mroute_do_assert = v; } rtnl_unlock(); return ret; @@ -995,6 +1135,8 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen) { + struct ipmr_table *table; + unsigned int table_id = DFLT_MROUTE_TBL; int olr; int val; @@ -1008,20 +1150,31 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __u if (get_user(olr, optlen)) return -EFAULT; - olr = min_t(unsigned int, olr, sizeof(int)); + olr = min_t(unsigned int, olr, sizeof(struct mrt_sockopt_simple)); if (olr < 0) return -EINVAL; + if (olr == sizeof(struct mrt_sockopt_simple)) { + struct mrt_sockopt_simple tmp; + if (copy_from_user(&tmp, optval, sizeof(tmp))) + return -EFAULT; + table_id = tmp.table_id; + } + + table = ipmr_table_lookup(table_id); + if (!table) + return -ENOENT; + if (put_user(olr,optlen)) return -EFAULT; if (optname==MRT_VERSION) val=0x0305; #ifdef CONFIG_IP_PIMSM else if (optname==MRT_PIM) - val=mroute_do_pim; + val = table->mroute_do_pim; #endif else - val=mroute_do_assert; + val = table->mroute_do_assert; if (copy_to_user(optval,&val,olr)) return -EFAULT; return 0; @@ -1034,19 +1187,42 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __u int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) { struct sioc_sg_req sr; + struct sioc_sg_req_ng sr_ng; struct sioc_vif_req vr; + struct sioc_vif_req_ng vr_ng; struct vif_device *vif; struct mfc_cache *c; + struct ipmr_table *table; + unsigned int table_id = DFLT_MROUTE_TBL; + + switch (cmd) { + case SIOCGETVIFCNT_NG: + if (copy_from_user(&vr_ng, arg, sizeof(vr_ng))) + return -EFAULT; + table_id = vr_ng.table_id; + cmd = SIOCGETVIFCNT; + break; + case SIOCGETSGCNT_NG: + if (copy_from_user(&sr_ng, arg, sizeof(sr_ng))) + return -EFAULT; + table_id = sr_ng.table_id; + cmd = SIOCGETSGCNT; + break; + } + + table = ipmr_table_lookup(table_id); + if (!table) + return -ENOENT; switch (cmd) { case SIOCGETVIFCNT: if (copy_from_user(&vr,arg,sizeof(vr))) return -EFAULT; - if (vr.vifi>=maxvif) + if (vr.vifi >= table->maxvif) return -EINVAL; read_lock(&mrt_lock); - vif=&vif_table[vr.vifi]; - if (VIF_EXISTS(vr.vifi)) { + vif = &table->vif_table[vr.vifi]; + if (VIF_EXISTS(table, vr.vifi)) { vr.icount=vif->pkt_in; vr.ocount=vif->pkt_out; vr.ibytes=vif->bytes_in; @@ -1064,7 +1240,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) return -EFAULT; read_lock(&mrt_lock); - c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); + c = ipmr_cache_find(table, sr.src.s_addr, sr.grp.s_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; @@ -1087,6 +1263,7 @@ static int 
ipmr_device_event(struct notifier_block *this, unsigned long event, v { struct net_device *dev = ptr; struct vif_device *v; + struct ipmr_table *table; int ct; if (dev->nd_net != &init_net) @@ -1094,10 +1271,15 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - v=&vif_table[0]; - for (ct=0;ct<maxvif;ct++,v++) { + table = dev->mrt_entry; + if (!table) + return NOTIFY_DONE; + + v = &table->vif_table[0]; + for (ct = 0; ct < table->maxvif; ct++, v++) { if (v->dev==dev) - vif_delete(ct); + vif_delete(table, ct); } return NOTIFY_DONE; } @@ -1155,10 +1337,11 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) * Processing handlers for ipmr_forward */ -static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) +static void ipmr_queue_xmit(struct ipmr_table *table, struct sk_buff *skb, + struct mfc_cache *c, int vifi) { const struct iphdr *iph = ip_hdr(skb); - struct vif_device *vif = &vif_table[vifi]; + struct vif_device *vif = &table->vif_table[vifi]; struct net_device *dev; struct rtable *rt; int encap = 0; @@ -1172,7 +1355,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) vif->bytes_out+=skb->len; ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len; ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++; - ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); + ipmr_cache_report(table, skb, vifi, IGMPMSG_WHOLEPKT); kfree_skb(skb); return; } @@ -1256,11 +1439,12 @@ out_free: return; } -static int ipmr_find_vif(struct net_device *dev) +static int ipmr_find_vif(struct ipmr_table *table, struct net_device *dev) { int ct; - for (ct=maxvif-1; ct>=0; ct--) { - if (vif_table[ct].dev == dev) + + for (ct = table->maxvif - 1; ct >= 0; ct--) { + if (table->vif_table[ct].dev == dev) break; } return ct; @@ -1268,7 +1452,8 @@ static int ipmr_find_vif(struct net_device *dev) /* "local" means that we should preserve one skb (for local delivery) */ -static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local) +static int ip_mr_forward(struct ipmr_table *table, struct sk_buff *skb, + struct mfc_cache *cache, int local) { int psend = -1; int vif, ct; @@ -1280,7 +1465,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (vif_table[vif].dev != skb->dev) { + if (table->vif_table[vif].dev != skb->dev) { int true_vifi; if (((struct rtable*)skb->dst)->fl.iif == 0) { @@ -1299,25 +1484,26 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local } cache->mfc_un.res.wrong_if++; - true_vifi = ipmr_find_vif(skb->dev); + true_vifi = ipmr_find_vif(table, skb->dev); - if (true_vifi >= 0 && mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough...
--ANK */ - (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && + (table->mroute_do_pim || + cache->mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { cache->mfc_un.res.last_assert = jiffies; - ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF); + ipmr_cache_report(table, skb, true_vifi, IGMPMSG_WRONGVIF); } goto dont_forward; } - vif_table[vif].pkt_in++; - vif_table[vif].bytes_in+=skb->len; + table->vif_table[vif].pkt_in++; + table->vif_table[vif].bytes_in += skb->len; /* * Forward the frame @@ -1327,7 +1513,8 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(skb2, cache, psend); + ipmr_queue_xmit(table, skb2, cache, + psend); } psend=ct; } @@ -1336,9 +1523,9 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(skb2, cache, psend); + ipmr_queue_xmit(table, skb2, cache, psend); } else { - ipmr_queue_xmit(skb, cache, psend); + ipmr_queue_xmit(table, skb, cache, psend); return 0; } } @@ -1358,6 +1545,10 @@ int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL; + struct ipmr_table *table = skb->dev->mrt_entry; + + if (!table) + goto drop; /* Packet is looped back after forward, it should not be forwarded second time, but still can be delivered locally. @@ -1377,9 +1568,9 @@ int ip_mr_input(struct sk_buff *skb) that we can forward NO IGMP messages. */ read_lock(&mrt_lock); - if (mroute_socket) { + if (table->mroute_socket) { nf_reset(skb); - raw_rcv(mroute_socket, skb); + raw_rcv(table->mroute_socket, skb); read_unlock(&mrt_lock); return 0; } @@ -1388,7 +1579,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + cache = ipmr_cache_find(table, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* * No usable cache entry @@ -1406,9 +1597,9 @@ int ip_mr_input(struct sk_buff *skb) skb = skb2; } - vif = ipmr_find_vif(skb->dev); + vif = ipmr_find_vif(table, skb->dev); if (vif >= 0) { - int err = ipmr_cache_unresolved(vif, skb); + int err = ipmr_cache_unresolved(table, vif, skb); read_unlock(&mrt_lock); return err; @@ -1418,7 +1609,7 @@ int ip_mr_input(struct sk_buff *skb) return -ENODEV; } - ip_mr_forward(skb, cache, local); + ip_mr_forward(table, skb, cache, local); read_unlock(&mrt_lock); @@ -1430,6 +1621,7 @@ int ip_mr_input(struct sk_buff *skb) dont_forward: if (local) return ip_local_deliver(skb); +drop: kfree_skb(skb); return 0; } @@ -1444,13 +1636,18 @@ int pim_rcv_v1(struct sk_buff * skb) struct igmphdr *pim; struct iphdr *encap; struct net_device *reg_dev = NULL; + struct ipmr_table *table; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; pim = igmp_hdr(skb); - if (!mroute_do_pim || + table = skb->dev->mrt_entry; + if (!table) + goto drop; + + if (!table->mroute_do_pim || skb->len < sizeof(*pim) + sizeof(*encap) || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; @@ -1469,8 +1666,8 @@ int pim_rcv_v1(struct sk_buff * skb) goto drop; read_lock(&mrt_lock); - if (reg_vif_num >= 0) - reg_dev = vif_table[reg_vif_num].dev; + if (table->reg_vif_num >= 0) + reg_dev = table->vif_table[table->reg_vif_num].dev; if (reg_dev) dev_hold(reg_dev); read_unlock(&mrt_lock); @@ -1505,6 +1702,7 @@ static int 
pim_rcv(struct sk_buff * skb) struct pimreghdr *pim; struct iphdr *encap; struct net_device *reg_dev = NULL; + struct ipmr_table *table; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; @@ -1524,9 +1722,13 @@ static int pim_rcv(struct sk_buff * skb) ntohs(encap->tot_len) + sizeof(*pim) > skb->len) goto drop; + table = skb->dev->mrt_entry; + if (!table) + goto drop; + read_lock(&mrt_lock); - if (reg_vif_num >= 0) - reg_dev = vif_table[reg_vif_num].dev; + if (table->reg_vif_num >= 0) + reg_dev = table->vif_table[table->reg_vif_num].dev; if (reg_dev) dev_hold(reg_dev); read_unlock(&mrt_lock); @@ -1556,11 +1758,12 @@ static int pim_rcv(struct sk_buff * skb) #endif static int -ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) +ipmr_fill_mroute(struct ipmr_table *table, struct sk_buff *skb, + struct mfc_cache *c, struct rtmsg *rtm) { int ct; struct rtnexthop *nhp; - struct net_device *dev = vif_table[c->mfc_parent].dev; + struct net_device *dev = table->vif_table[c->mfc_parent].dev; u8 *b = skb_tail_pointer(skb); struct rtattr *mp_head; @@ -1576,7 +1779,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; + nhp->rtnh_ifindex = table->vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } @@ -1595,9 +1798,20 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) int err; struct mfc_cache *cache; struct rtable *rt = (struct rtable*)skb->dst; + struct ipmr_table *table; + struct net_device *dev; + + dev = dev_get_by_index(&init_net, rt->fl.iif); + if (!dev) + return -ENODEV; + + table = dev->mrt_entry; + dev_put(dev); + if (!table) + return -ENOENT; read_lock(&mrt_lock); - cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); + cache = ipmr_cache_find(table, rt->rt_src, rt->rt_dst); if (cache==NULL) { struct sk_buff *skb2; @@ -1611,7 +1825,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) } dev = skb->dev; - if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) { + if (dev == NULL || (vif = ipmr_find_vif(table, dev)) < 0) { read_unlock(&mrt_lock); return -ENODEV; } @@ -1628,14 +1842,14 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; iph->version = 0; - err = ipmr_cache_unresolved(vif, skb2); + err = ipmr_cache_unresolved(table, vif, skb2); read_unlock(&mrt_lock); return err; } if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) cache->mfc_flags |= MFC_NOTIFY; - err = ipmr_fill_mroute(skb, cache, rtm); + err = ipmr_fill_mroute(table, skb, cache, rtm); read_unlock(&mrt_lock); return err; } @@ -1645,17 +1859,26 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif */ struct ipmr_vif_iter { - int ct; + unsigned int bucket; + struct ipmr_table *table; + int ct; }; static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, loff_t pos) { - for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) { - if (!VIF_EXISTS(iter->ct)) - continue; - if (pos-- == 0) - return &vif_table[iter->ct]; + for (iter->bucket = 0; iter->bucket < IPMR_HSIZE; iter->bucket++) { + list_for_each_entry_rcu(iter->table, + &ipmr_table_hash[iter->bucket], + list) { + for (iter->ct = 0; iter->ct < iter->table->maxvif; + ++iter->ct) { + if (!VIF_EXISTS(iter->table, iter->ct)) 
+ continue; + if (pos-- == 0) + return &iter->table->vif_table[iter->ct]; + } + } } return NULL; } @@ -1676,11 +1899,27 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (v == SEQ_START_TOKEN) return ipmr_vif_seq_idx(iter, 0); - while (++iter->ct < maxvif) { - if (!VIF_EXISTS(iter->ct)) +next_vif: + while (++iter->ct < iter->table->maxvif) { + if (!VIF_EXISTS(iter->table, iter->ct)) continue; - return &vif_table[iter->ct]; + return &iter->table->vif_table[iter->ct]; + } + +next_table: + if (iter->table->list.next != &ipmr_table_hash[iter->bucket]) { + iter->table = list_entry(iter->table->list.next, + struct ipmr_table, list); + iter->ct = -1; + goto next_vif; + } + + while (++iter->bucket < IPMR_HSIZE) { + iter->table = list_entry(&ipmr_table_hash[iter->bucket], + struct ipmr_table, list); + goto next_table; } + return NULL; } @@ -1694,17 +1933,17 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, - "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); + "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote TableId\n"); } else { const struct vif_device *vif = v; const char *name = vif->dev ? vif->dev->name : "none"; seq_printf(seq, - "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", - vif - vif_table, + "%2d %-10s %8ld %7ld %8ld %7ld %05X %08X %08X %d\n", + vif->vif_index, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, - vif->flags, vif->local, vif->remote); + vif->flags, vif->local, vif->remote, vif->table_id); } return 0; } @@ -1731,8 +1970,10 @@ static const struct file_operations ipmr_vif_fops = { }; struct ipmr_mfc_iter { - struct mfc_cache **cache; - int ct; + unsigned int bucket; + struct ipmr_table *table; + struct mfc_cache **cache; + int ct; }; @@ -1740,22 +1981,29 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos) { struct mfc_cache *mfc; - it->cache = mfc_cache_array; - read_lock(&mrt_lock); - for (it->ct = 0; it->ct < MFC_LINES; it->ct++) - for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) - if (pos-- == 0) - return mfc; - read_unlock(&mrt_lock); - - it->cache = &mfc_unres_queue; - spin_lock_bh(&mfc_unres_lock); - for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) - if (pos-- == 0) - return mfc; - spin_unlock_bh(&mfc_unres_lock); + for (it->bucket = 0; it->bucket < IPMR_HSIZE; it->bucket++) { + list_for_each_entry_rcu(it->table, + &ipmr_table_hash[it->bucket], + list) { + it->cache = it->table->mfc_cache_array; + read_lock(&mrt_lock); + for (it->ct = 0; it->ct < MFC_LINES; it->ct++) + for (mfc = it->table->mfc_cache_array[it->ct]; + mfc; mfc = mfc->next) + if (pos-- == 0) + return mfc; + read_unlock(&mrt_lock); - it->cache = NULL; + it->cache = &it->table->mfc_unres_queue; + spin_lock_bh(&it->table->mfc_unres_lock); + for (mfc = it->table->mfc_unres_queue; mfc; + mfc = mfc->next) + if (pos-- == 0) + return mfc; + spin_unlock_bh(&it->table->mfc_unres_lock); + it->cache = NULL; + } + } return NULL; } @@ -1782,31 +2030,48 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (mfc->next) return mfc->next; - if (it->cache == &mfc_unres_queue) + next_mfc: + if (it->cache == &it->table->mfc_unres_queue) goto end_of_list; - BUG_ON(it->cache != mfc_cache_array); + BUG_ON(it->cache != it->table->mfc_cache_array); while (++it->ct < MFC_LINES) { - mfc = mfc_cache_array[it->ct]; + mfc = it->table->mfc_cache_array[it->ct]; if (mfc) return mfc; } /* exhausted cache_array, show unresolved */ 
read_unlock(&mrt_lock); - it->cache = &mfc_unres_queue; + it->cache = &it->table->mfc_unres_queue; it->ct = 0; - spin_lock_bh(&mfc_unres_lock); - mfc = mfc_unres_queue; + spin_lock_bh(&it->table->mfc_unres_lock); + mfc = it->table->mfc_unres_queue; if (mfc) return mfc; end_of_list: - spin_unlock_bh(&mfc_unres_lock); + spin_unlock_bh(&it->table->mfc_unres_lock); it->cache = NULL; + next_table: + if (it->table->list.next != &ipmr_table_hash[it->bucket]) { + it->table = list_entry(it->table->list.next, + struct ipmr_table, list); + it->ct = -1; + read_lock(&mrt_lock); + it->cache = it->table->mfc_cache_array; + goto next_mfc; + } + + while (++it->bucket < IPMR_HSIZE) { + it->table = list_entry(&ipmr_table_hash[it->bucket], + struct ipmr_table, list); + goto next_table; + } + return NULL; } @@ -1814,9 +2079,9 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) { struct ipmr_mfc_iter *it = seq->private; - if (it->cache == &mfc_unres_queue) - spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == mfc_cache_array) + if (it->cache == &it->table->mfc_unres_queue) + spin_unlock_bh(&it->table->mfc_unres_lock); + else if (it->cache == it->table->mfc_cache_array) read_unlock(&mrt_lock); } @@ -1826,23 +2091,24 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_puts(seq, - "Group Origin Iif Pkts Bytes Wrong Oifs\n"); + "Group Origin Iif Pkts Bytes Wrong-Oifs TableId\n"); } else { const struct mfc_cache *mfc = v; const struct ipmr_mfc_iter *it = seq->private; - seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld", + seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld %u", (unsigned long) mfc->mfc_mcastgrp, (unsigned long) mfc->mfc_origin, mfc->mfc_parent, mfc->mfc_un.res.pkt, mfc->mfc_un.res.bytes, - mfc->mfc_un.res.wrong_if); + mfc->mfc_un.res.wrong_if, + it->table->id); - if (it->cache != &mfc_unres_queue) { + if (it->cache != &it->table->mfc_unres_queue) { for (n = mfc->mfc_un.res.minvif; n < mfc->mfc_un.res.maxvif; n++ ) { - if (VIF_EXISTS(n) + if (VIF_EXISTS(it->table, n) && mfc->mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", @@ -1889,6 +2155,19 @@ static struct net_protocol pim_protocol = { void __init ip_mr_init(void) { + unsigned int i; + + for (i = 0; i < IPMR_HSIZE; i++) + INIT_LIST_HEAD(&ipmr_table_hash[i]); + ipmr_table_create(DFLT_MROUTE_TBL); + + printk("mroute ioctl struct sizes: mfcctl: %i mfcctl_ng: %i mrt_sockopt_simple: %i" + " sioc_sg_req: %i sioc_sg_req_ng: %i sioc_vif_req: %i sioc_vif_req_ng: %i\n", + (int)(sizeof(struct mfcctl)), (int)(sizeof(struct mfcctl_ng)), + (int)(sizeof(struct mrt_sockopt_simple)), + (int)(sizeof(struct sioc_sg_req)), (int)(sizeof(struct sioc_sg_req_ng)), + (int)(sizeof(struct sioc_vif_req)), (int)(sizeof(struct sioc_vif_req_ng))); + mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 4618ea0..eb0185e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -52,9 +52,10 @@ static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, static int ipv4_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { - return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u mark=%u ", NIPQUAD(tuple->src.u3.ip), - NIPQUAD(tuple->dst.u3.ip)); + NIPQUAD(tuple->dst.u3.ip), + tuple->mark); } /* Returns new sk_buff, or 
NULL */ diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 36b4e3b..2e77e03 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -81,7 +81,7 @@ hash_by_src(const struct nf_conntrack_tuple *tuple) /* Original src, to ensure we map it consistently if poss. */ hash = jhash_3words((__force u32)tuple->src.u3.ip, - (__force u32)tuple->src.u.all, + (__force u32)tuple->src.u.all ^ tuple->mark, tuple->dst.protonum, 0); return ((u64)hash * nf_nat_htable_size) >> 32; } @@ -140,7 +140,8 @@ same_src(const struct nf_conn *ct, t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; return (t->dst.protonum == tuple->dst.protonum && t->src.u3.ip == tuple->src.u3.ip && - t->src.u.all == tuple->src.u.all); + t->src.u.all == tuple->src.u.all && + t->mark == tuple->mark); } /* Only called for SRC manip */ @@ -213,7 +214,7 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple, minip = ntohl(range->min_ip); maxip = ntohl(range->max_ip); j = jhash_2words((__force u32)tuple->src.u3.ip, - (__force u32)tuple->dst.u3.ip, 0); + (__force u32)tuple->dst.u3.ip ^ tuple->mark, 0); j = ((u64)j * (maxip - minip + 1)) >> 32; *var_ipp = htonl(minip + j); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 00156bf..e2bd7ce 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -597,6 +597,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->saddr, /* XXX */ sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; + arg.bound_dev_if = ((struct rtable *)skb->dst)->fl.iif; ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 803d758..3441204 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -80,7 +80,9 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (tcp_too_many_orphans(sk, orphans)) { if (net_ratelimit()) - printk(KERN_INFO "Out of socket memory\n"); + printk(KERN_INFO "Out of socket memory, orphans: %d/%d tcp_memory_allocated: %d/%d\n", + orphans, sysctl_tcp_max_orphans, atomic_read(&tcp_memory_allocated), + sysctl_tcp_mem[2]); /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 47263e4..36e7621 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -19,6 +19,9 @@ config IPV6 To compile this protocol support as a module, choose M here: the module will be called ipv6. 
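The CONFIG_IPV6_REF_DEBUG option added below builds net/ipv6/debug.c, which exports per-callsite tracked versions of the inet6_dev/inet6_ifaddr refcount operations. The header-side glue that routes the stock wrappers through those helpers is not visible in this series; a minimal sketch of what it presumably looks like follows. The placement (include/net/if_inet6.h) and the exact wrappers redirected are assumptions here; the tracked helpers themselves are the ones exported further down.

#ifdef CONFIG_IPV6_REF_DEBUG
/* Assumed glue, not part of this patch: forward the usual refcount
 * wrappers to the tracked variants from net/ipv6/debug.c so that every
 * hold/put is logged together with its call site.
 */
#define in6_dev_hold(idev)	in6_dev_atomic_inc((idev), __func__, __LINE__)
#define __in6_dev_put(idev)	in6_dev_atomic_dec((idev), __func__, __LINE__)
#define in6_ifa_hold(ifa)	in6_ifa_atomic_inc((ifa), __func__, __LINE__)
#define __in6_ifa_put(ifa)	in6_ifa_atomic_dec((ifa), __func__, __LINE__)
#endif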
+config IPV6_REF_DEBUG + bool "IPv6: refcount debugging" + config IPV6_PRIVACY bool "IPv6: Privacy Extensions support" depends on IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 24f3aa0..c047de4 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -35,5 +35,6 @@ obj-$(CONFIG_IPV6_SIT) += sit.o obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-y += addrconf_core.o exthdrs_core.o +obj-$(CONFIG_IPV6_REF_DEBUG) += debug.o obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e08955b..29dccba 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -146,6 +146,7 @@ static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(unsigned long data); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); +static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void inet6_prefix_notify(int event, struct inet6_dev *idev, struct prefix_info *pinfo); @@ -185,6 +186,7 @@ struct ipv6_devconf ipv6_devconf __read_mostly = { #endif .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ + .nlnotify_on_addr_add = 0, /* by default, only notify when it becomes un-tentative */ }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -217,6 +219,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { #endif .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ + .nlnotify_on_addr_add = 0, /* by default, only notify when it becomes un-tentative */ }; /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ @@ -641,6 +644,13 @@ out2: ifa = ERR_PTR(err); } + /* Allow netlink notification of all addresses, regardless of flags. */ + if (ipv6_devconf.nlnotify_on_addr_add || idev->cnf.nlnotify_on_addr_add) { + if (!IS_ERR(ifa)) { + inet6_ifa_notify(RTM_NEWADDR, ifa); + } + } + return ifa; out: write_unlock(&addrconf_hash_lock); @@ -2549,6 +2559,9 @@ static int addrconf_ifdown(struct net_device *dev, int how) else ipv6_mc_down(idev); + /*if (how) + ipv6_ac_destroy_dev(idev);*/ + idev->tstamp = jiffies; /* Shot the device (if unregistered) */ @@ -3556,6 +3569,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, #ifdef CONFIG_IPV6_OPTIMISTIC_DAD array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; #endif + array[DEVCONF_NLNOTIFY_ON_ADDR_ADD] = cnf->nlnotify_on_addr_add; } static inline size_t inet6_if_nlmsg_size(void) @@ -4094,6 +4108,15 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_IPV6_NLNOTIFY_ON_ADDR_ADD, + .procname = "nlnotify_on_addr_add", + .data = &ipv6_devconf.nlnotify_on_addr_add, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + + }, #ifdef CONFIG_IPV6_OPTIMISTIC_DAD { .ctl_name = CTL_UNNUMBERED, diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index e5f56c9..54c9b00 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -382,6 +382,34 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) return 0; } +/* + +Works around some leaks, ie like when I used ipv6_ifa_notify instead of inet6_ifa_notify. +Shouldn't be needed in a properly working system. 
+--Ben + +void ipv6_ac_destroy_dev(struct inet6_dev *idev) +{ + struct ifacaddr6 *aca; + + write_lock_bh(&idev->lock); + while ((aca = idev->ac_list) != NULL) { + idev->ac_list = aca->aca_next; + write_unlock_bh(&idev->lock); + + addrconf_leave_solict(idev, &aca->aca_addr); + + dst_hold(&aca->aca_rt->u.dst); + ip6_del_rt(aca->aca_rt); + + aca_put(aca); + + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} +*/ + static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr) { int ret; diff --git a/net/ipv6/debug.c b/net/ipv6/debug.c new file mode 100644 index 0000000..94a62d3 --- /dev/null +++ b/net/ipv6/debug.c @@ -0,0 +1,86 @@ +#include <linux/module.h> +#include <net/if_inet6.h> + +static int ip6_ref_debug = 1; + +void in6_dev_atomic_dec(struct inet6_dev *idev, const char *func, int line) +{ + int refcnt; + + BUG_ON(atomic_read(&idev->refcnt) == 0); + if ((unlikely(ip6_ref_debug)) && (strcmp(idev->dev->name, "rddVR0") == 0)) { + refcnt = atomic_read(&idev->refcnt); + printk(KERN_DEBUG "%s: in6_dev_atomic_dec %d->%d idev: %p %s:%d\n", + idev->dev->name, refcnt, refcnt - 1, idev, func, line); + } + atomic_dec(&idev->refcnt); +} +EXPORT_SYMBOL(in6_dev_atomic_dec); + +int in6_dev_atomic_dec_and_test(struct inet6_dev *idev, const char *func, int line) +{ + int refcnt; + + BUG_ON(atomic_read(&idev->refcnt) == 0); + if ((unlikely(ip6_ref_debug)) && (strcmp(idev->dev->name, "rddVR0") == 0)) { + refcnt = atomic_read(&idev->refcnt); + printk(KERN_DEBUG "%s: in6_dev_atomic_dec_and_test %d->%d idev: %p %s:%d\n", + idev->dev->name, refcnt, refcnt - 1, idev, func, line); + } + return atomic_dec_and_test(&idev->refcnt); +} +EXPORT_SYMBOL(in6_dev_atomic_dec_and_test); + +void in6_dev_atomic_inc(struct inet6_dev *idev, const char *func, int line) +{ + int refcnt; + + if ((unlikely(ip6_ref_debug)) && (strcmp(idev->dev->name, "rddVR0") == 0)) { + refcnt = atomic_read(&idev->refcnt); + printk(KERN_DEBUG "%s: in6_dev_atomic_inc %d->%d idev: %p %s:%d\n", + idev->dev->name, refcnt, refcnt + 1, idev, func, line); + } + atomic_inc(&idev->refcnt); +} +EXPORT_SYMBOL(in6_dev_atomic_inc); + +void in6_ifa_atomic_dec(struct inet6_ifaddr *ifa, const char *func, int line) +{ + int refcnt; + + BUG_ON(atomic_read(&ifa->refcnt) == 0); + if ((unlikely(ip6_ref_debug)) && (strcmp(ifa->idev->dev->name, "rddVR0") == 0)) { + refcnt = atomic_read(&ifa->refcnt); + printk(KERN_DEBUG "%s: in6_ifa_atomic_dec %d->%d ifa: %p %s:%d\n", + ifa->idev->dev->name, refcnt, refcnt - 1, ifa, func, line); + } + atomic_dec(&ifa->refcnt); +} +EXPORT_SYMBOL(in6_ifa_atomic_dec); + +int in6_ifa_atomic_dec_and_test(struct inet6_ifaddr *ifa, const char *func, int line) +{ + int refcnt; + + BUG_ON(atomic_read(&ifa->refcnt) == 0); + if ((unlikely(ip6_ref_debug)) && (strcmp(ifa->idev->dev->name, "rddVR0") == 0)) { + refcnt = atomic_read(&ifa->refcnt); + printk(KERN_DEBUG "%s: in6_ifa_atomic_dec_and_test %d->%d ifa: %p %s:%d\n", + ifa->idev->dev->name, refcnt, refcnt - 1, ifa, func, line); + } + return atomic_dec_and_test(&ifa->refcnt); +} +EXPORT_SYMBOL(in6_ifa_atomic_dec_and_test); + +void in6_ifa_atomic_inc(struct inet6_ifaddr *ifa, const char *func, int line) +{ + int refcnt; + + if ((unlikely(ip6_ref_debug)) && (strcmp(ifa->idev->dev->name, "rddVR0") == 0)) { + refcnt = atomic_read(&ifa->refcnt); + printk(KERN_DEBUG "%s: in6_ifa_atomic_inc %d->%d ifa: %p %s:%d\n", + ifa->idev->dev->name, refcnt, refcnt + 1, ifa, func, line); + } + atomic_inc(&ifa->refcnt); +} +EXPORT_SYMBOL(in6_ifa_atomic_inc); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index
695c0ca..f50c8c7 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -256,7 +256,7 @@ static int __init fib6_default_rules_init(void) { int err; - err = fib_default_rule_add(&fib6_rules_ops, 0, + err = fib_default_rule_add(&fib6_rules_ops, 256, RT6_TABLE_LOCAL, FIB_RULE_PERMANENT); if (err < 0) return err; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 3717bdf..633b7bc 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -56,9 +56,10 @@ static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, static int ipv6_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { - return seq_printf(s, "src=" NIP6_FMT " dst=" NIP6_FMT " ", + return seq_printf(s, "src=" NIP6_FMT " dst=" NIP6_FMT " mark=%u ", NIP6(*((struct in6_addr *)tuple->src.u3.ip6)), - NIP6(*((struct in6_addr *)tuple->dst.u3.ip6))); + NIP6(*((struct in6_addr *)tuple->dst.u3.ip6)), + tuple->mark); } /* diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8785784..18f8627 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -82,7 +82,7 @@ static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, */ n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); h = jhash2((u32 *)tuple, n, - rnd ^ (((__force __u16)tuple->dst.u.all << 16) | + tuple->mark ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); return ((u64)h * size) >> 32; @@ -112,6 +112,7 @@ nf_ct_get_tuple(const struct sk_buff *skb, tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; + tuple->mark = skb->mark; return l4proto->pkt_to_tuple(skb, dataoff, tuple); } @@ -160,8 +161,8 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, return 0; inverse->dst.dir = !orig->dst.dir; - inverse->dst.protonum = orig->dst.protonum; + inverse->mark = orig->mark; return l4proto->invert_tuple(inverse, orig); } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index d15d70f..17bb61c 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -222,6 +222,7 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) return NULL; new->master = me; + new->tuple.mark = me->tuplehash[IP_CT_DIR_ORIGINAL].tuple.mark; atomic_set(&new->use, 1); INIT_RCU_HEAD(&new->rcu); return new; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b8b827c..fd81705 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -79,6 +79,7 @@ #include #include #include +#include #ifdef CONFIG_INET #include @@ -317,7 +318,14 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, struct net_device *dev; __be16 proto=0; int err; + int kludge = 0; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (sk->sk_flags & SOCK_DONT_DO_LL_FCS) { + kludge = 4; // We're doing our own CRC + } +#endif + /* * Get and verify the address. 
*/ @@ -352,7 +360,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, */ err = -EMSGSIZE; - if (len > dev->mtu + dev->hard_header_len) + if (len > (dev->mtu + dev->hard_header_len + kludge)) goto out_unlock; err = -ENOBUFS; @@ -394,6 +402,16 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, if (err) goto out_free; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (sk->sk_flags & SOCK_DONT_DO_LL_FCS) { + skb->use_specified_ether_crc = 1; + } + else { + skb->use_specified_ether_crc = 0; + } +#endif + + /* * Now send it */ @@ -705,6 +723,13 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, __be16 proto; unsigned char *addr; int ifindex, err, reserve = 0; + int kludge = 0; + +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (sk->sk_flags & SOCK_DONT_DO_LL_FCS) { + kludge = 4; // We're doing our own CRC + } +#endif /* * Get and verify the address. @@ -740,7 +765,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, goto out_unlock; err = -EMSGSIZE; - if (len > dev->mtu+reserve) + if (len > (dev->mtu + reserve + kludge)) goto out_unlock; skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev), @@ -765,6 +790,15 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, skb->dev = dev; skb->priority = sk->sk_priority; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (sk->sk_flags & SOCK_DONT_DO_LL_FCS) { + skb->use_specified_ether_crc = 1; + } + else { + skb->use_specified_ether_crc = 0; + } +#endif + /* * Now send it */ diff --git a/net/redir/Kconfig b/net/redir/Kconfig new file mode 100644 index 0000000..3abfbe1 --- /dev/null +++ b/net/redir/Kconfig @@ -0,0 +1,7 @@ +config REDIRDEV + tristate "Redirect-net-device support" + depends on EXPERIMENTAL + ---help--- + This allows one to create virtual interfaces that effectively + swap tx for rx, allowing one to create bridges and similar + constructs all in the same machine. diff --git a/net/redir/Makefile b/net/redir/Makefile new file mode 100644 index 0000000..70d4dcb --- /dev/null +++ b/net/redir/Makefile @@ -0,0 +1,10 @@ +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +obj-$(CONFIG_REDIRDEV) := redirdev.o + + diff --git a/net/redir/redirdev.c b/net/redir/redirdev.c new file mode 100644 index 0000000..3b8d5e2 --- /dev/null +++ b/net/redir/redirdev.c @@ -0,0 +1,976 @@ +/* -*- linux-c -*- +####################################################################### +# +# (C) Copyright 2005 +# Ben Greear +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of +# the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA +####################################################################### +# Notes: +# +# This file implements the Redirect-net-device module. 
A pair of +# redir devices linked to each other act like two ethernet interfaces +# connected with a cross-over cable. +# +# This provides an IOCTL interface which allows you to +# +# 1. create redirect device +# 2. delete redirect device +# +####################################################################### +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS +#include +#define RDD_PROC_DIR "redirdev" +#define RDD_PROC_CFG "config" +static struct proc_dir_entry *rdd_proc_dir; +static struct proc_dir_entry *rdd_proc_cfg; +#endif + +#include "redirdev.h" + +/* Defined in socket.c */ +void redirdev_ioctl_set(int (*hook)(void*)); +static int redirdev_device_event(struct notifier_block *unused, + unsigned long event, void *ptr); + +static struct notifier_block redirdev_notifier_block = { + .notifier_call = redirdev_device_event, +}; + +/*********************************************************/ +/* defines */ +/*********************************************************/ + +/* Must hold this lock to make any changes to the Redirect-Device structures. + */ +static spinlock_t rdd_cfg_lock = SPIN_LOCK_UNLOCKED; + +static int do_quotas = 1; /* global enable/disable for quota logic. */ + + +/*********************************************************/ +/* file scope variables */ +/*********************************************************/ + +static struct redirdev* rdds = NULL; + +static atomic_t rdd_dev_counter; + +static int debug_lvl = 0; + + +/*********************************************************/ +/* forward declarations */ +/*********************************************************/ + +#ifdef RDD_CONFIG_PROC_FS +static int read_rdd_glbl(char *page, char **start, off_t off, + int count, int *eof, void *data); +static int write_rdd_glbl(struct file *file, const char *buffer, + unsigned long count, void *data); +#endif + + + +/*********************************************************/ +/* function definitions */ +/*********************************************************/ + + +#define iswhitespace(x)\ + ((x) == ' ' || (x) == '\n' || (x) == '\r' || (x) == '\t' ) + +#define skip_whitespace(x) { while (iswhitespace(*x)) (x)++; } + +static int copy_next_word(char *dst, char *src, int len) { + char *p; + for (p=src; p < src + len ; p++) { + if ( iswhitespace(*p)) + break; + *dst++ = *p; + } + return p - src; +} + +/* Grab the RDD lock before calling this method. */ +struct redirdev* rdd_find_dev_by_name(const char* ifname) { + struct redirdev* d; + //printk("finding port for underlying ifname: %s\n", ifname); + for (d = rdds; d; d = d->next) { + //printk("Testing port: %p name: %s\n", port, port->dev->name); + if (strcmp(d->dev->name, ifname) == 0) { + break; + } + } + //printk("done finding port: %p\n", port); + return d; +} + +/* Grab the RDD lock before calling this method.
*/ +struct redirdev* rdd_find_dev_by_txdev_name(const char* ifname) { + struct redirdev* d; + for (d = rdds; d; d = d->next) { + if (d->tx_dev) { + if (strcmp(d->tx_dev->name, ifname) == 0) { + break; + } + } + } + return d; +} + + +static struct net_device_stats *redirdev_get_stats(struct net_device *dev) +{ + struct redirdev* rdd = dev->priv; + + return &rdd->statistics; +} + +/** Bump our tx counters and then act as if this was received from + * the network on the tx_dev device. Since we don't do any CSUM + * activity in this driver, make sure the SKB is marked as not checksummed + * yet. + */ +static int redirdev_xmit(struct sk_buff *skb, struct net_device *dev) { + struct redirdev* rdd = dev->priv; + struct net_device_stats* txs; + + if (unlikely((!rdd->tx_dev) && rdd->wants_to_run)) { + rdd->tx_dev = dev_get_by_name(&init_net, rdd->tx_dev_name); + if (rdd->tx_dev) { + printk("redir: Associated tx_dev_name: %s with device: %p in redirdev_xmit\n", + rdd->tx_dev_name, rdd->tx_dev); + } + } + + if (unlikely(!rdd->tx_dev)) { + printk("ERROR: tx_dev null in redirdev_xmit.\n"); + kfree_skb(skb); + rdd->statistics.tx_errors++; + goto out; + } + + skb_orphan(skb); /* release this skb from the sending socket's accounting. */ + + //printk("%s: dev: %s tx_dev: %s\n", + // __PRETTY_FUNCTION__, dev->name, rdd->tx_dev->name); + + if (netif_running(rdd->tx_dev)) { + + /* The skb was orphaned above, so the sending socket's + * accounting stays correct while we hand the buffer + * straight to the receiving device. + */ + + int rv; + skb->dev = rdd->tx_dev; + + /* We didn't calculate the csum, so mark as such. */ + skb->ip_summed = CHECKSUM_UNNECESSARY;//NONE; + + rdd->statistics.tx_packets++; + rdd->statistics.tx_bytes += skb->len; + + txs = rdd->tx_dev->get_stats(rdd->tx_dev); + txs->rx_packets++; + txs->rx_bytes += skb->len; + + /* Zero out the time-stamp so that receiving code is forced + * to recalculate it. + */ + skb->tstamp.tv64 = 0; + + /* Call this on the receiving net device. This assumes + * that all devices are ethernet or ethernet-like. Valid + * for now. TODO: Generalize tx_dev ?? + */ + skb->pkt_type = PACKET_HOST; //Reset this to default. + + skb->protocol = eth_type_trans(skb, skb->dev); + + if (skb->dst) { + dst_release(skb->dst); + skb->dst = NULL; + } + + /* Remove any connection tracking info */ + nf_reset(skb); + /* Clear skb->mark */ + skb->mark = 0; + + rdd->dev->trans_start = jiffies; + + //printk("skb->protocol: %x pkt_type: %u\n", + // (unsigned int)(skb->protocol), + // (unsigned int)(skb->pkt_type)); + rv = netif_rx(skb); + if (rv != NET_RX_SUCCESS) { + // TODO: Remove + //printk("netif_rx rv: %i\n", (int)(rv)); + } + + rdd->tx_dev->last_rx = jiffies; + } + else { + /* Chuck the packet and log some errors */ + rdd->statistics.tx_errors++; + kfree_skb(skb); + } + +out: + /* -1 means ignore, and -2 means ignore sets as well. This is to + * disable the quota logic w/out the calling code knowing. + * Used for debugging.
+ */ + if (do_quotas && (rdd->quota != 0xFFFFFFFF)) { + if (rdd->quota > 0) { + rdd->quota--; + } + if (rdd->quota == 0) { + // Stop the tx-queue + netif_stop_queue(dev); + } + } + + return 0; +}/* redir xmit */ + +static int redirdev_open(struct net_device *dev) { + struct redirdev* rdd = dev->priv; + rdd->wants_to_run = 1; + if (!rdd->tx_dev) { + rdd->tx_dev = dev_get_by_name(&init_net, rdd->tx_dev_name); + } + if (!rdd->tx_dev) { + printk("redir: %s Warning: Could not find tx_dev: %s, will try later in redirdev_xmit.\n", + dev->name, rdd->tx_dev_name); + } + + printk("redirdev: Starting device: %s\n", dev->name); + netif_start_queue(dev); + return 0; +} + +//static void redirdev_set_multicast_list(struct net_device *dev) { + /* TODO ??? */ +//} + +static int redirdev_stop(struct net_device *dev) { + struct redirdev* rdd = dev->priv; + printk("redirdev: stopping device: %s\n", dev->name); + netif_stop_queue(dev); + rdd->wants_to_run = 0; + if (rdd->tx_dev) { + struct net_device* tmp = rdd->tx_dev; + rdd->tx_dev = NULL; + printk(" releasing reference to dev: %s\n", tmp->name); + dev_put(tmp); + } + printk(" done stopping %s\n", dev->name); + return 0; +} + + +void redirdev_dev_destructor(struct net_device *dev) { + atomic_dec(&rdd_dev_counter); + if (dev->priv) { + //printk("dst: %s", dev->name); + kfree(dev->priv); + dev->priv = NULL; + } + else { + //printk("dst2: %s", dev->name); + } +} + +int redirdev_change_mtu(struct net_device *dev, int new_mtu) { + dev->mtu = new_mtu; + return 0; +} + +void rdd_setup(struct net_device* dev) { + ether_setup(dev); + + dev->get_stats = redirdev_get_stats; + dev->hard_start_xmit = redirdev_xmit; + dev->change_mtu = redirdev_change_mtu; + dev->open = redirdev_open; + dev->stop = redirdev_stop; + dev->destructor = redirdev_dev_destructor; + + dev->dev_addr[0] = 0; + dev->dev_addr[1] = net_random(); + dev->dev_addr[2] = net_random(); + dev->dev_addr[3] = net_random(); + dev->dev_addr[4] = net_random(); + dev->dev_addr[5] = net_random(); +} + +static int redirdev_create(const char* newifname, + const char* txdevname) { + struct redirdev *rdd = NULL; + struct net_device* td = NULL; + struct net_device* nnd = NULL; + struct net_device* txd = NULL; + unsigned long flags; + int rv; + + if ((strlen(txdevname) == 0) || + (strlen(newifname) == 0)) { + printk("redirdev: ERROR: Must specify ifname and txifname" + " when creating redirect devices!\n"); + rv = -ENODEV; + goto out; + } + + printk("redirdev: creating interface: -:%s:- with tx_dev: -:%s:-\n", + newifname, txdevname); + + + //printk("malloc "); + if ((rdd = kmalloc(sizeof(*rdd), GFP_KERNEL)) == NULL) { + //printk("redirdev: kmalloc failure\n"); + rv = -ENOMEM; + goto outfree; + } + memset(rdd, 0, sizeof(*rdd)); + rdd->quota = 0xFFFFFFFF; // Default to not use quota. 
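	/* Aside, not part of the original patch: redirdev_xmit() above treats a
	 * quota of 0xFFFFFFFF as "disabled"; any other value is the number of
	 * frames the device may still transmit (when the global do_quotas knob
	 * is set).  Each transmit attempt decrements it, and at zero the tx
	 * queue is stopped until user space re-arms it with REDIRDEV_SET_QUOTA
	 * below, which stores req.flags as the new quota and wakes the queue.
	 * A caller might drive that roughly as follows, where redir_ioctl_cmd
	 * stands in for the private socket-ioctl number wired up through
	 * redirdev_ioctl_set() in socket.c (not shown in this patch) and
	 * "rdd0" is just an example device name:
	 *
	 *	struct redirdev_ioctl req = { 0 };
	 *	req.cmd = REDIRDEV_SET_QUOTA;
	 *	req.ifidx = if_nametoindex("rdd0");
	 *	req.flags = 1000;	// allow 1000 more frames
	 *	ioctl(sock_fd, redir_ioctl_cmd, &req);
	 */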
+ + if ((td = dev_get_by_name(&init_net, newifname)) != NULL) { + //printk("redirdev: device by that name already exists\n"); + rv = -EEXIST; + goto outfree; + } + + //printk("4 "); + nnd = alloc_netdev(0, newifname, rdd_setup); + if (nnd == NULL) { + //printk("redirdev: kmalloc net_device failure\n"); + rv = -ENOMEM; + goto outfree; + } + + /* If it's not here yet, no problem, will associate later */ + txd = dev_get_by_name(&init_net, txdevname); + strncpy(rdd->tx_dev_name, txdevname, IFNAMSIZ); + + //printk("4 "); + rdd->dev = nnd; + + //printk("5 "); + + dev_hold(rdd->dev); /* RDD code holds reference */ + + rdd->dev->priv = rdd; + rdd->tx_dev = txd; + + + // Defaults are fine for these + //rdd->dev->rebuild_header = redirdev_dev_rebuild_header; + //rdd->dev->set_multicast_list = redirdev_set_multicast_list; + //rdd->dev->hard_header = redirdev_hard_header; + + /* No qdisc for us */ + rdd->dev->qdisc = NULL; + rdd->dev->tx_queue_len = 0; + + //printk("redirdev: created redirect-device %p\n", vlan); + + /* link to list */ + //printk("8 "); + spin_lock_irqsave(&rdd_cfg_lock, flags); + rdd->next = rdds; + rdds = rdd; + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + //printk("End of redirdev_create, registering rdd->dev: %p (%s)\n", + // rdd->dev, rdd->dev->name); + + register_netdev(rdd->dev); + + //printk("End of mac_vlan create2\n"); + + atomic_inc(&rdd_dev_counter); + //printk("9\n"); + rv = 0; + goto out; + + /* Error case, clean up redirdev memory */ + outfree: + if (rdd) { + kfree(rdd); + } + if (nnd) { + free_netdev(nnd); + } + if (td) { + dev_put(td); + } + if (txd) { + dev_put(txd); + } + out: + return rv; +} /* redirdev_create */ + +static int redirdev_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) { + struct net_device* dev = ptr; + struct redirdev* rdd; + unsigned long flags; + + spin_lock_irqsave(&rdd_cfg_lock, flags); + rdd = rdd_find_dev_by_txdev_name(dev->name); + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (!rdd) { + //printk("redirdev: Ignoring event: %lu for device: %s\n", + // event, dev->name); + goto out; + } + + + /* It is OK that we do not hold the group lock right now, + * as we run under the RTNL lock. + */ + + switch (event) { + case NETDEV_CHANGE: + case NETDEV_DOWN: + //printk("redirdev: Ignoring change/up/down for device: %s\n", + // dev->name); + /* Ignore for now */ + break; + + case NETDEV_UP: + /* Start the redir-dev too if it wants to run */ + if ((!netif_running(rdd->dev)) && rdd->wants_to_run) { + printk("Device: %s is up, starting redir-device: %s too.\n", + dev->name, rdd->dev->name); + dev_open(rdd->dev); + } + break; + + case NETDEV_UNREGISTER: + /* Stop the redir-dev too */ + printk("Device: %s is going away, closing redir-device: %s too.\n", + dev->name, rdd->dev->name); + if (rdd->dev->flags & IFF_UP) { + /* Graceful shutdown, drop links to our peer. */ + dev_close(rdd->dev); + } + else { + /* Still drop links to peer...but dev_close would not have done anything. */ + redirdev_stop(rdd->dev); + } + rdd->wants_to_run = 1; /* was forced down.
*/ + break; + + }; + +out: + return NOTIFY_DONE; +} + +/* Has locking internally */ +int redirdev_cleanup(const char* ifname, int force) { + struct redirdev* d; //walker + struct redirdev* prev = NULL; + unsigned long flags; + int rv; + + //printk(__FUNCTION__"(%p)\n",vlan); + //printk("rdd_cln: %s", ifname); + + spin_lock_irqsave(&rdd_cfg_lock, flags); + for (d = rdds; d; d = d->next) { + if (strcmp(d->dev->name, ifname) == 0) { + if ((d->dev->flags & IFF_UP) && (!force)) { + rv = -EBUSY; + goto unlockout; + } + + // Un-link from the list. + if (prev) { + prev->next = d->next; + d->next = NULL; + } + else { + // This means we're first in line + rdds = d->next; + d->next = NULL; + } + + break; + } + prev = d; + } + + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (d) { + if (d->dev->flags & IFF_UP) { + BUG_ON(!force); + + rtnl_lock(); + dev_close(d->dev); + rtnl_unlock(); + } + + if (d->tx_dev) { + dev_put(d->tx_dev); + } + + dev_put(d->dev); + unregister_netdev(d->dev); + rv = 0; + } + else { + rv = -ENODEV; + } + goto out; + + unlockout: + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + out: + return rv; +} /* redirdev cleanup */ + + +static int redirdev_ioctl_deviceless_stub(void* arg) { + int err = 0; + struct redirdev_ioctl req; + unsigned long flags; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + switch (req.cmd) { + case REDIRDEV_ADD: { + /* + * create a new redirect device + */ + req.txifname[IFNAMSIZ-1] = '\0'; + req.ifname[IFNAMSIZ-1] = '\0'; + printk("Creating redir via ioctl, ifname: %s txifname: %s\n", + req.ifname, req.txifname); + + /* Has internal locking. */ + err = redirdev_create(req.ifname, req.txifname); + break; + } + case REDIRDEV_DEL: { + /* + * destroy a redirect device + */ + req.ifname[IFNAMSIZ-1] = '\0'; + + /* Has internal locking */ + err = redirdev_cleanup(req.ifname, 0); + break; + } + + case REDIRDEV_IS_REDIRDEV: { + /* + * Give user-space a chance of determining if we are a redirect-device + * or not. + * (If the IOCTL fails, we are not, otherwise we are.) + */ + struct redirdev* rdd; + req.ifname[IFNAMSIZ-1] = '\0'; + + spin_lock_irqsave(&rdd_cfg_lock, flags); + /* find the port in question */ + rdd = rdd_find_dev_by_name(req.ifname); + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (!rdd) { + /* printk("device: %s is NOT a REDIR device\n", ifname); */ + err = -ENODEV; + } + else { + /* printk("device: %s IS a MAC-VLAN\n", ifname); */ + err = 0; + } + break; + } + case REDIRDEV_SET_QUOTA: { + /* + * Set the quota. 0xFFFFFFFF means disable quota logic. + * (If the IOCTL fails, we are not, otherwise we are.) 
+ */ + struct redirdev* rdd; + struct net_device* dev; + + // Get device by idx; + dev = dev_get_by_index(&init_net, req.ifidx); + if (dev) { + if (dev->get_stats == redirdev_get_stats) { + rdd = dev->priv; + rdd->quota = req.flags; + netif_wake_queue(dev); + } + else { + err = -EINVAL; + } + dev_put(dev); + } + else { + err = -ENODEV; + } + break; + } + case REDIRDEV_GET_BY_IDX: { + /* + * get the nth redirdev name + */ + struct redirdev *rdd; + int n = req.ifidx; + + spin_lock_irqsave(&rdd_cfg_lock, flags); + /* find the port in question */ + for (rdd = rdds; rdd && n; rdd = rdd->next, n--); + if (!rdd) { + err = -ENODEV; + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + } + else { + memcpy(req.ifname, rdd->dev->name, IFNAMSIZ); + memcpy(req.txifname, rdd->tx_dev_name, IFNAMSIZ); + if (rdd->tx_dev) { + req.flags |= RDD_ASSOCIATED; + } + else { + req.flags &= ~RDD_ASSOCIATED; + } + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (copy_to_user(arg, &req, sizeof(req))) { + err = -EFAULT; + } + } + break; + } + case REDIRDEV_GET_BY_NAME: { + /* + * get info on the specified redirect device + */ + struct redirdev *rdd; + req.ifname[IFNAMSIZ-1] = '\0'; + + spin_lock_irqsave(&rdd_cfg_lock, flags); + /* find the port in question */ + rdd = rdd_find_dev_by_name(req.ifname); + + if (!rdd) { + err = -ENODEV; + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + } + else { + memcpy(req.ifname, rdd->dev->name, IFNAMSIZ); + memcpy(req.txifname, rdd->tx_dev_name, IFNAMSIZ); + if (rdd->tx_dev) { + req.flags |= RDD_ASSOCIATED; + } + else { + req.flags &= ~RDD_ASSOCIATED; + } + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (copy_to_user(arg, &req, sizeof(req))) { + err = -EFAULT; + } + } + break; + } + default: + printk("ERROR: Un-supported redirdev ioctl command: %u\n", + (unsigned int)(req.cmd)); + send_sig(SIGSEGV, current, 1); // TODO: Remove + err = -EOPNOTSUPP; + break; + }//switch + + /* printk("Returning err: %i\n", err); */ + return err; +}/* ioctl handler */ + + +#ifdef RDD_CONFIG_PROC_FS + +static int read_rdd_glbl(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int ret = -1; + char *p = page; + int mx_len = (4096 - (p - page)); + + if (! *eof ) { + struct redirdev* rdd; + int cnt; + unsigned long flags; + + /* Global counts here... 
+
+
+#ifdef RDD_CONFIG_PROC_FS
+
+static int read_rdd_glbl(char *page, char **start, off_t off,
+			 int count, int *eof, void *data) {
+	int ret = -1;
+	char *p = page;
+	int mx_len = (4096 - (p - page));
+
+	if (!*eof) {
+		struct redirdev* rdd;
+		int cnt;
+		unsigned long flags;
+
+		/* Global counts first, then one line per device. */
+		p += sprintf(p, "Redirect-devices: %i quotas-enabled: %i\ndev\ttx-dev\tquota\n",
+			     atomic_read(&rdd_dev_counter), do_quotas);
+
+		spin_lock_irqsave(&rdd_cfg_lock, flags);
+		rdd = rdds;
+		while (rdd) {
+			if (rdd->tx_dev) {
+				p += sprintf(p, "%s\t%s\t%i\n",
+					     rdd->dev->name, rdd->tx_dev->name, rdd->quota);
+			}
+			else {
+				/* Not yet associated: show the wanted name in brackets. */
+				p += sprintf(p, " %s\t[%s]\t%i\n",
+					     rdd->dev->name, rdd->tx_dev_name, rdd->quota);
+			}
+
+			/* catch overflow */
+			cnt = p - page;
+			if (cnt > (mx_len - 60)) {
+				if (mx_len - cnt >= 20) {
+					p += sprintf(p, "OUT_OF_SPACE!\n");
+				}
+				break;
+			}
+
+			rdd = rdd->next;
+		}
+
+		ret = p - page;
+		spin_unlock_irqrestore(&rdd_cfg_lock, flags);
+	}
+	return ret;
+} /* read_rdd_glbl */
+
+static int write_rdd_glbl(struct file *file, const char *buffer,
+			  unsigned long count, void *data) {
+	char *p;
+	const char *end;
+	int ret = count;
+	int len;
+	char dev_name[2][IFNAMSIZ];
+	char* tmps = NULL;
+	int tmp_rv;
+	char ss[50];
+	end = buffer + count;
+
+	/* Log the command, bounded to 'count' bytes. */
+	snprintf(ss, 50, "redir proc cmd: %%.%lus", count);
+	printk(ss, buffer);
+
+	for (p = (char *)buffer; p < end; ) {
+		if (iswhitespace(*p)) {
+			p++;
+			continue;
+		}
+
+		memset(dev_name[0], 0, IFNAMSIZ);
+		memset(dev_name[1], 0, IFNAMSIZ);
+
+		len = strlen("add_rdd ");
+		if (strncmp(p, "add_rdd ", len) == 0) {
+			p += len;
+
+			if ((p + IFNAMSIZ) <= end)
+				p += copy_next_word(dev_name[0], p, IFNAMSIZ);
+			else
+				p += copy_next_word(dev_name[0], p, end - p);
+
+			skip_whitespace(p);
+
+			if ((p + IFNAMSIZ) <= end)
+				p += copy_next_word(dev_name[1], p, IFNAMSIZ);
+			else
+				p += copy_next_word(dev_name[1], p, end - p);
+
+			skip_whitespace(p);
+
+			/* This can fail, but there is no good way to return
+			 * failure to user-space from here.
+			 * NOTE: Does its own internal locking.
+			 */
+			redirdev_create(dev_name[0], dev_name[1]);
+			goto forend;
+		}
+
+		len = strlen("remove_rdd ");
+		if (strncmp(p, "remove_rdd ", len) == 0) {
+			p += len;
+
+			if ((p + IFNAMSIZ) <= end)
+				p += copy_next_word(dev_name[0], p, IFNAMSIZ);
+			else
+				p += copy_next_word(dev_name[0], p, end - p);
+
+			skip_whitespace(p);
+
+			tmp_rv = redirdev_cleanup(dev_name[0], 0);
+			if (tmp_rv < 0) {
+				printk("redirdev: ERROR: Failed redirdev_cleanup, error: %d\n", tmp_rv);
+			}
+
+			goto forend;
+		}
+
+		len = strlen("debug_lvl ");
+		if (strncmp(p, "debug_lvl ", len) == 0) {
+			p += len;
+
+			if ((p + IFNAMSIZ) <= end)
+				p += copy_next_word(dev_name[0], p, IFNAMSIZ);
+			else
+				p += copy_next_word(dev_name[0], p, end - p);
+
+			skip_whitespace(p);
+
+			debug_lvl = simple_strtoul(dev_name[0], &tmps, 10);
+			goto forend;
+		}
+
+		len = strlen("do_quotas ");
+		if (strncmp(p, "do_quotas ", len) == 0) {
+			p += len;
+
+			if ((p + IFNAMSIZ) <= end)
+				p += copy_next_word(dev_name[0], p, IFNAMSIZ);
+			else
+				p += copy_next_word(dev_name[0], p, end - p);
+
+			skip_whitespace(p);
+
+			do_quotas = simple_strtoul(dev_name[0], &tmps, 10);
+			goto forend;
+		}
+
+		printk("redirdev: ERROR: Unsupported proc command\n");
+
+	forend:
+		p++;
+	}
+
+	return ret;
+} /* write_rdd_glbl */
+
+#endif
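+
+/* Example proc usage (illustrative; the actual path is built from the
+ * RDD_PROC_DIR and RDD_PROC_CFG names registered in redirdev_init() below,
+ * under /proc/net):
+ *
+ *	echo "add_rdd rdd0 eth0" > /proc/net/<RDD_PROC_DIR>/<RDD_PROC_CFG>
+ *	echo "remove_rdd rdd0"   > /proc/net/<RDD_PROC_DIR>/<RDD_PROC_CFG>
+ *	echo "do_quotas 1"       > /proc/net/<RDD_PROC_DIR>/<RDD_PROC_CFG>
+ *	cat /proc/net/<RDD_PROC_DIR>/<RDD_PROC_CFG>
+ */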
+
+
+static int __init redirdev_init(void) {
+	int err;
+	printk(KERN_INFO "Redirect-Network-Device: 1.0\n");
+
+	rdds = NULL;
+
+	redirdev_ioctl_set(redirdev_ioctl_deviceless_stub);
+
+#ifdef RDD_CONFIG_PROC_FS
+	rdd_proc_dir = proc_mkdir(RDD_PROC_DIR, init_net.proc_net);
+	if (rdd_proc_dir) {
+		rdd_proc_cfg = create_proc_read_entry(RDD_PROC_CFG, S_IRUGO, rdd_proc_dir,
+						      read_rdd_glbl, NULL);
+		if (rdd_proc_cfg) {
+			rdd_proc_cfg->write_proc = write_rdd_glbl;
+			rdd_proc_cfg->owner = THIS_MODULE;
+		}
+	}
+#endif
+
+	/* Register us to receive netdevice events */
+	err = register_netdevice_notifier(&redirdev_notifier_block);
+	if (err < 0) {
+		printk("ERROR: redirdev: Failed to register netdevice notifier callback!\n");
+	}
+
+	return 0;
+}
+
+static void redirdev_module_cleanup(void) {
+	char nm[IFNAMSIZ+1];
+	unsigned long flags;
+
+	redirdev_ioctl_set(NULL);
+
+	spin_lock_irqsave(&rdd_cfg_lock, flags);
+	/* destroy all redirect devices */
+	while (rdds) {
+		strncpy(nm, rdds->dev->name, IFNAMSIZ);
+		nm[IFNAMSIZ] = '\0';
+		spin_unlock_irqrestore(&rdd_cfg_lock, flags);
+		if (redirdev_cleanup(nm, 1) < 0) {
+			printk("redirdev: ERROR: Failed redirdev_cleanup in redirdev_module_cleanup\n");
+		}
+		spin_lock_irqsave(&rdd_cfg_lock, flags);
+	}
+	spin_unlock_irqrestore(&rdd_cfg_lock, flags);
+
+	/* Un-register us from receiving netdevice events */
+	unregister_netdevice_notifier(&redirdev_notifier_block);
+
+#ifdef RDD_CONFIG_PROC_FS
+	if (rdd_proc_cfg) {
+		remove_proc_entry(RDD_PROC_CFG, rdd_proc_dir);
+		rdd_proc_cfg = NULL;
+	}
+	if (rdd_proc_dir) {
+		proc_net_remove(&init_net, RDD_PROC_DIR);
+		rdd_proc_dir = NULL;
+	}
+#endif
+
+} /* redirdev_module_cleanup */
+
+
+module_init(redirdev_init);
+module_exit(redirdev_module_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/redir/redirdev.h b/net/redir/redirdev.h
new file mode 100644
index 0000000..223a00d
--- /dev/null
+++ b/net/redir/redirdev.h
@@ -0,0 +1,41 @@
+/* -*- linux-c -*-
+
+# (C) Copyright 2005
+# Ben Greear
+# Released under the GPL version 2
+*/
+
+#ifndef REDIRDEV_KERNEL_H_FILE__
+#define REDIRDEV_KERNEL_H_FILE__
+
+
+/* Proc file related */
+#define RDD_MX_ARG_LEN 80
+
+#ifdef CONFIG_PROC_FS
+
+/* To use or not to use the PROC-FS */
+#define RDD_CONFIG_PROC_FS
+
+#endif
+
+
+/*********************************************************/
+/*  types                                                */
+/*********************************************************/
+struct redirdev {
+	/* Can be NULL if not yet associated. */
+	struct net_device* tx_dev;	/* Call rx on this device when a packet
+					 * is _transmitted_ on this redirect
+					 * device.
+					 */
+	struct net_device* dev;		/* the device struct this belongs to */
+	struct redirdev *next;
+	char tx_dev_name[IFNAMSIZ];
+	struct net_device_stats statistics;
+	int wants_to_run;	/* Should we be running if we can? */
+	u32 quota;		/* Used for crude rate limitation.
+				 * 0xFFFFFFFF means run forever. */
+};
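+
+/* The tx_dev comment above is the heart of the design: transmitting on a
+ * redirect device injects the frame into the peer device's receive path.
+ * A minimal sketch (illustrative only, not the handler shipped in this
+ * patch, which also handles stats and the quota logic) of what such a
+ * hard_start_xmit could look like:
+ *
+ *	static int redirdev_xmit_sketch(struct sk_buff *skb,
+ *					struct net_device *dev) {
+ *		struct redirdev *rdd = dev->priv;
+ *		if (!rdd->tx_dev) {
+ *			dev_kfree_skb(skb);
+ *			return 0;
+ *		}
+ *		skb->dev = rdd->tx_dev;
+ *		skb->protocol = eth_type_trans(skb, rdd->tx_dev);
+ *		return netif_rx(skb);
+ *	}
+ */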
+
+#endif
+
diff --git a/net/socket.c b/net/socket.c
index 9d3fbfb..f5974d1 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -830,6 +830,30 @@ void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
 EXPORT_SYMBOL(vlan_ioctl_set);
 
+static DEFINE_MUTEX(macvlan_ioctl_mutex);
+static int (*macvlan_ioctl_hook)(void __user *);
+
+void macvlan_ioctl_set(int (*hook)(void __user *))
+{
+	mutex_lock(&macvlan_ioctl_mutex);
+	macvlan_ioctl_hook = hook;
+	mutex_unlock(&macvlan_ioctl_mutex);
+}
+EXPORT_SYMBOL(macvlan_ioctl_set);
+
+
+static DEFINE_MUTEX(redirdev_ioctl_mutex);
+static int (*redirdev_ioctl_hook)(void __user *);
+
+void redirdev_ioctl_set(int (*hook)(void __user *))
+{
+	mutex_lock(&redirdev_ioctl_mutex);
+	redirdev_ioctl_hook = hook;
+	mutex_unlock(&redirdev_ioctl_mutex);
+}
+EXPORT_SYMBOL(redirdev_ioctl_set);
+
+
 static DEFINE_MUTEX(dlci_ioctl_mutex);
 static int (*dlci_ioctl_hook) (unsigned int, void __user *);
 
@@ -903,6 +927,28 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 			err = vlan_ioctl_hook(net, argp);
 			mutex_unlock(&vlan_ioctl_mutex);
 			break;
+		case SIOCGIFMACVLAN:
+		case SIOCSIFMACVLAN:
+			err = -ENOPKG;
+			if (!macvlan_ioctl_hook)
+				request_module("macvlan");
+
+			mutex_lock(&macvlan_ioctl_mutex);
+			if (macvlan_ioctl_hook)
+				err = macvlan_ioctl_hook(argp);
+			mutex_unlock(&macvlan_ioctl_mutex);
+			break;
+		case SIOCGIFREDIRDEV:
+		case SIOCSIFREDIRDEV:
+			err = -ENOPKG;
+			if (!redirdev_ioctl_hook)
+				request_module("redirdev");
+
+			mutex_lock(&redirdev_ioctl_mutex);
+			if (redirdev_ioctl_hook)
+				err = redirdev_ioctl_hook(argp);
+			mutex_unlock(&redirdev_ioctl_mutex);
+			break;
 		case SIOCADDDLCI:
 		case SIOCDELDLCI:
 			err = -ENOPKG;