diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 7f6add1..750d0d8 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -120,6 +120,7 @@ unsigned long long native_sched_clock(void) /* return the value in ns */ return cycles_2_ns(this_offset); } +EXPORT_SYMBOL(sched_clock); /* We need to define a real function for sched_clock, to override the weak default version */ diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 4d0d1ac..b243bbd 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -440,5 +440,6 @@ static int time_init_device(void) error = sysdev_register(&device_timer); return error; } +EXPORT_SYMBOL(sched_clock); device_initcall(time_init_device); diff --git a/drivers/net/e100.c b/drivers/net/e100.c index 280313b..7c83f2d 100644 --- a/drivers/net/e100.c +++ b/drivers/net/e100.c @@ -1,4 +1,4 @@ -/******************************************************************************* +/************************************************************** Intel PRO/100 Linux driver Copyright(c) 1999 - 2006 Intel Corporation. @@ -386,6 +386,7 @@ enum cb_command { cb_ucode = 0x0005, cb_dump = 0x0006, cb_tx_sf = 0x0008, + cb_tx_nc = 0x0010, /* 0 == controler does CRC, ie normal. 
1 == CRC from memory */ cb_cid = 0x1f00, cb_i = 0x2000, cb_s = 0x4000, @@ -422,7 +423,7 @@ struct config { /*5*/ u8 X(tx_dma_max_count:7, dma_max_count_enable:1); /*6*/ u8 X(X(X(X(X(X(X(late_scb_update:1, direct_rx_dma:1), tno_intr:1), cna_intr:1), standard_tcb:1), standard_stat_counter:1), - rx_discard_overruns:1), rx_save_bad_frames:1); + rx_save_overruns:1), rx_save_bad_frames:1); /*7*/ u8 X(X(X(X(X(rx_discard_short_frames:1, tx_underrun_retry:2), pad7:2), rx_extended_rfd:1), tx_two_frames_in_fifo:1), tx_dynamic_tbd:1); @@ -552,6 +553,8 @@ struct nic { multicast_all = (1 << 2), wol_magic = (1 << 3), ich_10h_workaround = (1 << 4), + accept_all_frames = (1 << 5), + save_fcs = (1 << 6), } flags ____cacheline_aligned; enum mac mac; @@ -1010,6 +1013,16 @@ static void e100_configure(struct nic *nic, struct cb *cb, struct sk_buff *skb) config->promiscuous_mode = 0x1; /* 1=on, 0=off */ } + if(nic->flags & accept_all_frames) { + config->rx_save_overruns = 0x1; /* 1=save, 0=discard */ + config->rx_save_bad_frames = 0x1; /* 1=save, 0=discard */ + config->rx_discard_short_frames = 0x0; /* 1=discard, 0=save */ + } + + if(nic->flags & save_fcs) { + config->rx_crc_transfer = 0x1; /* 1=save, 0=discard */ + } + if(nic->flags & multicast_all) config->multicast_all = 0x1; /* 1=accept, 0=no */ @@ -1470,6 +1483,16 @@ static void e100_set_multicast_list(struct net_device *netdev) else nic->flags &= ~promiscuous; + if(netdev->flags & IFF_ACCEPT_ALL_FRAMES) + nic->flags |= accept_all_frames; + else + nic->flags &= ~accept_all_frames; + + if(netdev->flags & IFF_SAVE_FCS) + nic->flags |= save_fcs; + else + nic->flags &= ~save_fcs; + if(netdev->flags & IFF_ALLMULTI || netdev->mc_count > E100_MAX_MULTICAST_ADDRS) nic->flags |= multicast_all; @@ -1611,6 +1634,19 @@ static void e100_xmit_prepare(struct nic *nic, struct cb *cb, struct sk_buff *skb) { cb->command = nic->tx_command; + +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + /* Use the last 4 bytes of the SKB payload packet as the CRC, used for + 
* testing, ie sending frames with bad CRC. + */ + if (unlikely(skb->use_specified_ether_crc)) { + cb->command |= __constant_cpu_to_le16(cb_tx_nc); + } + else { + cb->command &= ~__constant_cpu_to_le16(cb_tx_nc); + } +#endif + /* interrupt every 16 packets regardless of delay */ if((nic->cbs_avail & ~15) == nic->cbs_avail) cb->command |= cpu_to_le16(cb_i); @@ -1840,7 +1876,21 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx, skb_reserve(skb, sizeof(struct rfd)); skb_put(skb, actual_size); skb->protocol = eth_type_trans(skb, nic->netdev); - + /* NOTE: The config step turns on acceptance of various bogus frames + * when in loopback or promisc mode, but this code will still throw + * them away unless you also set the new 'accept_all_frames' flag. + * Perhaps the implementors meant to accept the bogus frames in + * promisc mode here?? --Ben + */ + if(unlikely(!(nic->flags & accept_all_frames))) { + if(actual_size > nic->netdev->mtu + VLAN_ETH_HLEN) { + /* Received oversized frame */ + nic->net_stats.rx_over_errors++; + } + /* We're accepting all, so pass the bogons on up the stack. 
*/ + goto process_skb; + } + if(unlikely(!(rfd_status & cb_ok))) { /* Don't indicate if hardware indicates errors */ dev_kfree_skb_any(skb); @@ -1849,6 +1899,7 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx, nic->rx_over_length_errors++; dev_kfree_skb_any(skb); } else { + process_skb: nic->net_stats.rx_packets++; nic->net_stats.rx_bytes += actual_size; nic->netdev->last_rx = jiffies; @@ -2210,6 +2261,63 @@ static int e100_set_settings(struct net_device *netdev, struct ethtool_cmd *cmd) return err; } +static int e100_set_rxall(struct net_device *netdev, u32 data) +{ + struct nic *nic = netdev->priv; + if (data) { + netdev->priv_flags |= IFF_ACCEPT_ALL_FRAMES; + nic->flags |= accept_all_frames; + } + else { + netdev->priv_flags &= ~(IFF_ACCEPT_ALL_FRAMES); + nic->flags &= ~accept_all_frames; + } + + e100_exec_cb(nic, NULL, e100_configure); + + return 0; +} + +static int e100_get_rxall(struct net_device *netdev, u32* data) +{ + struct nic *nic = netdev->priv; + if (nic->flags & accept_all_frames) { + *data = 1; + } + else { + *data = 0; + } + + return 0; +} + +static int e100_set_save_fcs(struct net_device *netdev, u32 data) +{ + struct nic *nic = netdev->priv; + if (data) { + nic->flags |= save_fcs; + } + else { + nic->flags &= ~save_fcs; + } + e100_exec_cb(nic, NULL, e100_configure); + + return 0; +} + +static int e100_get_save_fcs(struct net_device *netdev, u32* data) +{ + struct nic *nic = netdev->priv; + if (nic->flags & save_fcs) { + *data = 1; + } + else { + *data = 0; + } + + return 0; +} + static void e100_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { @@ -2506,6 +2614,10 @@ static const struct ethtool_ops e100_ethtool_ops = { .phys_id = e100_phys_id, .get_stats_count = e100_get_stats_count, .get_ethtool_stats = e100_get_ethtool_stats, + .set_rx_all = e100_set_rxall, + .get_rx_all = e100_get_rxall, + .set_save_fcs = e100_set_save_fcs, + .get_save_fcs = e100_get_save_fcs, }; static int e100_do_ioctl(struct net_device 
*netdev, struct ifreq *ifr, int cmd) diff --git a/drivers/net/e1000/e1000.h b/drivers/net/e1000/e1000.h index 16a6edf..ef9b076 100644 --- a/drivers/net/e1000/e1000.h +++ b/drivers/net/e1000/e1000.h @@ -350,4 +350,7 @@ enum e1000_state_t { __E1000_DOWN }; + +void e1000_set_multi(struct net_device *netdev); + #endif /* _E1000_H_ */ diff --git a/drivers/net/e1000/e1000_ethtool.c b/drivers/net/e1000/e1000_ethtool.c index 9ecc3ad..6bbbb2d 100644 --- a/drivers/net/e1000/e1000_ethtool.c +++ b/drivers/net/e1000/e1000_ethtool.c @@ -1,4 +1,4 @@ -/******************************************************************************* +/***************************************************************** Intel PRO/1000 Linux driver Copyright(c) 1999 - 2006 Intel Corporation. @@ -1943,6 +1943,59 @@ e1000_get_strings(struct net_device *netdev, uint32_t stringset, uint8_t *data) } } +static int e1000_ethtool_setrxall(struct net_device *netdev, uint32_t val) { + unsigned short old_flags = netdev->priv_flags; + if (val) { + netdev->priv_flags |= IFF_ACCEPT_ALL_FRAMES; + } + else { + netdev->priv_flags &= ~(IFF_ACCEPT_ALL_FRAMES); + } + + /* printk("e1000_ethtool_setrxall (%s) val: %d\n", + netdev->name, val); */ + if (old_flags != netdev->priv_flags) { + netif_tx_lock_bh(netdev); + if (netif_running(netdev)) { + /*printk("Kicking e1000 for setrxall..\n");*/ + e1000_set_multi(netdev); + } else { + /* Value will be flushed into the hardware when the device is + * brought up. 
+ */ + } + netif_tx_unlock_bh(netdev); + } + return 0; +} + +static int e1000_ethtool_set_save_fcs(struct net_device *netdev, uint32_t val) { + netif_tx_lock_bh(netdev); + if (val) { + netdev->priv_flags |= IFF_SAVE_FCS; + } + else { + netdev->priv_flags &= ~IFF_SAVE_FCS; + } + netif_tx_unlock_bh(netdev); + return 0; +} + +static int e1000_ethtool_get_save_fcs(struct net_device *netdev, uint32_t* val) { + *val = !!(netdev->priv_flags & IFF_SAVE_FCS); + /*printk("GETRXALL, data: %d priv_flags: %hx\n", + edata.data, netdev->priv_flags);*/ + return 0; +} + +static int e1000_ethtool_getrxall(struct net_device *netdev, uint32_t* val) { + *val = !!(netdev->priv_flags & IFF_ACCEPT_ALL_FRAMES); + /*printk("GETRXALL, data: %d priv_flags: %hx\n", + edata.data, netdev->priv_flags);*/ + return 0; +} + + static const struct ethtool_ops e1000_ethtool_ops = { .get_settings = e1000_get_settings, .set_settings = e1000_set_settings, @@ -1976,6 +2029,10 @@ static const struct ethtool_ops e1000_ethtool_ops = { .phys_id = e1000_phys_id, .get_stats_count = e1000_get_stats_count, .get_ethtool_stats = e1000_get_ethtool_stats, + .get_rx_all = e1000_ethtool_getrxall, + .set_rx_all = e1000_ethtool_setrxall, + .set_save_fcs = e1000_ethtool_set_save_fcs, + .get_save_fcs = e1000_ethtool_get_save_fcs, }; void e1000_set_ethtool_ops(struct net_device *netdev) diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index e7c8951..55a0f66 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -1,4 +1,4 @@ -/******************************************************************************* +/***************************************************************** Intel PRO/1000 Linux driver Copyright(c) 1999 - 2006 Intel Corporation. 
@@ -153,7 +153,7 @@ static void e1000_clean_tx_ring(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring); static void e1000_clean_rx_ring(struct e1000_adapter *adapter, struct e1000_rx_ring *rx_ring); -static void e1000_set_multi(struct net_device *netdev); +void e1000_set_multi(struct net_device *netdev); static void e1000_update_phy_info(unsigned long data); static void e1000_watchdog(unsigned long data); static void e1000_82547_tx_fifo_stall(unsigned long data); @@ -992,6 +992,9 @@ e1000_probe(struct pci_dev *pdev, if (pci_using_dac) netdev->features |= NETIF_F_HIGHDMA; + /* Has ability to receive all frames (even bad CRCs and such) */ + netdev->features |= NETIF_F_RX_ALL | NETIF_F_SAVE_CRC; + netdev->features |= NETIF_F_LLTX; adapter->en_mng_pt = e1000_enable_mng_pass_thru(&adapter->hw); @@ -2426,7 +2429,7 @@ e1000_set_mac(struct net_device *netdev, void *p) * promiscuous mode, and all-multi behavior. **/ -static void +void e1000_set_multi(struct net_device *netdev) { struct e1000_adapter *adapter = netdev_priv(netdev); @@ -2461,6 +2464,35 @@ e1000_set_multi(struct net_device *netdev) E1000_WRITE_REG(hw, RCTL, rctl); + + /* This is useful for using ethereal or tcpdump to sniff + * packets in promiscuous mode without stripping VLAN/priority + * information, and also letting bad packets through. + * + * THIS IS NOT PRODUCTION CODE - FOR INTERNAL USE ONLY!!! 
+ * + */ + if (netdev->priv_flags & IFF_ACCEPT_ALL_FRAMES) { + uint32_t ctrl; + /*printk("%s: Enabling acceptance of ALL frames (bad CRC too).\n", + netdev->name); */ + /* store bad packets, promisc/multicast all, no VLAN + * filter */ + rctl = E1000_READ_REG(hw, RCTL); + rctl |= (E1000_RCTL_SBP | E1000_RCTL_UPE | E1000_RCTL_MPE); + rctl &= ~(E1000_RCTL_VFE | E1000_RCTL_CFIEN); + E1000_WRITE_REG(hw, RCTL, rctl); + /* disable VLAN tagging/striping */ + ctrl = E1000_READ_REG(hw, CTRL); + ctrl &= ~E1000_CTRL_VME; + E1000_WRITE_REG(hw, CTRL, ctrl); + } + else { + /* TODO: Do we need a way to explicitly turn this off if it was + * previously enabled, or will it magically go back to normal??? --Ben + */ + } + /* 82542 2.0 needs to be in reset to write receive address registers */ if (hw->mac_type == e1000_82542_rev2_0) @@ -2877,6 +2909,7 @@ set_itr_now: #define E1000_TX_FLAGS_VLAN 0x00000002 #define E1000_TX_FLAGS_TSO 0x00000004 #define E1000_TX_FLAGS_IPV4 0x00000008 +#define E1000_TX_FLAGS_NO_FCS 0x00000010 #define E1000_TX_FLAGS_VLAN_MASK 0xffff0000 #define E1000_TX_FLAGS_VLAN_SHIFT 16 @@ -3127,6 +3160,13 @@ e1000_tx_queue(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, txd_upper |= (tx_flags & E1000_TX_FLAGS_VLAN_MASK); } +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (unlikely(tx_flags & E1000_TX_FLAGS_NO_FCS)) { + txd_lower &= ~(E1000_TXD_CMD_IFCS); + /* printk("Disabling CRC in tx_queue, txd_lower: 0x%x\n", txd_lower); */ + } +#endif + i = tx_ring->next_to_use; while (count--) { @@ -3141,6 +3181,14 @@ e1000_tx_queue(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, tx_desc->lower.data |= cpu_to_le32(adapter->txd_cmd); +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + /* txd_cmd re-enables FCS, so we'll re-disable it here as desired. 
*/ + if (unlikely(tx_flags & E1000_TX_FLAGS_NO_FCS)) { + tx_desc->lower.data &= ~(cpu_to_le32(E1000_TXD_CMD_IFCS)); + /* printk("Disabling2 CRC in tx_queue, txd_lower: 0x%x\n", tx_desc->lower.data); */ + } +#endif + /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. (Only * applicable for weak-ordered memory model archs, @@ -3419,6 +3467,12 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) if (likely(skb->protocol == htons(ETH_P_IP))) tx_flags |= E1000_TX_FLAGS_IPV4; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (unlikely(skb->use_specified_ether_crc)) { + tx_flags |= E1000_TX_FLAGS_NO_FCS; + } +#endif + e1000_tx_queue(adapter, tx_ring, tx_flags, e1000_tx_map(adapter, tx_ring, skb, first, max_per_txd, nr_frags, mss)); @@ -4198,7 +4252,11 @@ e1000_clean_rx_irq(struct e1000_adapter *adapter, goto next_desc; } - if (unlikely(rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK)) { + /* If we are accepting all frames, then do not pay attention to the + * framing errors. + */ + if (unlikely(rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK) && + !(netdev->priv_flags & IFF_ACCEPT_ALL_FRAMES)) { last_byte = *(skb->data + length - 1); if (TBI_ACCEPT(&adapter->hw, status, rx_desc->errors, length, last_byte)) { @@ -4224,6 +4282,16 @@ e1000_clean_rx_irq(struct e1000_adapter *adapter, total_rx_bytes += length; total_rx_packets++; + + // This may not be needed now. 
--Ben + //if (netdev->priv_flags & IFF_SAVE_FCS) { + // skb_put(skb, length); + //} + //else { + // skb_put(skb, length - ETHERNET_FCS_SIZE); + //} + + /* code added for copybreak, this should improve * performance for small packets with large amounts * of reassembly being done in the stack */ @@ -4364,7 +4432,8 @@ e1000_clean_rx_irq_ps(struct e1000_adapter *adapter, goto next_desc; } - if (unlikely(staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK)) { + if ((unlikely(staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK)) && + !(netdev->priv_flags & IFF_ACCEPT_ALL_FRAMES)) { dev_kfree_skb_irq(skb); goto next_desc; } diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index dc74d00..5ad708e 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -38,13 +38,6 @@ struct macvlan_port { struct list_head vlans; }; -struct macvlan_dev { - struct net_device *dev; - struct list_head list; - struct hlist_node hlist; - struct macvlan_port *port; - struct net_device *lowerdev; -}; static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, @@ -417,7 +410,7 @@ static void macvlan_dellink(struct net_device *dev) unregister_netdevice(dev); if (list_empty(&port->vlans)) - macvlan_port_destroy(dev); + macvlan_port_destroy(port->dev); } static struct rtnl_link_ops macvlan_link_ops __read_mostly = { diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b98742f..23f5e47 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -131,6 +131,7 @@ struct TCP_Server_Info { struct sockaddr_in sockAddr; struct sockaddr_in6 sockAddr6; } addr; + u32 ip4_local_ip; wait_queue_head_t response_q; wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ struct list_head pending_mid_q; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4af3588..420e02a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -95,12 +95,14 @@ struct smb_vol { unsigned int sockopt; unsigned short int port; char *prepath; + u32 local_ip; /* allow binding to a local IP 
address if != 0 */ }; static int ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, char *netb_name, - char *server_netb_name); + char *server_netb_name, + u32 local_ip); static int ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket); @@ -196,7 +198,8 @@ cifs_reconnect(struct TCP_Server_Info *server) rc = ipv4_connect(&server->addr.sockAddr, &server->ssocket, server->workstation_RFC1001_name, - server->server_RFC1001_name); + server->server_RFC1001_name, + server->ip4_local_ip); } if (rc) { cFYI(1, ("reconnect error %d", rc)); @@ -1014,6 +1017,18 @@ cifs_parse_mount_options(char *options, const char *devname, "long\n"); return 1; } + } else if (strnicmp(data, "local_ip", 8) == 0) { + if (!value || !*value) { + printk(KERN_WARNING "CIFS: local_ip value not specified.\n"); + return 1; /* needs_arg; */ + } + i = cifs_inet_pton(AF_INET, value, &(vol->local_ip)); + if (i < 0) { + vol->local_ip = 0; + printk(KERN_WARNING "CIFS: Could not parse local_ip: %s\n", + value); + return 1; + } } else if (strnicmp(data, "prefixpath", 10) == 0) { if (!value || !*value) { printk(KERN_WARNING @@ -1295,7 +1310,8 @@ cifs_parse_mount_options(char *options, const char *devname, static struct cifsSesInfo * cifs_find_tcp_session(struct in_addr *target_ip_addr, struct in6_addr *target_ip6_addr, - char *userName, struct TCP_Server_Info **psrvTcp) + char *userName, struct TCP_Server_Info **psrvTcp, + u32 local_ip) { struct list_head *tmp; struct cifsSesInfo *ses; @@ -1305,7 +1321,11 @@ cifs_find_tcp_session(struct in_addr *target_ip_addr, list_for_each(tmp, &GlobalSMBSessionList) { ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); if (ses->server) { - if ((target_ip_addr && + if((target_ip_addr && + /* If binding to a local IP, do not re-use sessions bound to different + * local IP addresses. 
+ */ + (local_ip == ses->server->ip4_local_ip) && (ses->server->addr.sockAddr.sin_addr.s_addr == target_ip_addr->s_addr)) || (target_ip6_addr && memcmp(&ses->server->addr.sockAddr6.sin6_addr, @@ -1334,7 +1354,7 @@ cifs_find_tcp_session(struct in_addr *target_ip_addr, } static struct cifsTconInfo * -find_unc(__be32 new_target_ip_addr, char *uncName, char *userName) +find_unc(__be32 new_target_ip_addr, char *uncName, char *userName, u32 local_ip) { struct list_head *tmp; struct cifsTconInfo *tcon; @@ -1349,8 +1369,9 @@ find_unc(__be32 new_target_ip_addr, char *uncName, char *userName) ("old ip addr: %x == new ip %x ?", tcon->ses->server->addr.sockAddr.sin_addr. s_addr, new_target_ip_addr)); - if (tcon->ses->server->addr.sockAddr.sin_addr. - s_addr == new_target_ip_addr) { + if ((local_ip == tcon->ses->server->ip4_local_ip) && + (tcon->ses->server->addr.sockAddr.sin_addr. + s_addr == new_target_ip_addr)) { /* BB lock tcon, server and tcp session and increment use count here? */ /* found a match on the TCP session */ /* BB check if reconnection needed */ @@ -1453,7 +1474,8 @@ static void rfc1002mangle(char *target, char *source, unsigned int length) static int ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, - char *netbios_name, char *target_name) + char *netbios_name, char *target_name, + u32 local_ip /* in network byte order */) { int rc = 0; int connected = 0; @@ -1473,6 +1495,24 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, } } + /* Bind to the local IP address if specified */ + if (local_ip) { + struct sockaddr_in myaddr = { + .sin_family = AF_INET, + }; + myaddr.sin_addr.s_addr = local_ip; + myaddr.sin_port = 0; /* any */ + rc = (*csocket)->ops->bind(*csocket, (struct sockaddr *) &myaddr, + sizeof(myaddr)); + if (rc < 0) { + printk("Tried to bind to local ip: 0x%x, but failed with error: %d\n", + local_ip, rc); + } + else { + printk("CIFS: Successfully bound to local ip: 0x%x\n", local_ip); + } + } + 
psin_server->sin_family = AF_INET; if (psin_server->sin_port) { /* user overrode default port */ rc = (*csocket)->ops->connect(*csocket, @@ -1867,12 +1907,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, if (address_type == AF_INET) existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr, NULL /* no ipv6 addr */, - volume_info.username, &srvTcp); + volume_info.username, &srvTcp, volume_info.local_ip); else if (address_type == AF_INET6) { cFYI(1, ("looking for ipv6 address")); existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */, &sin_server6.sin6_addr, - volume_info.username, &srvTcp); + volume_info.username, &srvTcp, 0); } else { kfree(volume_info.UNC); kfree(volume_info.password); @@ -1896,7 +1936,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } else rc = ipv4_connect(&sin_server, &csocket, volume_info.source_rfc1001_name, - volume_info.target_rfc1001_name); + volume_info.target_rfc1001_name, + volume_info.local_ip); if (rc < 0) { cERROR(1, ("Error connecting to IPv4 socket. 
" "Aborting operation")); @@ -1926,6 +1967,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, /* BB Add code for ipv6 case too */ srvTcp->ssocket = csocket; srvTcp->protocolType = IPV4; + srvTcp->ip4_local_ip = volume_info.local_ip; init_waitqueue_head(&srvTcp->response_q); init_waitqueue_head(&srvTcp->request_q); INIT_LIST_HEAD(&srvTcp->pending_mid_q); @@ -2077,7 +2119,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, tcon = find_unc(sin_server.sin_addr.s_addr, volume_info.UNC, - volume_info.username); + volume_info.username, volume_info.local_ip); if (tcon) { cFYI(1, ("Found match on UNC path")); /* we can have only one retry value for a connection diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 37310b0..8f31d3b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -2724,6 +2724,11 @@ COMPATIBLE_IOCTL(SIOCGMIIREG) COMPATIBLE_IOCTL(SIOCSMIIREG) COMPATIBLE_IOCTL(SIOCGIFVLAN) COMPATIBLE_IOCTL(SIOCSIFVLAN) +COMPATIBLE_IOCTL(SIOCSIFMACVLAN) +COMPATIBLE_IOCTL(SIOCGIFMACVLAN) +COMPATIBLE_IOCTL(SIOCGIFREDIRDEV) +COMPATIBLE_IOCTL(SIOCSIFREDIRDEV) +COMPATIBLE_IOCTL(0x7450 /* GET_PKTGEN_INTERFACE_INFO */) COMPATIBLE_IOCTL(SIOCBRADDBR) COMPATIBLE_IOCTL(SIOCBRDELBR) /* SG stuff */ diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a796be5..9ded0e2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -169,11 +169,12 @@ void nfs_callback_down(void) static int nfs_callback_authenticate(struct svc_rqst *rqstp) { struct sockaddr_in *addr = svc_addr_in(rqstp); + struct sockaddr_in *clientaddr = (struct sockaddr_in *)&rqstp->rq_daddr.addr; struct nfs_client *clp; char buf[RPC_MAX_ADDRBUFLEN]; /* Don't talk to strangers */ - clp = nfs_find_client(addr, 4); + clp = nfs_find_client(addr, clientaddr, 4); if (clp == NULL) return SVC_DROP; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index c2bb14e..dfe1e57 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -39,6 +39,7 @@ struct cb_compound_hdr_res { struct 
cb_getattrargs { struct sockaddr_in *addr; + struct sockaddr_in *clientaddr; struct nfs_fh fh; uint32_t bitmap[2]; }; @@ -54,6 +55,7 @@ struct cb_getattrres { struct cb_recallargs { struct sockaddr_in *addr; + struct sockaddr_in *clientaddr; struct nfs_fh fh; nfs4_stateid stateid; uint32_t truncate; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 72e55d8..0343165 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -23,7 +23,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * res->bitmap[0] = res->bitmap[1] = 0; res->status = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); + clp = nfs_find_client(args->addr, args->clientaddr, 4); if (clp == NULL) goto out; inode = nfs_delegation_find_inode(clp, &args->fh); @@ -62,7 +62,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) __be32 res; res = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); + clp = nfs_find_client(args->addr, args->clientaddr, 4); if (clp == NULL) goto out; inode = nfs_delegation_find_inode(clp, &args->fh); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 058ade7..dbc0b58 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -177,6 +177,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr if (unlikely(status != 0)) goto out; args->addr = svc_addr_in(rqstp); + args->clientaddr = (struct sockaddr_in *)&rqstp->rq_daddr.addr; status = decode_bitmap(xdr, args->bitmap); out: dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status)); @@ -189,6 +190,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, __be32 status; args->addr = svc_addr_in(rqstp); + args->clientaddr = (struct sockaddr_in *)&rqstp->rq_daddr.addr; status = decode_stateid(xdr, &args->stateid); if (unlikely(status != 0)) goto out; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a204484..00a1598 100644 --- 
a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -99,6 +99,7 @@ struct rpc_program nfsacl_program = { */ static struct nfs_client *nfs_alloc_client(const char *hostname, const struct sockaddr_in *addr, + const struct sockaddr_in *clientaddr, int nfsversion) { struct nfs_client *clp; @@ -205,7 +206,9 @@ void nfs_put_client(struct nfs_client *clp) * Find a client by address * - caller must hold nfs_client_lock */ -static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port) +static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, + const struct sockaddr_in *clientaddr, + int nfsversion, int match_port) { struct nfs_client *clp; @@ -222,6 +225,10 @@ static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int sizeof(clp->cl_addr.sin_addr)) != 0) continue; + if (memcmp(&clp->cl_ipaddr.sin_addr, &clientaddr->sin_addr, + sizeof(clp->cl_ipaddr.sin_addr)) != 0) + continue; + if (!match_port || clp->cl_addr.sin_port == addr->sin_port) goto found; } @@ -237,12 +244,14 @@ found: * Find a client by IP address and protocol version * - returns NULL if no such client */ -struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion) +struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, + const struct sockaddr_in *clientaddr, + int nfsversion) { struct nfs_client *clp; spin_lock(&nfs_client_lock); - clp = __nfs_find_client(addr, nfsversion, 0); + clp = __nfs_find_client(addr, clientaddr, nfsversion, 0); spin_unlock(&nfs_client_lock); if (clp != NULL && clp->cl_cons_state != NFS_CS_READY) { nfs_put_client(clp); @@ -257,6 +266,7 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio */ static struct nfs_client *nfs_get_client(const char *hostname, const struct sockaddr_in *addr, + const struct sockaddr_in *clientaddr, int nfsversion) { struct nfs_client *clp, *new = NULL; @@ -270,7 +280,7 @@ static struct nfs_client *nfs_get_client(const char 
*hostname, do { spin_lock(&nfs_client_lock); - clp = __nfs_find_client(addr, nfsversion, 1); + clp = __nfs_find_client(addr, clientaddr, nfsversion, 1); if (clp) goto found_client; if (new) @@ -278,7 +288,7 @@ static struct nfs_client *nfs_get_client(const char *hostname, spin_unlock(&nfs_client_lock); - new = nfs_alloc_client(hostname, addr, nfsversion); + new = nfs_alloc_client(hostname, addr, clientaddr, nfsversion); } while (new); return ERR_PTR(-ENOMEM); @@ -374,6 +384,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto, struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { .protocol = proto, + .saddress = (struct sockaddr *)&clp->cl_ipaddr, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = sizeof(clp->cl_addr), .timeout = &timeparms, @@ -518,6 +529,8 @@ static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data * if (clp->cl_nfsversion == 3) clp->rpc_ops = &nfs_v3_clientops; #endif + memcpy(&clp->cl_ipaddr, &data->clientaddr, sizeof(clp->cl_ipaddr)); + /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 @@ -551,7 +564,8 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat #endif /* Allocate or find a client reference we can use */ - clp = nfs_get_client(data->hostname, &data->addr, nfsvers); + clp = nfs_get_client(data->hostname, &data->addr, &data->clientaddr, + nfsvers); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); @@ -830,7 +844,7 @@ error: */ static int nfs4_init_client(struct nfs_client *clp, int proto, int timeo, int retrans, - const char *ip_addr, + const struct sockaddr_in *clientaddr, rpc_authflavor_t authflavour) { int error; @@ -843,12 +857,12 @@ static int nfs4_init_client(struct nfs_client *clp, /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; + memcpy(&clp->cl_ipaddr, clientaddr, sizeof(clp->cl_ipaddr)); error = 
nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour, RPC_CLNT_CREATE_DISCRTRY); if (error < 0) goto error; - memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); error = nfs_idmap_new(clp); if (error < 0) { @@ -872,7 +886,7 @@ error: */ static int nfs4_set_client(struct nfs_server *server, const char *hostname, const struct sockaddr_in *addr, - const char *ip_addr, + const struct sockaddr_in *clientaddr, rpc_authflavor_t authflavour, int proto, int timeo, int retrans) { @@ -882,12 +896,13 @@ static int nfs4_set_client(struct nfs_server *server, dprintk("--> nfs4_set_client()\n"); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(hostname, addr, 4); + clp = nfs_get_client(hostname, addr, clientaddr, 4); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; } - error = nfs4_init_client(clp, proto, timeo, retrans, ip_addr, authflavour); + error = nfs4_init_client(clp, proto, timeo, retrans, clientaddr, + authflavour); if (error < 0) goto error_put; @@ -941,7 +956,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data, const char *hostname, const struct sockaddr_in *addr, const char *mntpath, - const char *ip_addr, + const struct sockaddr_in *clientaddr, rpc_authflavor_t authflavour, struct nfs_fh *mntfh) { @@ -956,7 +971,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data, return ERR_PTR(-ENOMEM); /* Get a client record */ - error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour, + error = nfs4_set_client(server, hostname, addr, clientaddr, authflavour, data->proto, data->timeo, data->retrans); if (error < 0) goto error; @@ -1029,7 +1044,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Get a client representation. 
* Note: NFSv4 always uses TCP, */ error = nfs4_set_client(server, data->hostname, data->addr, - parent_client->cl_ipaddr, + &parent_client->cl_ipaddr, data->authflavor, parent_server->client->cl_xprt->prot, parent_client->retrans_timeo, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 76cf55d..0043eab 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -31,14 +31,15 @@ struct nfs_clone_mount { extern struct rpc_program nfs_program; extern void nfs_put_client(struct nfs_client *); -extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int); +extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, + const struct sockaddr_in *, int); extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *, struct nfs_fh *); extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *, const char *, const struct sockaddr_in *, const char *, - const char *, + const struct sockaddr_in *, rpc_authflavor_t, struct nfs_fh *); extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 8afd9f7..54b8096 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -30,6 +30,7 @@ struct mnt_fhstatus { * nfs_mount - Obtain an NFS file handle for the given host and path * @addr: pointer to server's address * @len: size of server's address + * @clientaddr: pointer to our address * @hostname: name of server host, or NULL * @path: pointer to string containing export path to mount * @version: mount version to use for this request @@ -38,8 +39,9 @@ struct mnt_fhstatus { * * Uses default timeout parameters specified by underlying transport. 
*/ -int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, - int version, int protocol, struct nfs_fh *fh) +int nfs_mount(struct sockaddr *addr, size_t len, struct sockaddr *clientaddr, + char *hostname, char *path, int version, int protocol, + struct nfs_fh *fh) { struct mnt_fhstatus result = { .fh = fh @@ -50,6 +52,7 @@ int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, }; struct rpc_create_args args = { .protocol = protocol, + .saddress = clientaddr, .address = addr, .addrsize = len, .servername = hostname, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4b90e17..3336828 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2916,15 +2916,16 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po for(;;) { setclientid.sc_name_len = scnprintf(setclientid.sc_name, - sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u", - clp->cl_ipaddr, NIPQUAD(clp->cl_addr.sin_addr), + sizeof(setclientid.sc_name), "%u.%u.%u.%u/%u.%u.%u.%u %s %u", + NIPQUAD(clp->cl_ipaddr), + NIPQUAD(clp->cl_addr.sin_addr), cred->cr_ops->cr_name, clp->cl_id_uniquifier); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, sizeof(setclientid.sc_netid), "tcp"); setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, - sizeof(setclientid.sc_uaddr), "%s.%d.%d", - clp->cl_ipaddr, port >> 8, port & 255); + sizeof(setclientid.sc_uaddr), "%u.%u.%u.%u.%d.%d", + NIPQUAD(clp->cl_ipaddr), port >> 8, port & 255); status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); if (status != -NFS4ERR_CLID_INUSE) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 3490322..820a75e 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -496,7 +496,7 @@ static int __init root_nfs_get_handle(void) NFS_MNT3_VERSION : NFS_MNT_VERSION; set_sockaddr(&sin, servaddr, htons(mount_port)); - status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, + status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, NULL, nfs_path, 
version, protocol, &fh); if (status < 0) printk(KERN_ERR "Root-NFS: Server returned error %d " diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b878528..c71adba 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -69,7 +69,7 @@ struct nfs_parsed_mount_data { unsigned int bsize; unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; - char *client_address; + struct sockaddr_in client_address; struct { struct sockaddr_in address; @@ -959,7 +959,8 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - mnt->client_address = string; + mnt->client_address.sin_family = AF_INET; + mnt->client_address.sin_addr.s_addr = in_aton(string); break; case Opt_mounthost: string = match_strdup(args); @@ -1044,6 +1045,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, */ status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), + (struct sockaddr *) &args->client_address, args->nfs_server.hostname, args->nfs_server.export_path, args->mount_server.version, @@ -1565,7 +1567,7 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options, rpc_authflavor_t *authflavour, char **hostname, char **mntpath, - char **ip_addr) + struct sockaddr_in *ip_addr) { struct nfs4_mount_data *data = *options; char *c; @@ -1611,7 +1613,9 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options, c = strndup_user(data->client_addr.data, 16); if (IS_ERR(c)) return PTR_ERR(c); - *ip_addr = c; + ip_addr->sin_family = AF_INET; + ip_addr->sin_addr.s_addr = in_aton(c); + kfree(c); break; default: { @@ -1694,7 +1698,7 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options, dprintk("MNTPATH: %s\n", *mntpath); - if (args.client_address == NULL) + if (args.client_address.sin_addr.s_addr == 0) goto out_no_client_address; *ip_addr = args.client_address; @@ -1732,11 +1736,11 @@ static int nfs4_get_sb(struct file_system_type *fs_type, struct nfs4_mount_data *data = raw_data; struct super_block 
*s; struct nfs_server *server; - struct sockaddr_in addr; + struct sockaddr_in addr, clientaddr; rpc_authflavor_t authflavour; struct nfs_fh mntfh; struct dentry *mntroot; - char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL; + char *mntpath = NULL, *hostname = NULL; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, @@ -1745,12 +1749,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type, /* Validate the mount data */ error = nfs4_validate_mount_data(&data, dev_name, &addr, &authflavour, - &hostname, &mntpath, &ip_addr); + &hostname, &mntpath, &clientaddr); if (error < 0) goto out; /* Get a volume representation */ - server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr, + server = nfs4_create_server(data, hostname, &addr, mntpath, &clientaddr, authflavour, &mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); @@ -1790,7 +1794,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(ip_addr); kfree(mntpath); kfree(hostname); return error; diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h index 99ca648..2f7e5c7 100644 --- a/include/asm-i386/socket.h +++ b/include/asm-i386/socket.h @@ -52,4 +52,8 @@ #define SO_TIMESTAMPNS 35 #define SCM_TIMESTAMPNS SO_TIMESTAMPNS +/* Instruct lower device to not calculate the frame + * checksum. Useful only for testing, afaik. --Ben */ +#define SO_NOFCS 50 + #endif /* _ASM_SOCKET_H */ diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h index 90af60c..8ad84d1 100644 --- a/include/asm-x86_64/socket.h +++ b/include/asm-x86_64/socket.h @@ -52,4 +52,9 @@ #define SO_TIMESTAMPNS 35 #define SCM_TIMESTAMPNS SO_TIMESTAMPNS +/* Instruct lower device to not calculate the frame + * checksum. Useful only for testing, afaik. 
--Ben */ +#define SO_NOFCS 50 + + #endif /* _ASM_SOCKET_H */ diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 23ccea8..f076359 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1,4 +1,4 @@ -/* +/* -*-linux-c-*- * ethtool.h: Defines for Linux ethtool. * * Copyright (C) 1998 David S. Miller (davem@redhat.com) @@ -307,6 +307,11 @@ int ethtool_op_set_ufo(struct net_device *dev, u32 data); * get_strings: Return a set of strings that describe the requested objects * phys_id: Identify the device * get_stats: Return statistics about the device + * set_rx_all: Set or clear IFF_ACCEPT_ALL_FRAMES, see if.h + * get_rx_all: Return 1 if set, 0 if not. + * set_save_fcs: Set or clear IFF_SAVE_FCS, see if.h + * get_save_fcs: Return 1 if set, 0 if not. + * * * Description: * @@ -365,6 +370,10 @@ struct ethtool_ops { int (*phys_id)(struct net_device *, u32); int (*get_stats_count)(struct net_device *); void (*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, u64 *); + int (*set_rx_all)(struct net_device *, u32); + int (*get_rx_all)(struct net_device *, u32 *); + int (*set_save_fcs)(struct net_device *, u32); + int (*get_save_fcs)(struct net_device *, u32 *); int (*begin)(struct net_device *); void (*complete)(struct net_device *); u32 (*get_ufo)(struct net_device *); @@ -372,6 +381,13 @@ struct ethtool_ops { }; #endif /* __KERNEL__ */ +/* for dumping net-device statistics */ +struct ethtool_ndstats { + u32 cmd; /* ETHTOOL_GNDSTATS */ + u8 data[0]; /* sizeof(struct net_device_stats) */ +}; + + /* CMDs currently supported */ #define ETHTOOL_GSET 0x00000001 /* Get settings. */ #define ETHTOOL_SSET 0x00000002 /* Set settings. 
*/ @@ -411,6 +427,15 @@ struct ethtool_ops { #define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ #define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ + +#define ETHTOOL_GNDSTATS 0x00000070 /* get standard net-device statistics */ +#define ETHTOOL_GETRXALL 0x00000071 /* Retrieve whether or not + * IFF_ACCEPT_ALL_FRAMES is set. */ +#define ETHTOOL_SETRXALL 0x00000072 /* Set IFF_ACCEPT_ALL_FRAMES */ +#define ETHTOOL_GETRXFCS 0x00000073 /* Get IFF_SAVE_FCS */ +#define ETHTOOL_SETRXFCS 0x00000074 /* Set IFF_SAVE_FCS */ + + /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET #define SPARC_ETH_SSET ETHTOOL_SSET diff --git a/include/linux/if.h b/include/linux/if.h index 32bf419..ce8f1d3 100644 --- a/include/linux/if.h +++ b/include/linux/if.h @@ -62,6 +62,14 @@ #define IFF_BONDING 0x20 /* bonding master or slave */ #define IFF_SLAVE_NEEDARP 0x40 /* need ARPs for validation */ +#define IFF_ACCEPT_ALL_FRAMES 0x0400 /** Accept all frames, even ones with bad CRCs. + * Should only be used in debugging/testing situations + * Do NOT enable this unless you understand the + * consequences! */ +#define IFF_SAVE_FCS 0x0800 /** Save the Frame Check Sum (FCS) on receive, if + * possible. */ +#define IFF_MAC_VLAN 0x1000 /* MAC VLAN device. 
*/ + #define IF_GET_IFACE 0x0001 /* for querying only */ #define IF_GET_PROTO 0x0002 diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 0d9d7ea..1a0f9ef 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -5,5 +5,13 @@ extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *); +struct macvlan_dev { + struct net_device *dev; + struct list_head list; + struct hlist_node hlist; + struct macvlan_port *port; + struct net_device *lowerdev; +}; + #endif /* __KERNEL__ */ #endif /* _LINUX_IF_MACVLAN_H */ diff --git a/include/linux/if_redirdev.h b/include/linux/if_redirdev.h new file mode 100644 index 0000000..cf8055c --- /dev/null +++ b/include/linux/if_redirdev.h @@ -0,0 +1,35 @@ +/* -*- linux-c -*- */ +#ifndef _LINUX_IF_REDIRDEV_H +#define _LINUX_IF_REDIRDEV_H + +/* the ioctl commands */ + +#define REDIRDEV_ADD 2090 +#define REDIRDEV_DEL 2091 +/* If this IOCTL succeeds, we are a Redirect-Device + interface, otherwise, we are not. */ +#define REDIRDEV_IS_REDIRDEV 2092 +#define REDIRDEV_GET_BY_IDX 2093 +#define REDIRDEV_GET_BY_NAME 2094 +#define REDIRDEV_SET_QUOTA 2095 + +#ifdef __KERNEL__ +#include +#include +extern int (*redirdev_ioctl_hook)(void*); + +#endif + +/* Request and response */ +struct redirdev_ioctl { + u32 cmd; + u32 ifidx; /* when getting info by idx */ + +#define RDD_ASSOCIATED (1<<0) + u32 flags; /* 1<<0: Is the interface associated with tx-dev or not */ + u32 not_used; /* explicitly align 64-bit */ + char ifname[IFNAMSIZ]; + char txifname[IFNAMSIZ]; +}; + +#endif diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index d83fee2..7ad0fab 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -107,7 +107,8 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) #define IN_DEV_ARP_IGNORE(in_dev) 
IN_DEV_MAXCONF((in_dev), ARP_IGNORE) - +#define IN_DEV_ACCEPT_STS(in_dev) IN_DEV_MAXCONF((in_dev), ACCEPT_STS) + struct in_ifaddr { struct in_ifaddr *ifa_next; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e679b27..62218b3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -174,6 +174,7 @@ enum { struct neighbour; struct neigh_parms; struct sk_buff; +struct pktgen_dev; struct netif_rx_stats { @@ -341,8 +342,14 @@ struct net_device #define NETIF_F_GSO 2048 /* Enable software GSO. */ #define NETIF_F_LLTX 4096 /* LockLess TX */ #define NETIF_F_MULTI_QUEUE 16384 /* Has multiple TX/RX queues */ +#define NETIF_F_RX_ALL 32768 /* Can be configured to receive all packets, even + * ones with busted CRC. May disable VLAN filtering + * in the NIC, users should NOT enable this feature + * unless they understand the consequences. */ +#define NETIF_F_SAVE_CRC 65536 /* Can save FCS in skb, last 4 bytes for ethernet */ - /* Segmentation offload features */ + +/* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 #define NETIF_F_GSO_MASK 0xffff0000 #define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT) @@ -559,7 +566,15 @@ struct net_device #ifdef CONFIG_NET_POLL_CONTROLLER void (*poll_controller)(struct net_device *dev); #endif - + /* Callback for when the queue is woken, used by pktgen currently */ + int (*notify_queue_woken)(struct net_device *dev); + void* nqw_data; /* To be used by the method above as needed */ + + struct pktgen_dev* pkt_dev; /* to quickly find the pkt-gen dev registered with this + * interface, if any. + */ + long dflt_skb_mark; /* Specify skb->mark for pkts received on this interface. 
*/ + /* bridge stuff */ struct net_bridge_port *br_port; /* macvlan */ @@ -717,8 +732,13 @@ static inline void netif_wake_queue(struct net_device *dev) return; } #endif - if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state)) + if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state)) { __netif_schedule(dev); + + if (dev->notify_queue_woken) { + dev->notify_queue_woken(dev); + } + } } static inline void netif_stop_queue(struct net_device *dev) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 7250eea..a1c364b 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -496,8 +496,8 @@ static inline void nfs3_forget_cached_acls(struct inode *inode) /* * linux/fs/mount_clnt.c */ -extern int nfs_mount(struct sockaddr *, size_t, char *, char *, - int, int, struct nfs_fh *); +extern int nfs_mount(struct sockaddr *, size_t, struct sockaddr *, + char *, char *, int, int, struct nfs_fh *); /* * inline functions diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 0cac49b..67ecf3e 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -59,10 +59,10 @@ struct nfs_client { /* idmapper */ struct idmap * cl_idmap; - /* Our own IP address, as a null-terminated string. + /* Our own IP address. * This is used to generate the clientid, and the callback address. */ - char cl_ipaddr[16]; + struct sockaddr_in cl_ipaddr; unsigned char cl_id_uniquifier; #endif }; diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h index a3ade89..fc3e9eb 100644 --- a/include/linux/nfs_mount.h +++ b/include/linux/nfs_mount.h @@ -20,7 +20,7 @@ * mount-to-kernel version compatibility. Some of these aren't used yet * but here they are anyway. 
*/ -#define NFS_MOUNT_VERSION 6 +#define NFS_MOUNT_VERSION 7 #define NFS_MAX_CONTEXT_LEN 256 struct nfs_mount_data { @@ -43,6 +43,7 @@ struct nfs_mount_data { struct nfs3_fh root; /* 4 */ int pseudoflavor; /* 5 */ char context[NFS_MAX_CONTEXT_LEN + 1]; /* 6 */ + struct sockaddr_in clientaddr; /* 7 */ }; /* bits in the flags field */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ed2c458..2afbeb0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -287,7 +287,8 @@ struct sk_buff { __u8 pkt_type:3, fclone:2, ipvs_property:1, - nf_trace:1; + nf_trace:1, + use_specified_ether_crc:1; __be16 protocol; void (*destructor)(struct sk_buff *skb); diff --git a/include/linux/sockios.h b/include/linux/sockios.h index abef759..367287c 100644 --- a/include/linux/sockios.h +++ b/include/linux/sockios.h @@ -94,6 +94,13 @@ #define SIOCGRARP 0x8961 /* get RARP table entry */ #define SIOCSRARP 0x8962 /* set RARP table entry */ +/* MAC address based VLAN control calls */ +#define SIOCGIFMACVLAN 0x8965 /* Mac address multiplex/demultiplex support */ +#define SIOCSIFMACVLAN 0x8966 /* Set macvlan options */ + +#define SIOCGIFREDIRDEV 0x8967 /* Redirect device get ioctl */ +#define SIOCSIFREDIRDEV 0x8968 /* Set redirect dev options */ + /* Driver configuration calls */ #define SIOCGIFMAP 0x8970 /* Get device parameters */ @@ -122,6 +129,7 @@ #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ #define SIOCBRDELIF 0x89a3 /* remove interface from bridge */ + /* Device private ioctl calls */ /* diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 483050c..d568f62 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -496,6 +496,7 @@ enum NET_IPV4_CONF_ARP_IGNORE=19, NET_IPV4_CONF_PROMOTE_SECONDARIES=20, NET_IPV4_CONF_ARP_ACCEPT=21, + NET_IPV4_CONF_ACCEPT_STS=22, __NET_IPV4_CONF_MAX }; diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h index c48e390..4922384 100644 --- 
a/include/net/netfilter/nf_conntrack_tuple.h +++ b/include/net/netfilter/nf_conntrack_tuple.h @@ -98,6 +98,8 @@ struct nf_conntrack_tuple /* The direction (for tuplehash) */ u_int8_t dir; } dst; + + u_int32_t mark; }; struct nf_conntrack_tuple_mask @@ -148,7 +150,8 @@ static inline int nf_ct_tuple_src_equal(const struct nf_conntrack_tuple *t1, t1->src.u3.all[3] == t2->src.u3.all[3] && t1->src.u.all == t2->src.u.all && t1->src.l3num == t2->src.l3num && - t1->dst.protonum == t2->dst.protonum); + t1->dst.protonum == t2->dst.protonum && + t1->mark == t2->mark); } static inline int nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1, @@ -160,7 +163,8 @@ static inline int nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1, t1->dst.u3.all[3] == t2->dst.u3.all[3] && t1->dst.u.all == t2->dst.u.all && t1->src.l3num == t2->src.l3num && - t1->dst.protonum == t2->dst.protonum); + t1->dst.protonum == t2->dst.protonum && + t1->mark == t2->mark); } static inline int nf_ct_tuple_equal(const struct nf_conntrack_tuple *t1, @@ -195,7 +199,8 @@ static inline int nf_ct_tuple_src_mask_cmp(const struct nf_conntrack_tuple *t1, return 0; if (t1->src.l3num != t2->src.l3num || - t1->dst.protonum != t2->dst.protonum) + t1->dst.protonum != t2->dst.protonum || + t1->mark != t2->mark) return 0; return 1; diff --git a/include/net/sock.h b/include/net/sock.h index 8a71ab3..2a1291b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -393,6 +393,10 @@ enum sock_flags { SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ + SOCK_DONT_DO_LL_FCS, /* Tell NIC not to do the ethernet FCS. Will use + * last 4 bytes of packet sent from user-space + * instead. 
+ */ }; static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) diff --git a/kernel/panic.c b/kernel/panic.c index f64f4c1..61a92ed 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -176,7 +176,7 @@ const char *print_tainted(void) void add_taint(unsigned flag) { - debug_locks = 0; /* can't trust the integrity of the kernel anymore */ + /* debug_locks = 0; --Ben */ /* can't trust the integrity of the kernel anymore */ tainted |= flag; } EXPORT_SYMBOL(add_taint); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 21af441..d599a07 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -3,7 +3,8 @@ * Ethernet-type device handling. * * Authors: Ben Greear - * Please send support related email to: vlan@scry.wanfear.com + * Please send support related email to: vlan@candelatech.com + * after subscribing using the link below. * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: @@ -637,6 +638,11 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, } break; +#if 0 + /* Don't propagate management state from base dev to VLANs. If you do this, + * then if you 'ifconfig eth0 down; ifconfig eth0 up', you also lose all the + * routes for eth0.* VLANs. --Ben + */ case NETDEV_DOWN: /* Put all VLANs for this dev in the down state too. */ for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { @@ -666,7 +672,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev_change_flags(vlandev, flgs | IFF_UP); } break; - +#endif + case NETDEV_UNREGISTER: /* Delete all VLANs for this dev. */ for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 6f5e738..6401008 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -3,7 +3,8 @@ * Ethernet-type device handling. 
* * Authors: Ben Greear - * Please send support related email to: vlan@scry.wanfear.com + * Please send support related email to: vlan@candelatech.com + * after subscribing using the web page below. * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: Mar 22 2001: Martin Bokaemper @@ -453,6 +454,11 @@ int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) struct net_device_stats *stats = vlan_dev_get_stats(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); + /* Please note, dev_queue_xmit consumes the pkt regardless of the + * return value. So, will copy the skb first and free if successful. + */ + struct sk_buff* skb2 = skb_get(skb); + /* Handle non-VLAN frames if they are sent to us, for example by DHCP. * * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING @@ -483,6 +489,10 @@ int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) skb = __vlan_put_tag(skb, veth_TCI); if (!skb) { stats->tx_dropped++; + /* Free the extra copy, assuming this is a non-recoverable + * issue and we don't want calling code to retry. + */ + kfree_skb(skb2); return 0; } @@ -500,13 +510,24 @@ int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) veth->h_vlan_proto, veth->h_vlan_TCI, veth->h_vlan_encapsulated_proto); #endif - stats->tx_packets++; /* for statics only */ - stats->tx_bytes += skb->len; - skb->dev = VLAN_DEV_INFO(dev)->real_dev; - dev_queue_xmit(skb); - return 0; + { + int rv = dev_queue_xmit(skb); + if (rv == 0) { + /* Was success, need to free the skb reference since + * we bumped up the user count above. If there was an + * error instead, then the skb2 will not be freed, and so + * the calling code will be able to re-send it. 
+ */ + + stats->tx_packets++; /* for statistics only */ + stats->tx_bytes += skb2->len; + + kfree_skb(skb2); + } + return rv; + } } int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) diff --git a/net/Kconfig b/net/Kconfig index cdba08c..7494943 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -163,6 +163,7 @@ source "net/tipc/Kconfig" source "net/atm/Kconfig" source "net/bridge/Kconfig" source "net/8021q/Kconfig" +source "net/redir/Kconfig" source "net/decnet/Kconfig" source "net/llc/Kconfig" source "net/ipx/Kconfig" @@ -205,6 +206,14 @@ config NET_TCPPROBE To compile this code as a module, choose M here: the module will be called tcp_probe. +config SUPPORT_SEND_BAD_CRC + bool "Support Send Bad CRC (USE WITH CAUTION)" + ---help--- + When enabled, one can send a specially crafted packet to the ethernet + device via a raw socket and it will be sent with the last 4 bytes of + the packet as the ethernet CRC. Requires driver support. Current driver + support is limited to e100 and e1000. 
+ endmenu endmenu diff --git a/net/Makefile b/net/Makefile index bbe7d2a..3a448da 100644 --- a/net/Makefile +++ b/net/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_MAC80211) += mac80211/ obj-$(CONFIG_IEEE80211) += ieee80211/ obj-$(CONFIG_TIPC) += tipc/ obj-$(CONFIG_NETLABEL) += netlabel/ +obj-$(CONFIG_REDIRDEV) += redir/ obj-$(CONFIG_IUCV) += iucv/ obj-$(CONFIG_RFKILL) += rfkill/ obj-$(CONFIG_NET_9P) += 9p/ diff --git a/net/core/dev.c b/net/core/dev.c index a76021c..3f347fc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include @@ -119,6 +120,22 @@ #include #include +#if defined(CONFIG_NET_PKTGEN) || defined(CONFIG_NET_PKTGEN_MODULE) +#include "pktgen.h" + +#warning "Compiling dev.c for pktgen."; + +int (*handle_pktgen_hook)(struct sk_buff *skb) = NULL; +EXPORT_SYMBOL(handle_pktgen_hook); + +static __inline__ int handle_pktgen_rcv(struct sk_buff* skb) { + if (handle_pktgen_hook) { + return handle_pktgen_hook(skb); + } + return -1; +} +#endif + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -1919,6 +1936,7 @@ static int ing_filter(struct sk_buff *skb) } #endif + int netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; @@ -1947,6 +1965,11 @@ int netif_receive_skb(struct sk_buff *skb) skb_reset_transport_header(skb); skb->mac_len = skb->network_header - skb->mac_header; + /* Set the default 'mark' for this skb. dflt_skb_mark may be set through + * the /sys/class/net/[dev-name]/dflt_skb_mark file. + */ + skb->mark = skb->dev->dflt_skb_mark; + pt_prev = NULL; rcu_read_lock(); @@ -1992,6 +2015,16 @@ ncls: if (!skb) goto out; +#if defined(CONFIG_NET_PKTGEN) || defined(CONFIG_NET_PKTGEN_MODULE) + if ((skb->dev->pkt_dev) && + (handle_pktgen_rcv(skb) >= 0)) { + /* Pktgen may consume the packet, no need to send + * to further protocols. 
+ */ + goto out; + } +#endif + type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) { if (ptype->type == type && @@ -4134,6 +4167,10 @@ EXPORT_SYMBOL(net_enable_timestamp); EXPORT_SYMBOL(net_disable_timestamp); EXPORT_SYMBOL(dev_get_flags); +#if defined(CONFIG_NET_PKTGEN) || defined(CONFIG_NET_PKTGEN_MODULE) +EXPORT_SYMBOL(handle_pktgen_rcv); +#endif + #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) EXPORT_SYMBOL(br_handle_frame_hook); EXPORT_SYMBOL(br_fdb_get_hook); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c5e0593..095d1eb 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1,4 +1,4 @@ -/* +/* -*- linux-c -*- * net/core/ethtool.c - Ethtool ioctl handler * Copyright (c) 2003 Matthew Wilcox * @@ -35,6 +35,12 @@ u32 ethtool_op_get_tx_csum(struct net_device *dev) return (dev->features & NETIF_F_ALL_CSUM) != 0; } +u32 ethtool_op_get_rx_all(struct net_device *dev, u32* retval) +{ + *retval = ((dev->priv_flags & IFF_ACCEPT_ALL_FRAMES) != 0); + return 0; +} + int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) { if (data) @@ -783,6 +789,88 @@ static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) return 0; } + +static int ethtool_get_rx_all(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GSG }; + int rv = 0; + + if (!dev->ethtool_ops->get_rx_all) + return -EOPNOTSUPP; + + if ((rv = dev->ethtool_ops->get_rx_all(dev, &edata.data)) < 0) { + return rv; + } + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + + +static int ethtool_set_rx_all(struct net_device *dev, void *useraddr) +{ + struct ethtool_value id; + + if (!dev->ethtool_ops->set_rx_all) + return -EOPNOTSUPP; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + return dev->ethtool_ops->set_rx_all(dev, id.data); +} + +static int ethtool_get_rx_fcs(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata = 
{ ETHTOOL_GSG }; + int rv = 0; + + if (!dev->ethtool_ops->get_save_fcs) + return -EOPNOTSUPP; + + if ((rv = dev->ethtool_ops->get_save_fcs(dev, &edata.data)) < 0) { + return rv; + } + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + + +static int ethtool_set_rx_fcs(struct net_device *dev, void *useraddr) +{ + struct ethtool_value id; + + if (!dev->ethtool_ops->set_save_fcs) + return -EOPNOTSUPP; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + return dev->ethtool_ops->set_save_fcs(dev, id.data); +} + + +/* Handle some generic ethtool commands here */ +static int ethtool_get_netdev_stats(struct net_device *dev, void *useraddr) { + + struct ethtool_ndstats* nds = (struct ethtool_ndstats*)(useraddr); + + struct net_device_stats *stats = dev->get_stats(dev); + if (stats) { + if (copy_to_user(nds->data, stats, sizeof(*stats))) { + return -EFAULT; + } + } + else { + return -EOPNOTSUPP; + } + return 0; +} + + /* The main entry point in this file. Called from net/core/dev.c */ int dev_ethtool(struct ifreq *ifr) @@ -796,9 +884,6 @@ int dev_ethtool(struct ifreq *ifr) if (!dev || !netif_device_present(dev)) return -ENODEV; - if (!dev->ethtool_ops) - return -EOPNOTSUPP; - if (copy_from_user(ðcmd, useraddr, sizeof (ethcmd))) return -EFAULT; @@ -823,12 +908,25 @@ int dev_ethtool(struct ifreq *ifr) return -EPERM; } - if (dev->ethtool_ops->begin) + if (dev->ethtool_ops && dev->ethtool_ops->begin) if ((rc = dev->ethtool_ops->begin(dev)) < 0) return rc; old_features = dev->features; + /* Handle some generic operations that do not require specific + * ethtool handlers. 
+ */ + switch (ethcmd) { + case ETHTOOL_GNDSTATS: + return ethtool_get_netdev_stats(dev, useraddr); + default: + break; + } + + if (!dev->ethtool_ops) + return -EOPNOTSUPP; + switch (ethcmd) { case ETHTOOL_GSET: rc = ethtool_get_settings(dev, useraddr); @@ -917,6 +1015,18 @@ int dev_ethtool(struct ifreq *ifr) case ETHTOOL_PHYS_ID: rc = ethtool_phys_id(dev, useraddr); break; + case ETHTOOL_SETRXALL: + rc = ethtool_set_rx_all(dev, useraddr); + break; + case ETHTOOL_GETRXALL: + rc = ethtool_get_rx_all(dev, useraddr); + break; + case ETHTOOL_SETRXFCS: + rc = ethtool_set_rx_fcs(dev, useraddr); + break; + case ETHTOOL_GETRXFCS: + rc = ethtool_get_rx_fcs(dev, useraddr); + break; case ETHTOOL_GSTATS: rc = ethtool_get_stats(dev, useraddr); break; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5c19b06..4b75b0e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -216,6 +216,20 @@ static ssize_t store_tx_queue_len(struct device *dev, return netdev_store(dev, attr, buf, len, change_tx_queue_len); } +NETDEVICE_SHOW(dflt_skb_mark, fmt_ulong); + +static int change_dflt_skb_mark(struct net_device *net, unsigned long new_val) +{ + net->dflt_skb_mark = new_val; + return 0; +} + +static ssize_t store_dflt_skb_mark(struct device *dev, struct device_attribute* attr, + const char* buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_dflt_skb_mark); +} + NETDEVICE_SHOW(weight, fmt_dec); static int change_weight(struct net_device *net, unsigned long new_weight) @@ -246,6 +260,8 @@ static struct device_attribute net_class_attributes[] = { __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, store_tx_queue_len), + __ATTR(dflt_skb_mark, S_IRUGO | S_IWUSR, show_dflt_skb_mark, + store_dflt_skb_mark), __ATTR(weight, S_IRUGO | S_IWUSR, show_weight, store_weight), {} }; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 803d0c8..0e630a8 100644 --- a/net/core/pktgen.c +++ 
b/net/core/pktgen.c @@ -166,231 +166,55 @@ #include #include /* do_div */ #include - +#include /* sched_clock() */ +#include "pktgen.h" + +#define USE_NQW_CALLBACK +#ifdef USE_NQW_CALLBACK +# include +# if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) +# include +# endif +#endif #define VERSION "pktgen v2.68: Packet Generator for packet performance testing.\n" -/* The buckets are exponential in 'width' */ -#define LAT_BUCKETS_MAX 32 -#define IP_NAME_SZ 32 -#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ -#define MPLS_STACK_BOTTOM htonl(0x00000100) - -/* Device flag bits */ -#define F_IPSRC_RND (1<<0) /* IP-Src Random */ -#define F_IPDST_RND (1<<1) /* IP-Dst Random */ -#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ -#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ -#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ -#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ -#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ -#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ -#define F_MPLS_RND (1<<8) /* Random MPLS labels */ -#define F_VID_RND (1<<9) /* Random VLAN ID */ -#define F_SVID_RND (1<<10) /* Random SVLAN ID */ -#define F_FLOW_SEQ (1<<11) /* Sequential flows */ -#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ - -/* Thread control flag bits */ -#define T_TERMINATE (1<<0) -#define T_STOP (1<<1) /* Stop run */ -#define T_RUN (1<<2) /* Start run */ -#define T_REMDEVALL (1<<3) /* Remove all devs */ -#define T_REMDEV (1<<4) /* Remove one dev */ - -/* If lock -- can be removed after some work */ -#define if_lock(t) spin_lock(&(t->if_lock)); -#define if_unlock(t) spin_unlock(&(t->if_lock)); - -/* Used to help with determining the pkts on receive */ -#define PKTGEN_MAGIC 0xbe9be955 -#define PG_PROC_DIR "pktgen" -#define PGCTRL "pgctrl" static struct proc_dir_entry *pg_proc_dir = NULL; -#define MAX_CFLOWS 65536 - -#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) -#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 
0 : 4) - -struct flow_state { - __be32 cur_daddr; - int count; -#ifdef CONFIG_XFRM - struct xfrm_state *x; -#endif - __u32 flags; -}; - -/* flow flag bits */ -#define F_INIT (1<<0) /* flow has been initialized */ - -struct pktgen_dev { - /* - * Try to keep frequent/infrequent used vars. separated. - */ - struct proc_dir_entry *entry; /* proc file */ - struct pktgen_thread *pg_thread;/* the owner */ - struct list_head list; /* Used for chaining in the thread's run-queue */ - - int running; /* if this changes to false, the test will stop */ - - /* If min != max, then we will either do a linear iteration, or - * we will do a random selection from within the range. - */ - __u32 flags; - int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ - - int min_pkt_size; /* = ETH_ZLEN; */ - int max_pkt_size; /* = ETH_ZLEN; */ - int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ - int nfrags; - __u32 delay_us; /* Default delay */ - __u32 delay_ns; - __u64 count; /* Default No packets to send */ - __u64 sofar; /* How many pkts we've sent so far */ - __u64 tx_bytes; /* How many bytes we've transmitted */ - __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */ - - /* runtime counters relating to clone_skb */ - __u64 next_tx_us; /* timestamp of when to tx next */ - __u32 next_tx_ns; - - __u64 allocated_skbs; - __u32 clone_count; - int last_ok; /* Was last skb sent? - * Or a failed transmit of some sort? This will keep - * sequence numbers in order, for example. - */ - __u64 started_at; /* micro-seconds */ - __u64 stopped_at; /* micro-seconds */ - __u64 idle_acc; /* micro-seconds */ - __u32 seq_num; - - int clone_skb; /* Use multiple SKBs during packet gen. If this number - * is greater than 1, then that many copies of the same - * packet will be sent before a new packet is allocated. - * For instance, if you want to send 1024 identical packets - * before creating a new packet, set clone_skb to 1024. 
- */ - - char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ - char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ - char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ - char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ - - struct in6_addr in6_saddr; - struct in6_addr in6_daddr; - struct in6_addr cur_in6_daddr; - struct in6_addr cur_in6_saddr; - /* For ranges */ - struct in6_addr min_in6_daddr; - struct in6_addr max_in6_daddr; - struct in6_addr min_in6_saddr; - struct in6_addr max_in6_saddr; - - /* If we're doing ranges, random or incremental, then this - * defines the min/max for those ranges. - */ - __be32 saddr_min; /* inclusive, source IP address */ - __be32 saddr_max; /* exclusive, source IP address */ - __be32 daddr_min; /* inclusive, dest IP address */ - __be32 daddr_max; /* exclusive, dest IP address */ - - __u16 udp_src_min; /* inclusive, source UDP port */ - __u16 udp_src_max; /* exclusive, source UDP port */ - __u16 udp_dst_min; /* inclusive, dest UDP port */ - __u16 udp_dst_max; /* exclusive, dest UDP port */ - - /* DSCP + ECN */ - __u8 tos; /* six most significant bits of (former) IPv4 TOS are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 
4) */ - - /* MPLS */ - unsigned nr_labels; /* Depth of stack, 0 = no MPLS */ - __be32 labels[MAX_MPLS_LABELS]; - - /* VLAN/SVLAN (802.1Q/Q-in-Q) */ - __u8 vlan_p; - __u8 vlan_cfi; - __u16 vlan_id; /* 0xffff means no vlan tag */ - - __u8 svlan_p; - __u8 svlan_cfi; - __u16 svlan_id; /* 0xffff means no svlan tag */ - - __u32 src_mac_count; /* How many MACs to iterate through */ - __u32 dst_mac_count; /* How many MACs to iterate through */ - - unsigned char dst_mac[ETH_ALEN]; - unsigned char src_mac[ETH_ALEN]; - - __u32 cur_dst_mac_offset; - __u32 cur_src_mac_offset; - __be32 cur_saddr; - __be32 cur_daddr; - __u16 cur_udp_dst; - __u16 cur_udp_src; - __u32 cur_pkt_size; - - __u8 hh[14]; - /* = { - 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, - - We fill in SRC address later - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x08, 0x00 - }; - */ - __u16 pad; /* pad out the hh struct to an even 16 bytes */ - - struct sk_buff *skb; /* skb we are to transmit next, mainly used for when we - * are transmitting the same one multiple times - */ - struct net_device *odev; /* The out-going device. Note that the device should - * have it's pg_info pointer pointing back to this - * device. This will be set when the user specifies - * the out-going device name (not when the inject is - * started as it used to do.) 
- */ - struct flow_state *flows; - unsigned cflows; /* Concurrent flows (config) */ - unsigned lflow; /* Flow length (config) */ - unsigned nflows; /* accumulated flows (stats) */ - unsigned curfl; /* current sequenced flow (state)*/ -#ifdef CONFIG_XFRM - __u8 ipsmode; /* IPSEC mode (config) */ - __u8 ipsproto; /* IPSEC type (config) */ -#endif - char result[512]; -}; - -struct pktgen_hdr { - __be32 pgh_magic; - __be32 seq_num; - __be32 tv_sec; - __be32 tv_usec; -}; - -struct pktgen_thread { - spinlock_t if_lock; - struct list_head if_list; /* All device here */ - struct list_head th_list; - struct task_struct *tsk; - char result[512]; - u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */ - - /* Field for thread to receive "posted" events terminate, stop ifs etc. */ - - u32 control; - int cpu; - - wait_queue_head_t queue; -}; - #define REMOVE 1 #define FIND 0 +static char* version = VERSION; + +static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove); +static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); +static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char *ifname); +static int pktgen_device_event(struct notifier_block *, unsigned long, void *); +static void pktgen_run_all_threads(int background); +static void pktgen_stop_all_threads_ifs(void); +static int pktgen_stop_device(struct pktgen_dev *pkt_dev); +static void pktgen_stop(struct pktgen_thread *t); +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev, int seq_too); +static void pktgen_mark_device(const char *ifname); +static unsigned int scan_ip6(const char *s, char ip[16]); +static unsigned int fmt_ip6(char *s, const char ip[16]); +static void clear_nqw_hook(struct pktgen_thread* t, struct net_device* dev); +static int set_nqw_hook(struct pktgen_thread* t, struct net_device* dev, int gfp); + +/* Module parameters, defaults. 
*/ +static int pg_count_d = 1000; /* 1000 pkts by default */ +static int pg_delay_d = 0x7FFFFFFF; /* Don't run until someone sets a different delay. */ + +static int pg_clone_skb_d; +static int debug; + +static DEFINE_MUTEX(pktgen_thread_lock); +static LIST_HEAD(pktgen_threads); + +static struct notifier_block pktgen_notifier_block = { + .notifier_call = pktgen_device_event, +}; + /* This code works around the fact that do_div cannot handle two 64-bit numbers, and regular 64-bit division doesn't work on x86 kernels. --Ben @@ -481,13 +305,6 @@ static inline __u64 pg_div64(__u64 n, __u64 base) return tmp; } -static inline __u64 getCurMs(void) -{ - struct timeval tv; - do_gettimeofday(&tv); - return tv_to_ms(&tv); -} - static inline __u64 getCurUs(void) { struct timeval tv; @@ -499,38 +316,18 @@ static inline __u64 tv_diff(const struct timeval *a, const struct timeval *b) { return tv_to_us(a) - tv_to_us(b); } - +/* Since the machine booted. */ +static inline __u64 getRelativeCurUs(void) { + return pg_div(sched_clock(), 1000); +} + +/* Since the machine booted. */ +static inline __u64 getRelativeCurNs(void) { + return sched_clock(); +} + /* old include end */ -static char version[] __initdata = VERSION; - -static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); -static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); -static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, - const char *ifname); -static int pktgen_device_event(struct notifier_block *, unsigned long, void *); -static void pktgen_run_all_threads(void); -static void pktgen_stop_all_threads_ifs(void); -static int pktgen_stop_device(struct pktgen_dev *pkt_dev); -static void pktgen_stop(struct pktgen_thread *t); -static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); - -static unsigned int scan_ip6(const char *s, char ip[16]); -static unsigned int fmt_ip6(char *s, const char ip[16]); - -/* Module parameters, defaults. 
*/ -static int pg_count_d = 1000; /* 1000 pkts by default */ -static int pg_delay_d; -static int pg_clone_skb_d; -static int debug; - -static DEFINE_MUTEX(pktgen_thread_lock); -static LIST_HEAD(pktgen_threads); - -static struct notifier_block pktgen_notifier_block = { - .notifier_call = pktgen_device_event, -}; - /* * /proc handling functions * @@ -538,7 +335,7 @@ static struct notifier_block pktgen_notifier_block = { static int pgctrl_show(struct seq_file *seq, void *v) { - seq_puts(seq, VERSION); + seq_puts(seq, version); return 0; } @@ -566,8 +363,10 @@ static ssize_t pgctrl_write(struct file *file, const char __user * buf, pktgen_stop_all_threads_ifs(); else if (!strcmp(data, "start")) - pktgen_run_all_threads(); - + pktgen_run_all_threads(0); + /* Run in the background. */ + else if (!strcmp(data, "bg_start")) + pktgen_run_all_threads(1); else printk(KERN_WARNING "pktgen: Unknown command: %s\n", data); @@ -582,6 +381,137 @@ static int pgctrl_open(struct inode *inode, struct file *file) return single_open(file, pgctrl_show, PDE(inode)->data); } +static int pg_populate_report(struct pktgen_dev_report* rpt, struct pktgen_dev* pkt_dev) { + int i; + + memset(rpt, 0, sizeof(*rpt)); + rpt->api_version = 1; + rpt->flags = pkt_dev->flags; + strncpy(rpt->thread_name, pkt_dev->pg_thread->tsk->comm, 32); + strncpy(rpt->interface_name, pkt_dev->ifname, 32); + rpt->min_pkt_size = pkt_dev->min_pkt_size; + rpt->max_pkt_size = pkt_dev->max_pkt_size; + rpt->clone_skb = pkt_dev->clone_skb; + rpt->peer_clone_skb = pkt_dev->peer_clone_skb; + rpt->nfrags = pkt_dev->nfrags; + + strncpy(rpt->dst_min, pkt_dev->dst_min, IP_NAME_SZ); + strncpy(rpt->dst_max, pkt_dev->dst_max, IP_NAME_SZ); + strncpy(rpt->src_min, pkt_dev->src_min, IP_NAME_SZ); + strncpy(rpt->src_max, pkt_dev->src_max, IP_NAME_SZ); + + memcpy(&rpt->in6_saddr, &pkt_dev->in6_saddr, sizeof(struct in6_addr)); + memcpy(&rpt->in6_daddr, &pkt_dev->in6_daddr, sizeof(struct in6_addr)); + + /* For ranges */ + 
memcpy(&rpt->min_in6_daddr, &pkt_dev->min_in6_daddr, sizeof(struct in6_addr)); + memcpy(&rpt->max_in6_daddr, &pkt_dev->max_in6_daddr, sizeof(struct in6_addr)); + memcpy(&rpt->min_in6_saddr, &pkt_dev->min_in6_saddr, sizeof(struct in6_addr)); + memcpy(&rpt->max_in6_saddr, &pkt_dev->max_in6_saddr, sizeof(struct in6_addr)); + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. + */ + rpt->saddr_min = pkt_dev->saddr_min; + rpt->saddr_max = pkt_dev->saddr_max; + rpt->daddr_min = pkt_dev->daddr_min; + rpt->daddr_max = pkt_dev->daddr_max; + + rpt->udp_src_min = pkt_dev->udp_src_min; + rpt->udp_src_max = pkt_dev->udp_src_max; + rpt->udp_dst_min = pkt_dev->udp_dst_min; + rpt->udp_dst_max = pkt_dev->udp_dst_max; + + /* MPLS */ + rpt->nr_labels = pkt_dev->nr_labels; /* Depth of stack, 0 = no MPLS */ + for (i = 0; ilabels[i] = pkt_dev->labels[i]; + } + + rpt->src_mac_count = pkt_dev->src_mac_count; + rpt->dst_mac_count = pkt_dev->dst_mac_count; + + memcpy(&rpt->dst_mac, &pkt_dev->dst_mac, ETH_ALEN); + memcpy(&rpt->src_mac, &pkt_dev->src_mac, ETH_ALEN); + + rpt->nflows = pkt_dev->nflows; + rpt->cflows = pkt_dev->cflows; + rpt->lflow = pkt_dev->lflow; + + rpt->delay_ns = pkt_dev->delay_ns; + rpt->count = pkt_dev->count; /* Default No packets to send */ + rpt->sofar = pkt_dev->sofar; /* How many pkts we've sent so far */ + rpt->tx_bytes = pkt_dev->tx_bytes; /* How many bytes we've transmitted */ + rpt->errors = pkt_dev->errors; /* Errors when trying to transmit, pkts will be re-sent */ + + /* Fields relating to receiving pkts */ + rpt->avg_latency = pkt_dev->avg_latency; /* in micro-seconds */ + rpt->min_latency = pkt_dev->min_latency; + rpt->max_latency = pkt_dev->max_latency; + for (i = 0; ilatency_bkts[i] = pkt_dev->latency_bkts[i]; + } + rpt->pkts_rcvd_since_clear = pkt_dev->pkts_rcvd_since_clear; + + rpt->ooo_rcvd = pkt_dev->ooo_rcvd; + rpt->pkts_rcvd = pkt_dev->pkts_rcvd; + rpt->dup_rcvd = pkt_dev->dup_rcvd; + 
rpt->bytes_rcvd = pkt_dev->bytes_rcvd; + rpt->seq_gap_rcvd = pkt_dev->seq_gap_rcvd; + rpt->non_pg_pkts_rcvd = pkt_dev->non_pg_pkts_rcvd; + return 0; +}; /* populate report */ + + +int pktgen_proc_ioctl(struct inode* inode, struct file* file, unsigned int cmd, + unsigned long arg) { + int err = 0; + struct pktgen_ioctl_info args; + struct pktgen_dev* pkt_dev = NULL; + + if (copy_from_user(&args, (void*)arg, sizeof(args))) { + return -EFAULT; + } + + /* Null terminate the names */ + args.thread_name[31] = 0; + args.interface_name[31] = 0; + + /* printk("pktgen: thread_name: %s interface_name: %s\n", + * args.thread_name, args.interface_name); + */ + + switch (cmd) { + case GET_PKTGEN_INTERFACE_INFO: { + mutex_lock(&pktgen_thread_lock); + pkt_dev = __pktgen_NN_threads(args.interface_name, FIND); + if (pkt_dev) { + pg_populate_report(&(args.report), pkt_dev); + if (copy_to_user((void*)(arg), &args, sizeof(args))) { + printk("ERROR: pktgen: copy_to_user failed.\n"); + err = -EFAULT; + } + else { + err = 0; + } + } + else { + printk("ERROR: pktgen: Could not find interface -:%s:-\n", + args.interface_name); + err = -ENODEV; + } + mutex_unlock(&pktgen_thread_lock); + break; + } + default: + printk("%s: Unknown pktgen IOCTL: %x \n", __FUNCTION__, + cmd); + return -EINVAL; + } + + return err; +}/* pktgen_proc_ioctl */ + static const struct file_operations pktgen_fops = { .owner = THIS_MODULE, .open = pgctrl_open, @@ -589,6 +519,7 @@ static const struct file_operations pktgen_fops = { .llseek = seq_lseek, .write = pgctrl_write, .release = single_release, + .ioctl = pktgen_proc_ioctl, }; static int pktgen_if_show(struct seq_file *seq, void *v) @@ -604,11 +535,12 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); - seq_printf(seq, - " frags: %d delay: %u clone_skb: %d ifname: %s\n", - pkt_dev->nfrags, - 1000 * pkt_dev->delay_us + pkt_dev->delay_ns, - pkt_dev->clone_skb, 
pkt_dev->odev->name); + seq_printf(seq, + " frags: %d delay: %lluns clone_skb: %d peer_clone_skb: %d ifname: %s\n", + pkt_dev->nfrags, + (unsigned long long)pkt_dev->delay_ns, + pkt_dev->clone_skb, pkt_dev->peer_clone_skb, + pkt_dev->ifname); seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); @@ -741,11 +673,32 @@ static int pktgen_if_show(struct seq_file *seq, void *v) stopped = now; /* not really stopped, more like last-running-at */ seq_printf(seq, - "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n", + "Current:\n tx-pkts: %llu tx-errors: %llu tx-bytes: %llu\n", (unsigned long long)pkt_dev->sofar, - (unsigned long long)pkt_dev->errors, (unsigned long long)sa, + (unsigned long long)pkt_dev->errors, + (unsigned long long)pkt_dev->tx_bytes); + seq_printf(seq, + " rx-pkts: %llu rx-bytes: %llu alloc_skbs: %llu oom_alloc_skbs: %llu\n", + (unsigned long long)pkt_dev->pkts_rcvd, + (unsigned long long)pkt_dev->bytes_rcvd, + (unsigned long long)pkt_dev->allocated_skbs, + (unsigned long long)pkt_dev->oom_on_alloc_skb); + + + seq_printf(seq, + " blocked: %s next-tx-ns: %llu (%lli) started: %lluus stopped: %lluus idle: %lluns\n", + pkt_dev->tx_blocked ? 
"TRUE" : "false", + (unsigned long long)pkt_dev->next_tx_ns, + (long long)(pkt_dev->next_tx_ns - getRelativeCurNs()), + (unsigned long long)sa, (unsigned long long)stopped, - (unsigned long long)pkt_dev->idle_acc); + (unsigned long long)pkt_dev->idle_acc_ns); + seq_printf(seq, + " nanodelays: %llu sleeps: %llu queue_stopped: %llu tx-early: %llu\n", + (unsigned long long)pkt_dev->nanodelays, + (unsigned long long)pkt_dev->sleeps, + (unsigned long long)pkt_dev->queue_stopped, + (unsigned long long)pkt_dev->req_tx_early); seq_printf(seq, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", @@ -1029,15 +982,11 @@ static ssize_t pktgen_if_write(struct file *file, return len; } i += len; - if (value == 0x7FFFFFFF) { - pkt_dev->delay_us = 0x7FFFFFFF; - pkt_dev->delay_ns = 0; - } else { - pkt_dev->delay_us = value / 1000; - pkt_dev->delay_ns = value % 1000; + pkt_dev->delay_ns = value; + if ((getRelativeCurNs() + pkt_dev->delay_ns) > pkt_dev->next_tx_ns) { + pkt_dev->next_tx_ns = getRelativeCurNs() + pkt_dev->delay_ns; } - sprintf(pg_result, "OK: delay=%u", - 1000 * pkt_dev->delay_us + pkt_dev->delay_ns); + sprintf(pg_result, "OK: delay=%lluns", (unsigned long long)pkt_dev->delay_ns); return count; } if (!strcmp(name, "udp_src_min")) { @@ -1103,6 +1052,17 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); return count; } + if (!strcmp(name, "peer_clone_skb")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { + return len; + } + i += len; + pkt_dev->peer_clone_skb = value; + + sprintf(pg_result, "OK: peer_clone_skb=%d", pkt_dev->peer_clone_skb); + return count; + } if (!strcmp(name, "count")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) { @@ -1224,6 +1184,7 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->flags &= ~F_IPV6; else { + printk("pktgen: Flag -:%s:- unknown\n", f); sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", f, @@ -1490,13 +1451,12 @@ static ssize_t pktgen_if_write(struct file *file, /* Set up Src MAC */ if (compare_ether_addr(old_smac, pkt_dev->src_mac)) memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN); - sprintf(pg_result, "OK: srcmac"); return count; } if (!strcmp(name, "clear_counters")) { - pktgen_clear_counters(pkt_dev); + pktgen_clear_counters(pkt_dev, 0); sprintf(pg_result, "OK: Clearing counters.\n"); return count; } @@ -1693,6 +1653,7 @@ static ssize_t pktgen_if_write(struct file *file, return count; } + printk("pktgen: No such parameter \"%s\"\n", name); sprintf(pkt_dev->result, "No such parameter \"%s\"", name); return -EINVAL; } @@ -1709,6 +1670,7 @@ static const struct file_operations pktgen_if_fops = { .llseek = seq_lseek, .write = pktgen_if_write, .release = single_release, + .ioctl = pktgen_proc_ioctl, }; static int pktgen_thread_show(struct seq_file *seq, void *v) @@ -1718,12 +1680,15 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) BUG_ON(!t); - seq_printf(seq, "Name: %s max_before_softirq: %d\n", - t->tsk->comm, t->max_before_softirq); + mutex_lock(&pktgen_thread_lock); + /* versioning info. CFG_RT means we do not busy-spin, so can be configured for + * real-time scheduling if user-space so desires. 
*/ + seq_printf(seq, "VERSION-2 CFG_RT\n"); + seq_printf(seq, "PID: %d Name: %s max_before_softirq: %d\n", + t->pid, t->tsk->comm, t->max_before_softirq); seq_printf(seq, "Running: "); - if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) if (pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odev->name); @@ -1739,8 +1704,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) else seq_printf(seq, "\nResult: NA\n"); - if_unlock(t); - + mutex_unlock(&pktgen_thread_lock); return 0; } @@ -1809,18 +1773,42 @@ static ssize_t pktgen_thread_write(struct file *file, return -EFAULT; i += len; mutex_lock(&pktgen_thread_lock); - pktgen_add_device(t, f); + t->control_arg = f; + t->control |= T_ADD_DEV; + while (t->control & T_ADD_DEV) { + schedule_timeout_interruptible(msecs_to_jiffies(10)); + } + t->control_arg = 0; mutex_unlock(&pktgen_thread_lock); ret = count; sprintf(pg_result, "OK: add_device=%s", f); goto out; } + if (!strcmp(name, "rem_device")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) { + ret = len; + goto out; + } + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + pktgen_mark_device(f); + ret = count; + sprintf(pg_result, "OK: rem_device=%s", f); + goto out; + } + if (!strcmp(name, "rem_device_all")) { mutex_lock(&pktgen_thread_lock); t->control |= T_REMDEVALL; mutex_unlock(&pktgen_thread_lock); - schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */ + while (t->control & T_REMDEVALL) { + schedule_timeout_interruptible(msecs_to_jiffies(10)); + } ret = count; sprintf(pg_result, "OK: rem_device_all"); goto out; @@ -1836,6 +1824,8 @@ static ssize_t pktgen_thread_write(struct file *file, goto out; } + printk("pktgen: un-known command to pktgen_thread: -:%s:-\n", name); + ret = -EINVAL; out: return ret; @@ -1853,8 +1843,10 @@ static const struct file_operations pktgen_thread_fops = { .llseek = seq_lseek, .write = pktgen_thread_write, 
.release = single_release, + .ioctl = pktgen_proc_ioctl, }; + /* Think find or remove for NN */ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) { @@ -1865,10 +1857,8 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) pkt_dev = pktgen_find_dev(t, ifname); if (pkt_dev) { if (remove) { - if_lock(t); pkt_dev->removal_mark = 1; t->control |= T_REMDEV; - if_unlock(t); } break; } @@ -1959,31 +1949,45 @@ static int pktgen_device_event(struct notifier_block *unused, /* Associate pktgen_dev with a device. */ -static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) +static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, struct pktgen_thread* t) { struct net_device *odev; int err; /* Clean old setups */ if (pkt_dev->odev) { +#ifdef USE_NQW_CALLBACK + /* Set the nqw callback hooks */ + rtnl_lock(); + clear_nqw_hook(t, pkt_dev->odev); + rtnl_unlock(); +#endif + pkt_dev->odev->pkt_dev = NULL; dev_put(pkt_dev->odev); pkt_dev->odev = NULL; } - odev = dev_get_by_name(ifname); + odev = dev_get_by_name(pkt_dev->ifname); if (!odev) { - printk(KERN_ERR "pktgen: no such netdevice: \"%s\"\n", ifname); + printk(KERN_ERR "pktgen: no such netdevice: \"%s\"\n", pkt_dev->ifname); return -ENODEV; } if (odev->type != ARPHRD_ETHER) { - printk(KERN_ERR "pktgen: not an ethernet device: \"%s\"\n", ifname); + printk(KERN_ERR "pktgen: not an ethernet device: \"%s\"\n", pkt_dev->ifname); err = -EINVAL; } else if (!netif_running(odev)) { - printk(KERN_ERR "pktgen: device is down: \"%s\"\n", ifname); + printk(KERN_ERR "pktgen: device is down: \"%s\"\n", pkt_dev->ifname); err = -ENETDOWN; } else { pkt_dev->odev = odev; +#ifdef USE_NQW_CALLBACK + /* Set the nqw callback hooks */ + rtnl_lock(); + set_nqw_hook(t, pkt_dev->odev, GFP_ATOMIC); + rtnl_unlock(); +#endif + pkt_dev->odev->pkt_dev = pkt_dev; return 0; } @@ -1996,6 +2000,10 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) */ static void 
pktgen_setup_inject(struct pktgen_dev *pkt_dev) { + /* Try once more, just in case it works now. */ + if (!pkt_dev->odev) + pktgen_setup_dev(pkt_dev, pkt_dev->pg_thread); + if (!pkt_dev->odev) { printk(KERN_ERR "pktgen: ERROR: pkt_dev->odev == NULL in " "setup_inject.\n"); @@ -2008,6 +2016,9 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) if (is_zero_ether_addr(pkt_dev->src_mac)) memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN); + else + memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN); + /* Set up Dest MAC */ memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); @@ -2099,30 +2110,192 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) pkt_dev->nflows = 0; } -static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) -{ - __u64 start; - __u64 now; - - start = now = getCurUs(); - printk(KERN_INFO "sleeping for %d\n", (int)(spin_until_us - now)); - while (now < spin_until_us) { - /* TODO: optimize sleeping behavior */ - if (spin_until_us - now > jiffies_to_usecs(1) + 1) - schedule_timeout_interruptible(1); - else if (spin_until_us - now > 100) { - do_softirq(); - if (!pkt_dev->running) - return; - if (need_resched()) - schedule(); + +#ifdef USE_NQW_CALLBACK +/* Runs from interrupt */ +int pg_notify_queue_woken(struct net_device* dev) { + /* Find the thread that needs waking. */ + struct pktgen_thread* t = ((struct pg_nqw_data*)(dev->nqw_data))->pg_thread; + t->control |= T_WAKE_BLOCKED; + wake_up_interruptible(&(t->queue)); + return 0; +} + +/* Must hold RTNL lock while calling this. */ +static int set_nqw_hook(struct pktgen_thread* t, struct net_device* dev, int gfp) { + /* The notify-queue-woken magic only works for physical + * devices at this time. So, apply hook to underlying + * device. 
+ */ + struct pg_nqw_data* nqwd; + ASSERT_RTNL(); + BUG_ON(!t); + + if (!dev) { + WARN_ON(!dev); + return -ENODEV; + } +#if 0 +#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) + if (dev->macvlan_port) { + struct macvlan_dev *vlan = netdev_priv(dev); + printk("pktgen: setting nqw_hook on lower mac-vlan dev: %p\n", vlan->lowerdev); + return set_nqw_hook(t, vlan->lowerdev, gfp); + } +#endif +#endif + + if (dev->priv_flags & IFF_802_1Q_VLAN) { + printk("pktgen: setting nqw_hook on real-dev of .1q vlan: %s\n", dev->name); + return set_nqw_hook(t, VLAN_DEV_INFO(dev)->real_dev, gfp); + } + + nqwd = (struct pg_nqw_data*)(dev->nqw_data); + + if (nqwd) { + if (nqwd->magic == PG_NQW_MAGIC) { + if (nqwd->pg_thread == t) { + atomic_inc(&(nqwd->nqw_ref_count)); + + printk("pktgen: Incremented nqw_ref_count: %d device: %s\n", + (int)(atomic_read(&(nqwd->nqw_ref_count))), dev->name); + return 0; + } + else { + printk("pktgen: ERROR: set_nqw_hook: nqwd thread does not match, dev: %s", + dev->name); + return -EINVAL; + } + } + else { + printk("wanlink: WARNING: set_nqw_hook: nqwd magic is NOT WanLink, dev: %s magic: 0x%x", + dev->name, nqwd->magic); + return 0; + } + } + else { + nqwd = kmalloc(sizeof(*nqwd), gfp); + if (nqwd) { + memset(nqwd, 0, sizeof(*nqwd)); + nqwd->magic = PG_NQW_MAGIC; + atomic_inc(&(nqwd->nqw_ref_count)); + nqwd->pg_thread = t; + dev->nqw_data = nqwd; + dev->notify_queue_woken = pg_notify_queue_woken; + printk("pktgen: Added nqw callback to device: %s\n", + dev->name); + return 0; } + else { + printk("pktgen: ERROR: could not allocate nqwd for dev: %s\n", dev->name); + return -ENOBUFS; + } + } +}//set_nqw_hook + - now = getCurUs(); +/* Must hold RTNL lock while calling this. */ +static void clear_nqw_hook(struct pktgen_thread* t, struct net_device* dev) { + /* The notify-queue-woken magic only works for physical + * devices at this time. So, apply hook to underlying + * device. 
+ */ + ASSERT_RTNL(); + BUG_ON(!t); + +#if 0 +#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) + if (dev->macvlan_port) { + struct macvlan_vlan *vlan = dev->priv; + clear_nqw_hook(t, vlan->lowerdev); + return; + } +#endif +#endif + + if (dev->priv_flags & IFF_802_1Q_VLAN) { + clear_nqw_hook(t, VLAN_DEV_INFO(dev)->real_dev); + return; } - pkt_dev->idle_acc += now - start; -} + if (dev->nqw_data) { + struct pg_nqw_data* nqwd = (struct pg_nqw_data*)(dev->nqw_data); + if (nqwd->magic == PG_NQW_MAGIC) { + if (t != nqwd->pg_thread) { + printk("pktgen ERROR: t != nqwd->pg_thread\n"); + } + atomic_dec(&(nqwd->nqw_ref_count)); + + printk("pktgen: Decremented nqw_ref_count: %d device: %s\n", + (int)(atomic_read(&(nqwd->nqw_ref_count))), + dev->name); + + BUG_ON(atomic_read(&(nqwd->nqw_ref_count)) < 0); + + if (atomic_read(&(nqwd->nqw_ref_count)) == 0) { + printk("pktgen: Removing nqw reference from device: %s\n", + dev->name); + dev->notify_queue_woken = NULL; + dev->nqw_data = NULL; + kfree(nqwd); + } + } + else { + printk("pktgen: WARNING: clear_nqw_hook: nqwd magic is NOT PKT-GEN, dev: %s magic: 0x%x", + dev->name, nqwd->magic); + } + } + else { + printk("pktgen: Warning: nqw_data is null in clear_nqw_hook, dev: %s\n", + dev->name); + } +}//clear_nqw_hook + +#endif + + +/* delay_ns is in nano-seconds */ +static void pg_nanodelay(u64 delay_ns, struct pktgen_dev* info) { + u64 idle_start = getRelativeCurNs(); + u64 last_time; + u64 _diff; + u64 itmp = idle_start; + struct pktgen_dev *p = NULL; + struct pktgen_thread* t = info->pg_thread; + + info->nanodelays++; + info->accum_delay_ns += delay_ns; + while (info->accum_delay_ns > PG_MAX_ACCUM_DELAY_NS) { + info->sleeps++; + interruptible_sleep_on_timeout(&(t->queue), 1); + /* will wake after one tick */ + last_time = itmp; + + /* Subtract delay from all interfaces for this thread, since all are blocked when + * any are blocked. 
+ */ + itmp = getRelativeCurNs(); + _diff = (itmp - last_time); + list_for_each_entry(p, &t->if_list, list) { + p->accum_delay_ns -= _diff; + /* Limit saving up too much time... */ + if (p->accum_delay_ns < -10000000) { + p->accum_delay_ns = -10000000; + } + } + + /* For accounting, only charge this guy for the idle though...*/ + info->idle_acc_ns += _diff; + + /* break out if we are stopped or if we should transmit (maybe our ipg changed?) */ + if (info->removal_mark || (itmp >= info->next_tx_ns) || + (t->control && T_WAKE_BLOCKED) || + (t->control && T_STOP)) { + break; + } + }/* while */ +}//pg_nanodelay + static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { @@ -2508,7 +2681,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ - + int cur_pkt_size; if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2522,12 +2695,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, mod_cur_headers(pkt_dev); datalen = (odev->hard_header_len + 16) & ~0xf; - skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + datalen + + cur_pkt_size = pkt_dev->cur_pkt_size; /* protect against race */ + skb = alloc_skb(cur_pkt_size + 64 + datalen + pkt_dev->pkt_overhead, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } + pkt_dev->seq_num++; /* Increase the pktgen sequence number for the next packet. 
*/ skb_reserve(skb, datalen); @@ -2565,7 +2740,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, *(__be16 *) & eth[12] = protocol; /* Eth + IPh + UDPh + mpls */ - datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - + datalen = cur_pkt_size - 14 - 20 - 8 - pkt_dev->pkt_overhead; if (datalen < sizeof(struct pktgen_hdr)) datalen = sizeof(struct pktgen_hdr); @@ -2851,7 +3026,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ - + int cur_pkt_size; + if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2863,7 +3039,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, */ mod_cur_headers(pkt_dev); - skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16 + + cur_pkt_size = pkt_dev->cur_pkt_size; + skb = alloc_skb(cur_pkt_size + 64 + 16 + pkt_dev->pkt_overhead, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev->result, "No memory"); @@ -2906,7 +3083,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, *(__be16 *) & eth[12] = protocol; /* Eth + IPh + UDPh + mpls */ - datalen = pkt_dev->cur_pkt_size - 14 - + datalen = cur_pkt_size - 14 - sizeof(struct ipv6hdr) - sizeof(struct udphdr) - pkt_dev->pkt_overhead; @@ -3009,7 +3186,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, pgh->tv_sec = htonl(timestamp.tv_sec); pgh->tv_usec = htonl(timestamp.tv_usec); } - /* pkt_dev->seq_num++; FF: you really mean this? */ + pkt_dev->seq_num++; /* Increase the pktgen seq number for the next packet. 
*/ return skb; } @@ -3023,13 +3200,206 @@ static inline struct sk_buff *fill_packet(struct net_device *odev, return fill_packet_ipv4(odev, pkt_dev); } -static void pktgen_clear_counters(struct pktgen_dev *pkt_dev) -{ - pkt_dev->seq_num = 1; - pkt_dev->idle_acc = 0; + +static void record_latency(struct pktgen_dev* pkt_dev, int latency) { + /* NOTE: Latency can be negative */ + int div = 100; + int diff; + int vl; + int i; + + pkt_dev->pkts_rcvd_since_clear++; + + if (pkt_dev->pkts_rcvd_since_clear < 100) { + div = pkt_dev->pkts_rcvd; + if (pkt_dev->pkts_rcvd_since_clear == 1) { + pkt_dev->avg_latency = latency; + } + } + + if ((div + 1) == 0) { + pkt_dev->avg_latency = 0; + } + else { + pkt_dev->avg_latency = ((pkt_dev->avg_latency * div + latency) / (div + 1)); + } + + if (latency < pkt_dev->min_latency) { + pkt_dev->min_latency = latency; + } + if (latency > pkt_dev->max_latency) { + pkt_dev->max_latency = latency; + } + + /* Place the latency in the right 'bucket' */ + diff = (latency - pkt_dev->min_latency); + for (i = 0; ilatency_bkts[i]++; + break; + } + } +}/* record latency */ + + +/* Returns < 0 if the skb is not a pktgen buffer. */ +int pktgen_receive(struct sk_buff* skb) { + /* See if we have a pktgen packet */ + /* TODO: Add support for detecting IPv6, TCP packets too. This will only + * catch UDP at the moment. --Ben + */ + /* printk("pktgen-rcv, skb->len: %d\n", skb->len); */ + + /* If this is a paged skb, make sure we pull up + * whatever data we need to look at. */ + if (!pskb_may_pull(skb, 20 + 8 + sizeof(struct pktgen_hdr))) { + return -1; + } + + if ((skb->len >= (20 + 8 + sizeof(struct pktgen_hdr))) && + (skb->protocol == __constant_htons(ETH_P_IP))) { + struct pktgen_hdr* pgh; + + /* It's IP, and long enough, lets check the magic number. + * TODO: This is a hack not always guaranteed to catch the right + * packets. 
+ */ + + /* printk("Length & protocol passed, skb->data: %p, raw: %p\n", + skb->data, skb->h.raw); */ + + pgh = (struct pktgen_hdr*)(skb->data + 20 + 8); + + /* + tmp = (char*)(skb->data); + for (i = 0; i<90; i++) { + printk("%02hx ", tmp[i]); + if (((i + 1) % 15) == 0) { + printk("\n"); + } + } + printk("\n"); + */ + + if (pgh->pgh_magic == __constant_ntohl(PKTGEN_MAGIC)) { + struct net_device* dev = skb->dev; + struct pktgen_dev* pkt_dev; + __u32 seq = ntohl(pgh->seq_num); + + // TODO: Need lock..maybe + pkt_dev = dev->pkt_dev; + + if (!pkt_dev) { + return -1; + } + + pkt_dev->pkts_rcvd++; + pkt_dev->bytes_rcvd += skb->len; + + /* Check for out-of-sequence packets */ + if (pkt_dev->last_seq_rcvd == seq) { + pkt_dev->dup_rcvd++; + pkt_dev->dup_since_incr++; + } + else { + __s64 rx; + __s64 tx; + struct timeval txtv; + if (! skb->tstamp.tv64) { + __net_timestamp(skb); + } + skb_get_timestamp(skb, &txtv); + rx = tv_to_us(&txtv); + + txtv.tv_usec = ntohl(pgh->tv_usec); + txtv.tv_sec = ntohl(pgh->tv_sec); + tx = tv_to_us(&txtv); + record_latency(pkt_dev, rx - tx); + + if ((pkt_dev->last_seq_rcvd + 1) == seq) { + if ((pkt_dev->peer_clone_skb > 1) && + (pkt_dev->peer_clone_skb > (pkt_dev->dup_since_incr + 1))) { + + pkt_dev->seq_gap_rcvd += (pkt_dev->peer_clone_skb - + pkt_dev->dup_since_incr - 1); + } + /* Great, in order...all is well */ + } + else if (pkt_dev->last_seq_rcvd < seq) { + /* sequence gap, means we dropped a pkt most likely */ + if (pkt_dev->peer_clone_skb > 1) { + /* We dropped more than one sequence number's worth, + * and if we're using clone_skb, then this is quite + * a few. This number still will not be exact, but + * it will be closer. 
+ */ + pkt_dev->seq_gap_rcvd += (((seq - pkt_dev->last_seq_rcvd) * + pkt_dev->peer_clone_skb) - + pkt_dev->dup_since_incr); + } + else { + pkt_dev->seq_gap_rcvd += (seq - pkt_dev->last_seq_rcvd - 1); + } + } + else { + pkt_dev->ooo_rcvd++; /* out-of-order */ + } + + pkt_dev->dup_since_incr = 0; + } + pkt_dev->last_seq_rcvd = seq; + kfree_skb(skb); + if (debug > 1) { + printk("done with pktgen_receive, free'd pkt\n"); + } + return 0; + } + } + return -1; /* Let another protocol handle it, it's not for us! */ +}/* pktgen_receive */ + +static void pg_reset_latency_counters(struct pktgen_dev* pkt_dev) { + int i; + pkt_dev->avg_latency = 0; + pkt_dev->min_latency = 0x7fffffff; /* largest integer */ + pkt_dev->max_latency = 0x80000000; /* smallest integer */ + pkt_dev->pkts_rcvd_since_clear = 0; + for (i = 0; ilatency_bkts[i] = 0; + } +} + + +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev, int seq_too) { + pkt_dev->idle_acc_ns = 0; pkt_dev->sofar = 0; pkt_dev->tx_bytes = 0; pkt_dev->errors = 0; + pkt_dev->pkts_rcvd_since_clear = 0; + + pkt_dev->ooo_rcvd = 0; + pkt_dev->dup_rcvd = 0; + pkt_dev->pkts_rcvd = 0; + pkt_dev->bytes_rcvd = 0; + pkt_dev->non_pg_pkts_rcvd = 0; + pkt_dev->seq_gap_rcvd = 0; /* dropped */ + + /* Clear some transient state */ + pkt_dev->accum_delay_ns = 0; + pkt_dev->sleeps = 0; + pkt_dev->nanodelays = 0; + + /* This is a bit of a hack, but it gets the dup counters + * in line so we don't have false alarms on dropped pkts. + */ + if (seq_too) { + pkt_dev->dup_since_incr = pkt_dev->peer_clone_skb - 1; + pkt_dev->seq_num = 0; + pkt_dev->last_seq_rcvd = 0; + } + + pg_reset_latency_counters(pkt_dev); } /* Set up structure for sending pkts, clear counters */ @@ -3041,31 +3411,31 @@ static void pktgen_run(struct pktgen_thread *t) pr_debug("pktgen: entering pktgen_run. %p\n", t); - if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) { + /* If already running, then ignore. */ + if (! 
pkt_dev->running) { + + /** Clear counters before we setup the first inject. */ + pktgen_clear_counters(pkt_dev, 1); - /* - * setup odev and create initial packet. - */ - pktgen_setup_inject(pkt_dev); - - if (pkt_dev->odev) { - pktgen_clear_counters(pkt_dev); - pkt_dev->running = 1; /* Cranke yeself! */ - pkt_dev->skb = NULL; - pkt_dev->started_at = getCurUs(); - pkt_dev->next_tx_us = getCurUs(); /* Transmit immediately */ - pkt_dev->next_tx_ns = 0; - set_pkt_overhead(pkt_dev); - - strcpy(pkt_dev->result, "Starting"); - started++; - } else - strcpy(pkt_dev->result, "Error starting"); + /* + * setup odev and create initial packet. + */ + pktgen_setup_inject(pkt_dev); + + if (pkt_dev->odev) { + pkt_dev->running = 1; /* Cranke yeself! */ + pkt_dev->skb = NULL; + pkt_dev->started_at = getCurUs(); + /* Transmit first pkt after 20ms to let listener get started. */ + pkt_dev->next_tx_ns = getRelativeCurNs() + 20 * 1000000; + + strcpy(pkt_dev->result, "Starting"); + started++; + } else + strcpy(pkt_dev->result, "Error starting"); + } } - if_unlock(t); - if (started) - t->control &= ~(T_STOP); } static void pktgen_stop_all_threads_ifs(void) @@ -3081,66 +3451,11 @@ static void pktgen_stop_all_threads_ifs(void) mutex_unlock(&pktgen_thread_lock); } - -static int thread_is_running(struct pktgen_thread *t) -{ - struct pktgen_dev *pkt_dev; - int res = 0; - - list_for_each_entry(pkt_dev, &t->if_list, list) - if (pkt_dev->running) { - res = 1; - break; - } - return res; -} - -static int pktgen_wait_thread_run(struct pktgen_thread *t) -{ - if_lock(t); - - while (thread_is_running(t)) { - - if_unlock(t); - - msleep_interruptible(100); - - if (signal_pending(current)) - goto signal; - if_lock(t); - } - if_unlock(t); - return 1; -signal: - return 0; -} - -static int pktgen_wait_all_threads_run(void) -{ - struct pktgen_thread *t; - int sig = 1; - - mutex_lock(&pktgen_thread_lock); - - list_for_each_entry(t, &pktgen_threads, th_list) { - sig = pktgen_wait_thread_run(t); - if (sig == 0) - 
break; - } - - if (sig == 0) - list_for_each_entry(t, &pktgen_threads, th_list) - t->control |= (T_STOP); - - mutex_unlock(&pktgen_thread_lock); - return sig; -} - -static void pktgen_run_all_threads(void) -{ +static void pktgen_run_all_threads(int background) { struct pktgen_thread *t; - pr_debug("pktgen: entering pktgen_run_all_threads.\n"); + pr_debug("pktgen: entering pktgen_run_all_threads, background: %d\n", + background); mutex_lock(&pktgen_thread_lock); @@ -3149,9 +3464,14 @@ static void pktgen_run_all_threads(void) mutex_unlock(&pktgen_thread_lock); - schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */ + /* This is a hack at best...disabling, we should not have to depend on this. */ + /*schedule_timeout_interruptible(msecs_to_jiffies(125));*/ /* Propagate thread->control */ - pktgen_wait_all_threads_run(); + // Much harder to get rid of the if_lock if we allow this to block... + if (!background) { + printk("ERROR: non-background mode no longer supported.\n"); + //pktgen_wait_all_threads_run(); + } } static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) @@ -3161,7 +3481,7 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) total_us = pkt_dev->stopped_at - pkt_dev->started_at; - idle = pkt_dev->idle_acc; + idle = do_div(pkt_dev->idle_acc_ns, 1000); p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", (unsigned long long)total_us, @@ -3210,22 +3530,62 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) return 0; } -static struct pktgen_dev *next_to_run(struct pktgen_thread *t) -{ - struct pktgen_dev *pkt_dev, *best = NULL; - - if_lock(t); +/** Find the adapter that needs to tx next. + * We need to take the blocked adapters into account, but can't ignore + * them forever just in case we missed the tx-queue-wake event for some + * reason. 
+ */ +static struct pktgen_dev *next_to_run(struct pktgen_thread *t, u64 now, u64* next_running_delay) { + struct pktgen_dev *pkt_dev = NULL; + struct pktgen_dev *best = NULL; + struct pktgen_dev *best_blocked = NULL; list_for_each_entry(pkt_dev, &t->if_list, list) { if (!pkt_dev->running) continue; - if (best == NULL) - best = pkt_dev; - else if (pkt_dev->next_tx_us < best->next_tx_us) - best = pkt_dev; + if (pkt_dev->tx_blocked) { + if (best_blocked == NULL) + best_blocked = pkt_dev; + else { + if (pkt_dev->next_tx_ns < best_blocked->next_tx_ns) { + best_blocked = pkt_dev; + } + } + } + else { + if (best == NULL) + best = pkt_dev; + else { + if (pkt_dev->next_tx_ns < best->next_tx_ns) { + best = pkt_dev; + } + } + } + } + + /** If we have a blocked device that is more than 1ms late, then try it again first. + * Otherwise, take best non-blocked device. + */ + if (best) { + if (best->next_tx_ns <= now) { + *next_running_delay = 0; + } + else { + *next_running_delay = best->next_tx_ns - now; + } + } + else { + *next_running_delay = 10000000; /* 10ms */ + } + + if (best_blocked && (best_blocked->next_tx_ns < (now - PG_TRY_TX_ANYWAY_NS))) { + return best_blocked; } - if_unlock(t); - return best; + + if (best) { + return best; + } + return best_blocked; } static void pktgen_stop(struct pktgen_thread *t) @@ -3234,8 +3594,6 @@ static void pktgen_stop(struct pktgen_thread *t) pr_debug("pktgen: entering pktgen_stop\n"); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); if (pkt_dev->skb) @@ -3243,8 +3601,6 @@ static void pktgen_stop(struct pktgen_thread *t) pkt_dev->skb = NULL; } - - if_unlock(t); } /* @@ -3258,8 +3614,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) pr_debug("pktgen: entering pktgen_rem_one_if\n"); - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3274,10 +3628,15 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) break; } - - 
if_unlock(t); } +static void pktgen_unblock_all_ifs(struct pktgen_thread *t) { + struct pktgen_dev *p = NULL;; + list_for_each_entry(p, &t->if_list, list) + p->tx_blocked = 0; +}/* wake all writers */ + + static void pktgen_rem_all_ifs(struct pktgen_thread *t) { struct list_head *q, *n; @@ -3286,8 +3645,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) /* Remove all devices, free mem */ pr_debug("pktgen: entering pktgen_rem_all_ifs\n"); - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3297,8 +3654,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) pktgen_remove_device(t, cur); } - - if_unlock(t); } static void pktgen_rem_thread(struct pktgen_thread *t) @@ -3314,27 +3669,26 @@ static void pktgen_rem_thread(struct pktgen_thread *t) mutex_unlock(&pktgen_thread_lock); } -static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) +static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev, u64 now) { - struct net_device *odev = NULL; + struct net_device *odev; __u64 idle_start = 0; int ret; odev = pkt_dev->odev; - if (pkt_dev->delay_us || pkt_dev->delay_ns) { - u64 now; - - now = getCurUs(); - if (now < pkt_dev->next_tx_us) - spin(pkt_dev, pkt_dev->next_tx_us); + if (pkt_dev->delay_ns || (pkt_dev->accum_delay_ns > 0)) { + if (now < pkt_dev->next_tx_ns) { + /* Don't tx early..*/ + pkt_dev->req_tx_early++; + goto out; + } /* This is max DELAY, this has special meaning of * "never transmit" */ - if (pkt_dev->delay_us == 0x7FFFFFFF) { - pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us; - pkt_dev->next_tx_ns = pkt_dev->delay_ns; + if (pkt_dev->delay_ns == 0x7FFFFFFF) { + pkt_dev->next_tx_ns = getRelativeCurNs() + pkt_dev->delay_ns; goto out; } } @@ -3343,26 +3697,17 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) (pkt_dev->skb && netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping))) || need_resched()) { - idle_start = getCurUs(); - + pkt_dev->queue_stopped++; + 
pkt_dev->tx_blocked = 1; + /* change tx time to now to show work was at least attempted. */ + pkt_dev->next_tx_ns = now; if (!netif_running(odev)) { pktgen_stop_device(pkt_dev); if (pkt_dev->skb) kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; - goto out; - } - if (need_resched()) - schedule(); - - pkt_dev->idle_acc += getCurUs() - idle_start; - - if (netif_queue_stopped(odev) || - netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) { - pkt_dev->next_tx_us = getCurUs(); /* TODO */ - pkt_dev->next_tx_ns = 0; - goto out; /* Try the next interface */ } + goto out; /* try next interface */ } if (pkt_dev->last_ok || !pkt_dev->skb) { @@ -3374,10 +3719,11 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->skb = fill_packet(odev, pkt_dev); if (pkt_dev->skb == NULL) { - printk(KERN_ERR "pktgen: ERROR: couldn't " - "allocate skb in fill_packet.\n"); + //printk(KERN_ERR "pktgen: ERROR: couldn't " + // "allocate skb in fill_packet.\n"); schedule(); pkt_dev->clone_count--; /* back out increment, OOM */ + pkt_dev->oom_on_alloc_skb++; goto out; } pkt_dev->allocated_skbs++; @@ -3395,40 +3741,44 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) if (likely(ret == NETDEV_TX_OK)) { pkt_dev->last_ok = 1; pkt_dev->sofar++; - pkt_dev->seq_num++; pkt_dev->tx_bytes += pkt_dev->cur_pkt_size; + pkt_dev->next_tx_ns = getRelativeCurNs() + pkt_dev->delay_ns; + pkt_dev->tx_blocked = 0; } else if (ret == NETDEV_TX_LOCKED && (odev->features & NETIF_F_LLTX)) { cpu_relax(); goto retry_now; } else { /* Retry it next time */ - + static int do_once_hsx_wrn = 1; + if (do_once_hsx_wrn) { + printk(KERN_INFO "pktgen: Hard xmit error, driver for %s doesn't do queue-stopped quite right.\n", odev->name); + printk(KERN_INFO "pktgen: Transmit request will be retried, and this error msg will not be printed again..\n"); + do_once_hsx_wrn = 0; + } + atomic_dec(&(pkt_dev->skb->users)); - if (debug && net_ratelimit()) - printk(KERN_INFO "pktgen: Hard xmit error\n"); - + 
pkt_dev->queue_stopped++; pkt_dev->errors++; pkt_dev->last_ok = 0; - } - - pkt_dev->next_tx_us = getCurUs(); - pkt_dev->next_tx_ns = 0; - - pkt_dev->next_tx_us += pkt_dev->delay_us; - pkt_dev->next_tx_ns += pkt_dev->delay_ns; - if (pkt_dev->next_tx_ns > 1000) { - pkt_dev->next_tx_us++; - pkt_dev->next_tx_ns -= 1000; + /* Try a little later..flag us as wanting to tx, but unable. Will try again shortly. + */ + pkt_dev->tx_blocked = 1; + /* change tx time to now to show work was at least attempted. */ + pkt_dev->next_tx_ns = now; } } else { /* Retry it next time */ + pkt_dev->queue_stopped++; pkt_dev->last_ok = 0; - pkt_dev->next_tx_us = getCurUs(); /* TODO */ - pkt_dev->next_tx_ns = 0; + /* Try a little later..flag us as wanting to tx, but unable. Will try again shortly. + */ + pkt_dev->tx_blocked = 1; + /* change tx time to now to show work was at least attempted. */ + pkt_dev->next_tx_ns = now; } netif_tx_unlock_bh(odev); @@ -3436,14 +3786,14 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { if (atomic_read(&(pkt_dev->skb->users)) != 1) { - idle_start = getCurUs(); + idle_start = getRelativeCurNs(); while (atomic_read(&(pkt_dev->skb->users)) != 1) { if (signal_pending(current)) { break; } schedule(); } - pkt_dev->idle_acc += getCurUs() - idle_start; + pkt_dev->idle_acc_ns += getRelativeCurNs() - idle_start; } /* Done with this */ @@ -3467,7 +3817,9 @@ static int pktgen_thread_worker(void *arg) int cpu = t->cpu; u32 max_before_softirq; u32 tx_since_softirq = 0; - + u64 now; + u64 next_running_delay; + BUG_ON(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); @@ -3481,7 +3833,15 @@ static int pktgen_thread_worker(void *arg) set_freezable(); while (!kthread_should_stop()) { - pkt_dev = next_to_run(t); + find_best: + + if (t->control & T_WAKE_BLOCKED) { + pktgen_unblock_all_ifs(t); + t->control &= ~(T_WAKE_BLOCKED); + } + + now = 
getRelativeCurNs(); + pkt_dev = next_to_run(t, now, &next_running_delay); if (!pkt_dev && (t->control & (T_STOP | T_RUN | T_REMDEVALL | T_REMDEV)) @@ -3495,8 +3855,40 @@ static int pktgen_thread_worker(void *arg) __set_current_state(TASK_RUNNING); if (pkt_dev) { - - pktgen_xmit(pkt_dev); + if (pkt_dev->tx_blocked) { + /* If blocked for less than 1ms, then sleep for up to 1ms. If the + * device un-blocks, then we will be woken by the wait-queue callback. + */ + u64 tx_anyway_ns = (now - PG_TRY_TX_ANYWAY_NS); + if (pkt_dev->next_tx_ns > tx_anyway_ns) { + pg_nanodelay(min(next_running_delay, (u64)(PG_TRY_TX_ANYWAY_NS)), + pkt_dev); + /* Maybe things have changed since we went to sleep. */ + goto find_best; + } + } + + /* If the best to run should not run yet, then sleep (or accumulate sleep) */ + if (now < pkt_dev->next_tx_ns) { + /* spin(pkt_dev, pkt_dev->next_tx_us); */ + u64 next_ipg = pkt_dev->next_tx_ns - now; + + /* These will not actually busy-spin now. Will run as + * much as 1ms fast, and will sleep in 1ms units, assuming + * our tick is 1ms. 
+ */ + pg_nanodelay(next_ipg, pkt_dev); + now = getRelativeCurNs(); + if (pkt_dev->removal_mark || + (pkt_dev->pg_thread->control && T_STOP)) { + goto skip_tx; + } + } + + + pktgen_xmit(pkt_dev, now); + + skip_tx: /* * We like to stay RUNNING but must also give @@ -3522,6 +3914,11 @@ static int pktgen_thread_worker(void *arg) t->control &= ~(T_RUN); } + if (t->control & T_ADD_DEV) { + pktgen_add_device(t, (char*)(t->control_arg)); + t->control &= ~(T_ADD_DEV); + } + if (t->control & T_REMDEVALL) { pktgen_rem_all_ifs(t); t->control &= ~(T_REMDEVALL); @@ -3553,16 +3950,11 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char *ifname) { struct pktgen_dev *p, *pkt_dev = NULL; - if_lock(t); - list_for_each_entry(p, &t->if_list, list) if (strncmp(p->odev->name, ifname, IFNAMSIZ) == 0) { pkt_dev = p; break; } - - if_unlock(t); - pr_debug("pktgen: find_dev(%s) returning %p\n", ifname, pkt_dev); return pkt_dev; } @@ -3575,8 +3967,6 @@ static int add_dev_to_thread(struct pktgen_thread *t, { int rv = 0; - if_lock(t); - if (pkt_dev->pg_thread) { printk(KERN_ERR "pktgen: ERROR: already assigned " "to a thread.\n"); @@ -3589,12 +3979,9 @@ static int add_dev_to_thread(struct pktgen_thread *t, pkt_dev->running = 0; out: - if_unlock(t); return rv; } -/* Called under thread lock */ - static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) { struct pktgen_dev *pkt_dev; @@ -3607,7 +3994,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) printk(KERN_ERR "pktgen: ERROR: interface already used.\n"); return -EBUSY; } - + else { + printk("pktgen: Attempting to add device: %s\n", ifname); + } + pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL); if (!pkt_dev) return -ENOMEM; @@ -3624,8 +4014,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->max_pkt_size = ETH_ZLEN; pkt_dev->nfrags = 0; pkt_dev->clone_skb = pg_clone_skb_d; - pkt_dev->delay_us = pg_delay_d / 1000; - 
pkt_dev->delay_ns = pg_delay_d % 1000; + pkt_dev->delay_ns = pg_delay_d; pkt_dev->count = pg_count_d; pkt_dev->sofar = 0; pkt_dev->udp_src_min = 9; /* sink port */ @@ -3639,8 +4028,9 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_p = 0; pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; - - err = pktgen_setup_dev(pkt_dev, ifname); + strncpy(pkt_dev->ifname, ifname, sizeof(pkt_dev->ifname)); + + err = pktgen_setup_dev(pkt_dev, t); if (err) goto out1; @@ -3684,7 +4074,6 @@ static int __init pktgen_create_thread(int cpu) return -ENOMEM; } - spin_lock_init(&t->if_lock); t->cpu = cpu; INIT_LIST_HEAD(&t->if_list); @@ -3751,6 +4140,14 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Dis-associate from the interface */ if (pkt_dev->odev) { + +#ifdef USE_NQW_CALLBACK + /* Set the nqw callback hooks */ + rtnl_lock(); + clear_nqw_hook(t, pkt_dev->odev); + rtnl_unlock(); +#endif + pkt_dev->odev->pkt_dev = NULL; dev_put(pkt_dev->odev); pkt_dev->odev = NULL; } @@ -3777,6 +4174,15 @@ static int __init pg_init(void) struct proc_dir_entry *pe; printk(KERN_INFO "%s", version); + printk(KERN_INFO "sizeof report: %d, sizeof in6_addr: %d\n", + (int)(sizeof(struct pktgen_dev_report)), + (int)(sizeof(struct in6_addr))); + + if (handle_pktgen_hook) { + printk(KERN_ERR "pktgen: ERROR: pktgen is already loaded it seems..\n"); + /* Already loaded */ + return -EEXIST; + } pg_proc_dir = proc_mkdir(PG_PROC_DIR, proc_net); if (!pg_proc_dir) @@ -3815,6 +4221,9 @@ static int __init pg_init(void) return -ENODEV; } + handle_pktgen_hook = pktgen_receive; + pr_debug("pktgen initialization complete.\n"); + return 0; } @@ -3837,8 +4246,11 @@ static void __exit pg_cleanup(void) unregister_netdevice_notifier(&pktgen_notifier_block); /* Clean up proc file system */ + pr_debug("pktgen: removing proc entry: %s (0x%p)\n", PGCTRL, pg_proc_dir); remove_proc_entry(PGCTRL, pg_proc_dir); proc_net_remove(PG_PROC_DIR); + + handle_pktgen_hook = NULL; } 
module_init(pg_init); diff --git a/net/core/pktgen.h b/net/core/pktgen.h new file mode 100644 index 0000000..90e4379 --- /dev/null +++ b/net/core/pktgen.h @@ -0,0 +1,378 @@ +/* -*-linux-c-*- + * $Id: candela_2.6.13.patch,v 1.3 2005/09/30 04:45:31 greear Exp $ + * pktgen.c: Packet Generator for performance evaluation. + * + * See pktgen.c for details of changes, etc. +*/ + + +#ifndef PKTGEN_H_INCLUDE_KERNEL__ +#define PKTGEN_H_INCLUDE_KERNEL__ + +#include +#include + +/* The buckets are exponential in 'width' */ +#define LAT_BUCKETS_MAX 32 +#define PG_MAX_ACCUM_DELAY_NS 1000000 /* one ms */ +#define PG_TRY_TX_ANYWAY_NS 1000000 /* try a blocked tx queue after 1 ms. */ + +#define IP_NAME_SZ 32 +#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ +#define MPLS_STACK_BOTTOM __constant_htonl(0x00000100) + +/* Device flag bits */ +#define F_IPSRC_RND (1<<0) /* IP-Src Random */ +#define F_IPDST_RND (1<<1) /* IP-Dst Random */ +#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ +#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ +#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ +#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ +#define F_TXSIZE_RND (1<<6) /* Transmit packet size is random */ +#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ +#define F_MPLS_RND (1<<8) /* Random MPLS labels */ +#define F_VID_RND (1<<9) /* Random VLAN ID */ +#define F_SVID_RND (1<<10) /* Random SVLAN ID */ +#define F_FLOW_SEQ (1<<11) /* Sequential flows */ +#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ + +/* Thread control flag bits */ +#define T_TERMINATE (1<<0) +#define T_STOP (1<<1) /* Stop run */ +#define T_RUN (1<<2) /* Start run */ +#define T_REMDEVALL (1<<3) /* Remove all devs */ +#define T_REMDEV (1<<4) /* Remove one dev */ +#define T_WAKE_BLOCKED (1<<5) /* Wake up all blocked net-devices. */ +#define T_ADD_DEV (1<<6) /* Add a device. 
*/ + +/* Used to help with determining the pkts on receive */ +#define PKTGEN_MAGIC 0xbe9be955 +#define PG_PROC_DIR "pktgen" +#define PGCTRL "pgctrl" + +#define MAX_CFLOWS 65536 + +#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) +#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4) + +struct flow_state { + __be32 cur_daddr; + int count; +#ifdef CONFIG_XFRM + struct xfrm_state *x; +#endif + __u32 flags; +}; + +/* flow flag bits */ +#define F_INIT (1<<0) /* flow has been initialized */ + +struct pktgen_dev { + + /* + * Try to keep frequent/infrequent used vars. separated. + */ + char ifname[IFNAMSIZ]; + char result[512]; + + struct proc_dir_entry *entry; /* proc file */ + struct pktgen_thread *pg_thread; /* the owner */ + struct list_head list; /* Used for chaining in the thread's run-queue */ + + int running; /* if this changes to false, the test will stop */ + + /* If min != max, then we will either do a linear iteration, or + * we will do a random selection from within the range. + */ + __u32 flags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread */ + + __u32 min_pkt_size; /* = ETH_ZLEN; */ + __u32 max_pkt_size; /* = ETH_ZLEN; */ + int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ + __u32 nfrags; + __u64 delay_ns; /* Delay this much between sending packets. */ + __u64 count; /* Default No packets to send */ + __u64 sofar; /* How many pkts we've sent so far */ + __u64 tx_bytes; /* How many bytes we've transmitted */ + __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */ + __u64 nanodelays; /* how many times have we called nano-delay on this device? */ + __s64 accum_delay_ns; /* Accumulated delay..when >= 1ms, we'll sleep on a wait queue. */ + __u64 sleeps; /* How many times have we gone to sleep on the wait queue. */ + __u64 queue_stopped; /* How many times was queue stopped when we tried to xmit? 
*/ + /* runtime counters relating to clone_skb */ + __u64 next_tx_ns; /* timestamp of when to tx next */ + __u64 req_tx_early; /* requested to tx, but is too early for us to tx. */ + + __u64 oom_on_alloc_skb; + __u64 allocated_skbs; + __u32 clone_count; + + int tx_blocked; /* Need to tx as soon as able... */ + int last_ok; /* Was last skb sent? + * Or a failed transmit of some sort? This will keep + * sequence numbers in order, for example. + */ + __u64 started_at; /* micro-seconds */ + __u64 stopped_at; /* micro-seconds */ + __u64 idle_acc_ns; /* nano-seconds */ + __u32 seq_num; + + __u32 clone_skb; /* Use multiple SKBs during packet gen. If this number + * is greater than 1, then that many copies of the same + * packet will be sent before a new packet is allocated. + * For instance, if you want to send 1024 identical packets + * before creating a new packet, set clone_skb to 1024. + */ + __u32 peer_clone_skb; /* Peer (transmitter's) clone setting. */ + + char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + + struct in6_addr in6_saddr; + struct in6_addr in6_daddr; + struct in6_addr cur_in6_daddr; + struct in6_addr cur_in6_saddr; + /* For ranges */ + struct in6_addr min_in6_daddr; + struct in6_addr max_in6_daddr; + struct in6_addr min_in6_saddr; + struct in6_addr max_in6_saddr; + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. 
+ */ + __u32 saddr_min; /* inclusive, source IP address */ + __u32 saddr_max; /* exclusive, source IP address */ + __u32 daddr_min; /* inclusive, dest IP address */ + __u32 daddr_max; /* exclusive, dest IP address */ + + __u16 udp_src_min; /* inclusive, source UDP port */ + __u16 udp_src_max; /* exclusive, source UDP port */ + __u16 udp_dst_min; /* inclusive, dest UDP port */ + __u16 udp_dst_max; /* exclusive, dest UDP port */ + + /* DSCP + ECN */ + __u8 tos; /* six most significant bits of (former) IPv4 TOS are for dscp codepoint */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 4) */ + + /* MPLS */ + unsigned nr_labels; /* Depth of stack, 0 = no MPLS */ + __be32 labels[MAX_MPLS_LABELS]; + + + /* VLAN/SVLAN (802.1Q/Q-in-Q) */ + __u8 vlan_p; + __u8 vlan_cfi; + __u16 vlan_id; /* 0xffff means no vlan tag */ + + __u8 svlan_p; + __u8 svlan_cfi; + __u16 svlan_id; /* 0xffff means no svlan tag */ + + + __u32 src_mac_count; /* How many MACs to iterate through */ + __u32 dst_mac_count; /* How many MACs to iterate through */ + + unsigned char dst_mac[ETH_ALEN]; + unsigned char src_mac[ETH_ALEN]; + + __u32 cur_dst_mac_offset; + __u32 cur_src_mac_offset; + __u32 cur_saddr; + __u32 cur_daddr; + __u16 cur_udp_dst; + __u16 cur_udp_src; + __u32 cur_pkt_size; + + __u8 hh[14]; + /* = { + 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + + We fill in SRC address later + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00 + }; + */ + __u16 pad; /* pad out the hh struct to an even 16 bytes */ + + struct sk_buff *skb; /* skb we are to transmit next, mainly used for when we + * are transmitting the same one multiple times + */ + struct net_device *odev; /* The out-going device. Note that the device should + * have it's pg_info pointer pointing back to this + * device. This will be set when the user specifies + * the out-going device name (not when the inject is + * started as it used to do.) 
+ */ + struct flow_state *flows; + unsigned cflows; /* Concurrent flows (config) */ + unsigned lflow; /* Flow length (config) */ + unsigned nflows; /* accumulated flows (stats) */ + unsigned curfl; /* current sequenced flow (state)*/ +#ifdef CONFIG_XFRM + __u8 ipsmode; /* IPSEC mode (config) */ + __u8 ipsproto; /* IPSEC type (config) */ +#endif + + int avg_latency; /* in micro-seconds */ + int min_latency; + int max_latency; + __u64 latency_bkts[LAT_BUCKETS_MAX]; + __u64 pkts_rcvd_since_clear; /* with regard to clearing/resetting the latency logic */ + + + /* Fields relating to receiving pkts */ + __u32 last_seq_rcvd; + __u64 ooo_rcvd; /* out-of-order packets received */ + __u64 pkts_rcvd; /* packets received */ + __u64 dup_rcvd; /* duplicate packets received */ + __u64 bytes_rcvd; /* total bytes received, as obtained from the skb */ + __u64 seq_gap_rcvd; /* how many gaps we received. This coorelates to + * dropped pkts, except perhaps in cases where we also + * have re-ordered pkts. In that case, you have to tie-break + * by looking at send v/s received pkt totals for the interfaces + * involved. + */ + __u64 non_pg_pkts_rcvd; /* Count how many non-pktgen skb's we are sent to check. */ + __u64 dup_since_incr; /* How many dumplicates since the last seq number increment, + * used to detect gaps when multiskb > 1 + */ +}; + +struct pktgen_hdr { + __u32 pgh_magic; + __u32 seq_num; + __u32 tv_sec; + __u32 tv_usec; +}; + +struct pktgen_thread { + struct list_head if_list; /* All device here */ + struct list_head th_list; + struct task_struct* tsk; + int removed; + char result[512]; + u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */ + + /* Field for thread to receive "posted" events terminate, stop ifs etc. 
*/ + + u32 control; + char* control_arg; + int pid; + int cpu; + + wait_queue_head_t queue; +}; + +struct pg_nqw_data { + #define PG_NQW_MAGIC 0x82743ab6 + u32 magic; + atomic_t nqw_ref_count; + struct pktgen_thread* pg_thread; +}; + +struct pktgen_dev_report { + __u32 api_version; + __u32 flags; + __u32 min_pkt_size; + __u32 max_pkt_size; + __u32 nfrags; + + __u32 clone_skb; /* Use multiple SKBs during packet gen. If this number + * is greater than 1, then that many copies of the same + * packet will be sent before a new packet is allocated. + * For instance, if you want to send 1024 identical packets + * before creating a new packet, set clone_skb to 1024. + */ + __u32 peer_clone_skb; /* Peer (transmitter's) clone setting. */ + __s32 avg_latency; /* in micro-seconds */ + __s32 min_latency; + __s32 max_latency; + + char thread_name[32]; + char interface_name[32]; + char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + unsigned char dst_mac[ETH_ALEN]; + unsigned char src_mac[ETH_ALEN]; + __u32 pad_32; /* pad to 8-byte boundary */ + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. 
+ */ + __u32 saddr_min; /* inclusive, source IP address */ + __u32 saddr_max; /* exclusive, source IP address */ + __u32 daddr_min; /* inclusive, dest IP address */ + __u32 daddr_max; /* exclusive, dest IP address */ + + __u16 udp_src_min; /* inclusive, source UDP port */ + __u16 udp_src_max; /* exclusive, source UDP port */ + __u16 udp_dst_min; /* inclusive, dest UDP port */ + __u16 udp_dst_max; /* exclusive, dest UDP port */ + + /* MPLS */ + __u32 nr_labels; /* Depth of stack, 0 = no MPLS */ + __be32 labels[MAX_MPLS_LABELS]; + + __u32 src_mac_count; /* How many MACs to iterate through */ + __u32 dst_mac_count; /* How many MACs to iterate through */ + + __u64 nflows; /* accumulated flows (stats) */ + __u32 cflows; /* Concurrent flows (config) */ + __u32 lflow; /* Flow length (config) */ + + __u64 delay_ns; /* Delay this much between sending packets. */ + __u64 count; /* Default No packets to send */ + __u64 sofar; /* How many pkts we've sent so far */ + __u64 tx_bytes; /* How many bytes we've transmitted */ + __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */ + __u64 latency_bkts[LAT_BUCKETS_MAX]; + __u64 pkts_rcvd_since_clear; /* with regard to clearing/resetting the latency logic */ + + /* Fields relating to receiving pkts */ + __u64 ooo_rcvd; /* out-of-order packets received */ + __u64 pkts_rcvd; /* packets received */ + __u64 dup_rcvd; /* duplicate packets received */ + __u64 bytes_rcvd; /* total bytes received, as obtained from the skb */ + __u64 seq_gap_rcvd; /* how many gaps we received. This coorelates to + * dropped pkts, except perhaps in cases where we also + * have re-ordered pkts. In that case, you have to tie-break + * by looking at send v/s received pkt totals for the interfaces + * involved. + */ + __u64 non_pg_pkts_rcvd; /* Count how many non-pktgen skb's we are sent to check. 
*/ + + struct in6_addr in6_saddr; + struct in6_addr in6_daddr; + /* For ranges */ + struct in6_addr min_in6_daddr; + struct in6_addr max_in6_daddr; + struct in6_addr min_in6_saddr; + struct in6_addr max_in6_saddr; + + char future_use[256]; /* Give us some room for growth w/out changing structure size */ +} __attribute__((__packed__)); + +/* Define some IOCTLs. Just picking random numbers, basically. */ +#define GET_PKTGEN_INTERFACE_INFO 0x7450 +struct pktgen_ioctl_info { + char thread_name[32]; + char interface_name[32]; + struct pktgen_dev_report report; +}; + + +/* Defined in dev.c */ +extern int (*handle_pktgen_hook)(struct sk_buff *skb); + +/* Returns < 0 if the skb is not a pktgen buffer. */ +int pktgen_receive(struct sk_buff* skb); + + +#endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 35021eb..a079db7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -424,6 +424,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) C(ipvs_property); #endif + C(use_specified_ether_crc); C(protocol); n->destructor = NULL; C(mark); @@ -496,6 +497,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif + new->use_specified_ether_crc = old->use_specified_ether_crc; #ifdef CONFIG_NET_SCHED #ifdef CONFIG_NET_CLS_ACT new->tc_verd = old->tc_verd; diff --git a/net/core/sock.c b/net/core/sock.c index 190de61..1448fd3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -600,6 +600,19 @@ set_rcvbuf: sock_warn_obsolete_bsdism("setsockopt"); break; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + case SO_NOFCS: + /* printk("SO_NOFCS, valbool: %d, sk: %p\n", + (int)(valbool), sk); */ + if (valbool) { + sk->sk_flags |= SOCK_DONT_DO_LL_FCS; + } + else { + sk->sk_flags &= ~(SOCK_DONT_DO_LL_FCS); + } + break; +#endif + case SO_PASSCRED: if (valbool) set_bit(SOCK_PASSCRED, 
&sock->flags); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 652da8e..ce10f06 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -415,6 +415,28 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev, return !inet_confirm_addr(dev, sip, tip, scope); } + +static int is_ip_on_dev(struct net_device* dev, __u32 ip) { + int rv = 0; + struct in_device* in_dev = in_dev_get(dev); + if (in_dev) { + struct in_ifaddr *ifa; + + rcu_read_lock(); + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_address == ip) { + /* match */ + rv = 1; + break; + } + } + rcu_read_unlock(); + in_dev_put(in_dev); + } + return rv; +} + + static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip, @@ -426,8 +448,38 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) if (ip_route_output_key(&rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { - NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); - flag = 1; + struct in_device *in_dev = in_dev_get(dev); + if (in_dev && IN_DEV_ACCEPT_STS(in_dev) && + (rt->u.dst.dev == &loopback_dev)) { + /* Accept these IFF target-ip == dev's IP */ + /* TODO: Need to force the ARP response back out the interface + * instead of letting it route locally. + */ + + if (is_ip_on_dev(dev, tip)) { + /* OK, we'll let this special case slide, so that we can + * arp from one local interface to another. This seems + * to work, but could use some review. 
--Ben + */ + /*printk("arp_filter, sip: %x tip: %x dev: %s, STS override (ip on dev)\n", + sip, tip, dev->name);*/ + } + else { + /*printk("arp_filter, sip: %x tip: %x dev: %s, IP is NOT on dev\n", + sip, tip, dev->name);*/ + NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); + flag = 1; + } + } + else { + /*printk("arp_filter, not lpbk sip: %x tip: %x dev: %s flgs: %hx dst.dev: %p lbk: %p\n", + sip, tip, dev->name, dev->priv_flags, rt->u.dst.dev, &loopback_dev);*/ + NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); + flag = 1; + } + if (in_dev) { + in_dev_put(in_dev); + } } ip_rt_put(rt); return flag; @@ -755,8 +807,8 @@ static int arp_process(struct sk_buff *skb) break; } - /* Understand only these message types */ + /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5ccc2d1..3a9a767 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1454,6 +1454,8 @@ static struct devinet_sysctl_table { "force_igmp_version"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_STS, + "accept_sts"), }, .devinet_dev = { { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index eff6bce..2580fa7 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -206,8 +206,16 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (fib_lookup(&fl, &res)) goto last_resort; - if (res.type != RTN_UNICAST) - goto e_inval_res; + if (res.type != RTN_UNICAST) { + if ((res.type == RTN_LOCAL) && + (IN_DEV_ACCEPT_STS(in_dev))) { + /* All is OK */ + } + else { + goto e_inval_res; + } + } + *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2a94784..2bd40b9 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -72,6 +72,7 @@ static struct fib4_rule local_rule = { .refcnt 
= ATOMIC_INIT(2), .table = RT_TABLE_LOCAL, .action = FR_ACT_TO_TBL, + .pref = 0x100, .flags = FIB_RULE_PERMANENT, }, }; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index f813e02..604fe53 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -51,9 +51,10 @@ static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, static int ipv4_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { - return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u mark=%u ", NIPQUAD(tuple->src.u3.ip), - NIPQUAD(tuple->dst.u3.ip)); + NIPQUAD(tuple->dst.u3.ip), + tuple->mark); } static int ipv4_print_conntrack(struct seq_file *s, diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 9731d2c..9e1ecd2 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -78,7 +78,7 @@ hash_by_src(const struct nf_conntrack_tuple *tuple) { /* Original src, to ensure we map it consistently if poss. 
*/ return jhash_3words((__force u32)tuple->src.u3.ip, - (__force u32)tuple->src.u.all, + (__force u32)tuple->src.u.all ^ tuple->mark, tuple->dst.protonum, 0) % nf_nat_htable_size; } @@ -136,7 +136,8 @@ same_src(const struct nf_conn *ct, t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; return (t->dst.protonum == tuple->dst.protonum && t->src.u3.ip == tuple->src.u3.ip && - t->src.u.all == tuple->src.u.all); + t->src.u.all == tuple->src.u.all && + t->mark == tuple->mark); } /* Only called for SRC manip */ @@ -209,7 +210,7 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple, minip = ntohl(range->min_ip); maxip = ntohl(range->max_ip); j = jhash_2words((__force u32)tuple->src.u3.ip, - (__force u32)tuple->dst.u3.ip, 0); + (__force u32)tuple->dst.u3.ip ^ tuple->mark, 0); *var_ipp = htonl(minip + j % (maxip - minip + 1)); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e089a97..69e22fe 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -612,6 +612,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->saddr, /* XXX */ sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; + arg.bound_dev_if = ((struct rtable *)skb->dst)->fl.iif; ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 097165f..558a8ff 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -246,7 +246,7 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. 
*/ - new_win = cur_win; + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e9b151b..00e34de 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -80,7 +80,9 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (tcp_too_many_orphans(sk, orphans)) { if (net_ratelimit()) - printk(KERN_INFO "Out of socket memory\n"); + printk(KERN_INFO "Out of socket memory, orphans: %d/%d tcp_memory_allocated: %d/%d\n", + orphans, sysctl_tcp_max_orphans, atomic_read(&tcp_memory_allocated), + sysctl_tcp_mem[2]); /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 3153e15..dc28ca0 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -54,9 +54,10 @@ static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, static int ipv6_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { - return seq_printf(s, "src=" NIP6_FMT " dst=" NIP6_FMT " ", + return seq_printf(s, "src=" NIP6_FMT " dst=" NIP6_FMT " mark=%u ", NIP6(*((struct in6_addr *)tuple->src.u3.ip6)), - NIP6(*((struct in6_addr *)tuple->dst.u3.ip6))); + NIP6(*((struct in6_addr *)tuple->dst.u3.ip6)), + tuple->mark); } static int ipv6_print_conntrack(struct seq_file *s, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0fe1188..cb8fe23 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -82,7 +82,7 @@ static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, ((__force __u16)tuple->src.u.all << 16) | (__force __u16)tuple->dst.u.all); - return jhash_2words(a, b, rnd) % size; + return jhash_2words(a, b, tuple->mark ^ rnd) % size; } static inline u_int32_t 
hash_conntrack(const struct nf_conntrack_tuple *tuple) @@ -109,6 +109,7 @@ nf_ct_get_tuple(const struct sk_buff *skb, tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; + tuple->mark = skb->mark; return l4proto->pkt_to_tuple(skb, dataoff, tuple); } @@ -157,8 +158,8 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, return 0; inverse->dst.dir = !orig->dst.dir; - inverse->dst.protonum = orig->dst.protonum; + inverse->mark = orig->mark; return l4proto->invert_tuple(inverse, orig); } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 3ac64e2..813b7dd 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -220,6 +220,7 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) return NULL; new->master = me; + new->tuple.mark = me->tuplehash[IP_CT_DIR_ORIGINAL].tuple.mark; atomic_set(&new->use, 1); return new; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1322d62..ff410d5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -78,6 +78,7 @@ #include #include #include +#include #ifdef CONFIG_INET #include @@ -323,7 +324,14 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, struct net_device *dev; __be16 proto=0; int err; + int kludge = 0; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (sk->sk_flags & SOCK_DONT_DO_LL_FCS) { + kludge = 4; // We're doing our own CRC + } +#endif + /* * Get and verify the address. */ @@ -343,7 +351,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, */ saddr->spkt_device[13] = 0; - dev = dev_get_by_name(saddr->spkt_device); + dev = dev_get_by_name(saddr->spkt_device); /* DAMN, we aught to hash this! 
*/ err = -ENODEV; if (dev == NULL) goto out_unlock; @@ -358,7 +366,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, */ err = -EMSGSIZE; - if (len > dev->mtu + dev->hard_header_len) + if (len > (dev->mtu + dev->hard_header_len + kludge)) goto out_unlock; err = -ENOBUFS; @@ -400,6 +408,16 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, if (err) goto out_free; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (sk->sk_flags & SOCK_DONT_DO_LL_FCS) { + skb->use_specified_ether_crc = 1; + } + else { + skb->use_specified_ether_crc = 0; + } +#endif + + /* * Now send it */ @@ -710,6 +728,13 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, __be16 proto; unsigned char *addr; int ifindex, err, reserve = 0; + int kludge = 0; + +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (sk->sk_flags & SOCK_DONT_DO_LL_FCS) { + kludge = 4; // We're doing our own CRC + } +#endif /* * Get and verify the address. @@ -745,7 +770,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, goto out_unlock; err = -EMSGSIZE; - if (len > dev->mtu+reserve) + if (len > (dev->mtu + reserve + kludge)) goto out_unlock; skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev), @@ -776,6 +801,15 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, skb->dev = dev; skb->priority = sk->sk_priority; +#ifdef CONFIG_SUPPORT_SEND_BAD_CRC + if (sk->sk_flags & SOCK_DONT_DO_LL_FCS) { + skb->use_specified_ether_crc = 1; + } + else { + skb->use_specified_ether_crc = 0; + } +#endif + /* * Now send it */ diff --git a/net/redir/Kconfig b/net/redir/Kconfig new file mode 100644 index 0000000..3abfbe1 --- /dev/null +++ b/net/redir/Kconfig @@ -0,0 +1,7 @@ +config REDIRDEV + tristate "Redirect-net-device support" + depends on EXPERIMENTAL + ---help--- + This allows one to create virtual interfaces that effectively + swap tx for rx, allowing one to create bridges and similar + constructs all in the same machine. 
diff --git a/net/redir/Makefile b/net/redir/Makefile new file mode 100644 index 0000000..70d4dcb --- /dev/null +++ b/net/redir/Makefile @@ -0,0 +1,10 @@ +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +obj-$(CONFIG_REDIRDEV) := redirdev.o + + diff --git a/net/redir/redirdev.c b/net/redir/redirdev.c new file mode 100644 index 0000000..4e287f6 --- /dev/null +++ b/net/redir/redirdev.c @@ -0,0 +1,975 @@ +/* -*- linux-c -*- +####################################################################### +# +# (C) Copyright 2005 +# Ben Greear +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of +# the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA +####################################################################### +# Notes: +# +# This file implements the Redirect-net-device module. A pair of +# redir devices linked to each other act like two ethernet interfaces +# connected with a cross-over cable. +# +# This provides an IOCTL interface which allows you to +# It uses an IOCTL interface which allows you to +# +# 1. create redirect device +# 2. 
delete redirect device +# +####################################################################### +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS +#include +#define RDD_PROC_DIR "redirdev" +#define RDD_PROC_CFG "config" +static struct proc_dir_entry *rdd_proc_dir; +static struct proc_dir_entry *rdd_proc_cfg; +#endif + +#include "redirdev.h" + +/* Defined in socket.c */ +void redirdev_ioctl_set(int (*hook)(void*)); +static int redirdev_device_event(struct notifier_block *unused, + unsigned long event, void *ptr); + +static struct notifier_block redirdev_notifier_block = { + .notifier_call = redirdev_device_event, +}; + +/*********************************************************/ +/* defines */ +/*********************************************************/ + +/* Must hold this lock to make any changes to the Redirect-Device structures. + */ +static spinlock_t rdd_cfg_lock = SPIN_LOCK_UNLOCKED; + +static int do_quotas = 1; /* global enable/disable for quota logic. 
*/ + + +/*********************************************************/ +/* file scope variables */ +/*********************************************************/ + +static struct redirdev* rdds = NULL; + +static atomic_t rdd_dev_counter; + +static int debug_lvl = 0; + + +/*********************************************************/ +/* forward declarations */ +/*********************************************************/ + +#ifdef RDD_CONFIG_PROC_FS +static int read_rdd_glbl(char *page, char **start, off_t off, + int count, int *eof, void *data); +static int write_rdd_glbl(struct file *file, const char *buffer, + unsigned long count, void *data); +#endif + + + +/*********************************************************/ +/* function definitions */ +/*********************************************************/ + + +#define iswhitespace(x)\ + ((x) == ' ' || (x) == '\n' || (x) == '\r' || (x) == '\r' ) + +#define skip_whitespace(x) { while (iswhitespace(*x)) (x)++; } + +static int copy_next_word(char *dst, char *src, int len) { + char *p; + for (p=src; p < src + len ; p++) { + if ( iswhitespace(*p)) + break; + *dst++ = *p; + } + return p - src; +} + +/* Grab the RDD lock before calling this method. */ +struct redirdev* rdd_find_dev_by_name(const char* ifname) { + struct redirdev* d; + //printk("finding port for underlying ifname: %s\n", ifname); + for (d = rdds; d; d = d->next) { + //printk("Testing port: %p name: %s\n", port, port->dev->name); + if (strcmp(d->dev->name, ifname) == 0) { + break; + } + } + //printk("done finding port: %p\n", port); + return d; +} + +/* Grab the RDD lock before calling this method. 
*/ +struct redirdev* rdd_find_dev_by_txdev_name(const char* ifname) { + struct redirdev* d; + for (d = rdds; d; d = d->next) { + if (d->tx_dev) { + if (strcmp(d->tx_dev->name, ifname) == 0) { + break; + } + } + } + return d; +} + + +static struct net_device_stats *redirdev_get_stats(struct net_device *dev) +{ + struct redirdev* rdd = dev->priv; + + return &rdd->statistics; +} + +/** Bump our tx counters and then act as if this was received from + * the network on the tx_dev device. Since we don't do any CSUM + * activity in this driver, make sure SKB as marked as not checksummed + * yet. + */ +static int redirdev_xmit(struct sk_buff *skb, struct net_device *dev) { + struct redirdev* rdd = dev->priv; + struct net_device_stats* txs; + + if (unlikely((!rdd->tx_dev) && rdd->wants_to_run)) { + rdd->tx_dev = dev_get_by_name(rdd->tx_dev_name); + if (rdd->tx_dev) { + printk("redir: Associated tx_dev_name: %s with device: %p in redirdev_xmit\n", + rdd->tx_dev_name, rdd->tx_dev); + } + } + + if (unlikely(!rdd->tx_dev)) { + printk("ERROR: tx_dev null in redirdev_xmit.\n"); + kfree_skb(skb); + rdd->statistics.tx_errors++; + goto out; + } + + skb_orphan(skb); /* release this skb from the sending socket's accounting. */ + + //printk("%s: dev: %s tx_dev: %s\n", + // __PRETTY_FUNCTION__, dev->name, rdd->tx_dev->name); + + if (netif_running(rdd->tx_dev)) { + + /* We need to free the old skb so that the socket + * account works correctly. We'll make a copy and + * then forward that to the other device. + */ + + int rv; + skb->dev = rdd->tx_dev; + + /* We didn't calculate the csum, so mark as such. */ + skb->ip_summed = CHECKSUM_UNNECESSARY;//NONE; + + rdd->statistics.tx_packets++; + rdd->statistics.tx_bytes += skb->len; + + txs = rdd->tx_dev->get_stats(rdd->tx_dev); + txs->rx_packets++; + txs->rx_bytes += skb->len; + + /* Zero out the time-stamp so that receiving code is forced + * to recalculate it. + */ + skb->tstamp.tv64 = 0; + + /* Call this on the receiving net device. 
This assumes + * that all devices are ethernet or ethernet-like. Valid + * for now. TODO: Generalize tx_dev ?? + */ + skb->pkt_type = PACKET_HOST; //Reset this to default. + + skb->protocol = eth_type_trans(skb, skb->dev); + + if (skb->dst) { + dst_release(skb->dst); + skb->dst = NULL; + } + + /* Remove any connection tracking info */ + nf_reset(skb); + /* Clear skb->mark */ + skb->mark = 0; + + rdd->dev->trans_start = jiffies; + + //printk("skb->protocol: %x pkt_type: %u\n", + // (unsigned int)(skb->protocol), + // (unsigned int)(skb->pkt_type)); + rv = netif_rx(skb); + if (rv != NET_RX_SUCCESS) { + // TODO: Remove + //printk("netif_rx rv: %i\n", (int)(rv)); + } + + rdd->tx_dev->last_rx = jiffies; + } + else { + /* Chunk the packet and log some errors */ + rdd->statistics.tx_errors++; + kfree_skb(skb); + } + +out: + /* -1 means ignore, and -2 means ignore sets as well. This is to + * disable logic that uses this code w/out the calling code knowing + * Used for debugging. + */ + if (do_quotas && (rdd->quota != 0xFFFFFFFF)) { + if (rdd->quota > 0) { + rdd->quota--; + } + if (rdd->quota == 0) { + // Stop the tx-queue + netif_stop_queue(dev); + } + } + + return 0; +}/* redir xmit */ + +static int redirdev_open(struct net_device *dev) { + struct redirdev* rdd = dev->priv; + rdd->wants_to_run = 1; + if (!rdd->tx_dev) { + rdd->tx_dev = dev_get_by_name(rdd->tx_dev_name); + } + if (!rdd->tx_dev) { + printk("redir: %s Warning: Could not find tx_dev: %s, will try later in redirdev_xmit.\n", + dev->name, rdd->tx_dev_name); + } + + printk("redirdev: Starting device: %s\n", dev->name); + netif_start_queue(dev); + return 0; +} + +//static void redirdev_set_multicast_list(struct net_device *dev) { + /* TODO ??? 
*/ +//} + +static int redirdev_stop(struct net_device *dev) { + struct redirdev* rdd = dev->priv; + printk("redirdev: stopping device: %s\n", dev->name); + netif_stop_queue(dev); + rdd->wants_to_run = 0; + if (rdd->tx_dev) { + struct net_device* tmp = rdd->tx_dev; + rdd->tx_dev = NULL; + printk(" releasing reference to dev: %s\n", tmp->name); + dev_put(tmp); + } + printk(" done stopping %s\n", dev->name); + return 0; +} + + +void redirdev_dev_destructor(struct net_device *dev) { + atomic_dec(&rdd_dev_counter); + if (dev->priv) { + //printk("dst: %s", dev->name); + kfree(dev->priv); + dev->priv = NULL; + } + else { + //printk("dst2: %s", dev->name); + } +} + +int redirdev_change_mtu(struct net_device *dev, int new_mtu) { + dev->mtu = new_mtu; + return 0; +} + +static int redirdev_create(const char* newifname, + const char* txdevname) { + struct redirdev *rdd = NULL; + struct net_device* td = NULL; + struct net_device* nnd = NULL; + struct net_device* txd = NULL; + unsigned long flags; + int rv; + + if ((strlen(txdevname) == 0) || + (strlen(newifname) == 0)) { + printk("redirdev: ERROR: Must specify ifname and txifname" + " when creating redirect devices!\n"); + rv = -ENODEV; + goto out; + } + + printk("redirdev: creating interface: -:%s:- with tx_dev: -:%s:-\n", + newifname, txdevname); + + + //printk("malloc "); + if ((rdd = kmalloc(sizeof(*rdd), GFP_KERNEL)) == NULL) { + //printk("redirdev: kmalloc failure\n"); + rv = -ENOMEM; + goto outfree; + } + memset(rdd, 0, sizeof(*rdd)); + rdd->quota = 0xFFFFFFFF; // Default to not use quota. 
+ + //printk("4 "); + if ((nnd = kmalloc(sizeof(struct net_device), GFP_KERNEL)) == NULL) { + //printk("redirdev: kmalloc net_device failure\n"); + rv = -ENOMEM; + goto outfree; + } + memset(nnd, 0, sizeof(struct net_device)); + + if ((td = dev_get_by_name(newifname)) != NULL) { + //printk("redirdev: device by that name already exists\n"); + rv = -EEXIST; + goto outfree; + } + + /* If it's not here yet, no problem, will associate later */ + txd = dev_get_by_name(txdevname); + strncpy(rdd->tx_dev_name, txdevname, IFNAMSIZ); + + //printk("4 "); + rdd->dev = nnd; + + //printk("5 "); + strncpy(rdd->dev->name, newifname, IFNAMSIZ-1); + rdd->dev->name[IFNAMSIZ-1] = 0; //Ensure null termination. + ether_setup(rdd->dev); + + dev_hold(rdd->dev); /* RDD code holds reference */ + + rdd->dev->priv = rdd; + rdd->tx_dev = txd; + + //printk("6 "); + rdd->dev->get_stats = redirdev_get_stats; + rdd->dev->hard_start_xmit = redirdev_xmit; + rdd->dev->change_mtu = redirdev_change_mtu; + rdd->dev->open = redirdev_open; + rdd->dev->stop = redirdev_stop; + rdd->dev->destructor = redirdev_dev_destructor; + + // Defaults are fine for these + //rdd->dev->rebuild_header = redirdev_dev_rebuild_header; + //rdd->dev->set_multicast_list = redirdev_set_multicast_list; + //rdd->dev->hard_header = redirdev_hard_header; + + rdd->dev->dev_addr[0] = 0; + rdd->dev->dev_addr[1] = net_random(); + rdd->dev->dev_addr[2] = net_random(); + rdd->dev->dev_addr[3] = net_random(); + rdd->dev->dev_addr[4] = net_random(); + rdd->dev->dev_addr[5] = net_random(); + + /* No qdisc for us */ + rdd->dev->qdisc = NULL; + rdd->dev->tx_queue_len = 0; + + //printk("redirdev: created redirect-device %p\n", vlan); + + /* link to list */ + //printk("8 "); + spin_lock_irqsave(&rdd_cfg_lock, flags); + rdd->next = rdds; + rdds = rdd; + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + //printk("End of redirdev_create, registering rdd->dev: %p (%s)\n", + // rdd->dev, rdd->dev->name); + + register_netdev(rdd->dev); + + 
//printk("End of mac_vlan create2\n"); + + atomic_inc(&rdd_dev_counter); + //printk("9\n"); + rv = 0; + goto out; + + /* Error case, clean up vlan memory */ + outfree: + if (rdd) { + kfree(rdd); + } + if (nnd) { + kfree(nnd); + } + if (td) { + dev_put(td); + } + if (txd) { + dev_put(txd); + } + out: + return rv; +} /* redirdev_create */ + +static int redirdev_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) { + struct net_device* dev = ptr; + struct redirdev* rdd; + unsigned long flags; + + spin_lock_irqsave(&rdd_cfg_lock, flags); + rdd = rdd_find_dev_by_txdev_name(dev->name); + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (!rdd) { + //printk("redirdev: Ignoring event: %lu for device: %s\n", + // event, dev->name); + goto out; + } + + + /* It is OK that we do not hold the group lock right now, + * as we run under the RTNL lock. + */ + + switch (event) { + case NETDEV_CHANGE: + case NETDEV_DOWN: + //printk("redirdev: Ignoring change/up/down for device: %s\n", + // dev->name); + /* Ignore for now */ + break; + + case NETDEV_UP: + /* Start the redir-dev too if it wants to run */ + if ((!netif_running(rdd->dev)) && rdd->wants_to_run) { + printk("Device: %s is up, starting redir-device: %s too.\n", + dev->name, rdd->dev->name); + dev_open(rdd->dev); + } + break; + + case NETDEV_UNREGISTER: + /* Stop the redir-dev too */ + printk("Device: %s is going away, closing redir-device: %s too.\n", + dev->name, rdd->dev->name); + if (rdd->dev->flags & IFF_UP) { + /* Graceful shutdown, drop links to our peer. */ + dev_close(rdd->dev); + } + else { + /* Still drop links to peer...but dev_close would not have done anything. */ + redirdev_stop(rdd->dev); + } + rdd->wants_to_run = 1; /* was forced down. 
*/ + break; + + }; + +out: + return NOTIFY_DONE; +} + +/* Has locking internally */ +int redirdev_cleanup(const char* ifname, int force) { + struct redirdev* d; //walker + struct redirdev* prev = NULL; + unsigned long flags; + int rv; + + //printk(__FUNCTION__"(%p)\n",vlan); + //printk("rdd_cln: %s", ifname); + + spin_lock_irqsave(&rdd_cfg_lock, flags); + for (d = rdds; d; d = d->next) { + if (strcmp(d->dev->name, ifname) == 0) { + if ((d->dev->flags & IFF_UP) && (!force)) { + rv = -EBUSY; + goto unlockout; + } + + // Un-link from the list. + if (prev) { + prev->next = d->next; + d->next = NULL; + } + else { + // This means we're first in line + rdds = d->next; + d->next = NULL; + } + + break; + } + prev = d; + } + + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (d) { + if (d->dev->flags & IFF_UP) { + BUG_ON(!force); + + rtnl_lock(); + dev_close(d->dev); + rtnl_unlock(); + } + + if (d->tx_dev) { + dev_put(d->tx_dev); + } + + dev_put(d->dev); + unregister_netdev(d->dev); + rv = 0; + } + else { + rv = -ENODEV; + } + goto out; + + unlockout: + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + out: + return rv; +} /* redirdev cleanup */ + + +static int redirdev_ioctl_deviceless_stub(void* arg) { + int err = 0; + struct redirdev_ioctl req; + unsigned long flags; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + switch (req.cmd) { + case REDIRDEV_ADD: { + /* + * create a new redirect device + */ + req.txifname[IFNAMSIZ-1] = '\0'; + req.ifname[IFNAMSIZ-1] = '\0'; + printk("Creating redir via ioctl, ifname: %s txifname: %s\n", + req.ifname, req.txifname); + + /* Has internal locking. 
*/ + err = redirdev_create(req.ifname, req.txifname); + break; + } + case REDIRDEV_DEL: { + /* + * destroy a redirect device + */ + req.ifname[IFNAMSIZ-1] = '\0'; + + /* Has internal locking */ + err = redirdev_cleanup(req.ifname, 0); + break; + } + + case REDIRDEV_IS_REDIRDEV: { + /* + * Give user-space a chance of determining if we are a redirect-device + * or not. + * (If the IOCTL fails, we are not, otherwise we are.) + */ + struct redirdev* rdd; + req.ifname[IFNAMSIZ-1] = '\0'; + + spin_lock_irqsave(&rdd_cfg_lock, flags); + /* find the port in question */ + rdd = rdd_find_dev_by_name(req.ifname); + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (!rdd) { + /* printk("device: %s is NOT a REDIR device\n", ifname); */ + err = -ENODEV; + } + else { + /* printk("device: %s IS a MAC-VLAN\n", ifname); */ + err = 0; + } + break; + } + case REDIRDEV_SET_QUOTA: { + /* + * Set the quota. 0xFFFFFFFF means disable quota logic. + * (If the IOCTL fails, we are not, otherwise we are.) + */ + struct redirdev* rdd; + struct net_device* dev; + + // Get device by idx; + dev = dev_get_by_index(req.ifidx); + if (dev) { + if (dev->get_stats == redirdev_get_stats) { + rdd = dev->priv; + rdd->quota = req.flags; + netif_wake_queue(dev); + } + else { + err = -EINVAL; + } + dev_put(dev); + } + else { + err = -ENODEV; + } + break; + } + case REDIRDEV_GET_BY_IDX: { + /* + * get the nth redirdev name + */ + struct redirdev *rdd; + int n = req.ifidx; + + spin_lock_irqsave(&rdd_cfg_lock, flags); + /* find the port in question */ + for (rdd = rdds; rdd && n; rdd = rdd->next, n--); + if (!rdd) { + err = -ENODEV; + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + } + else { + memcpy(req.ifname, rdd->dev->name, IFNAMSIZ); + memcpy(req.txifname, rdd->tx_dev_name, IFNAMSIZ); + if (rdd->tx_dev) { + req.flags |= RDD_ASSOCIATED; + } + else { + req.flags &= ~RDD_ASSOCIATED; + } + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (copy_to_user(arg, &req, sizeof(req))) { + err = -EFAULT; + } + } + 
break; + } + case REDIRDEV_GET_BY_NAME: { + /* + * get info on the specified redirect device + */ + struct redirdev *rdd; + req.ifname[IFNAMSIZ-1] = '\0'; + + spin_lock_irqsave(&rdd_cfg_lock, flags); + /* find the port in question */ + rdd = rdd_find_dev_by_name(req.ifname); + + if (!rdd) { + err = -ENODEV; + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + } + else { + memcpy(req.ifname, rdd->dev->name, IFNAMSIZ); + memcpy(req.txifname, rdd->tx_dev_name, IFNAMSIZ); + if (rdd->tx_dev) { + req.flags |= RDD_ASSOCIATED; + } + else { + req.flags &= ~RDD_ASSOCIATED; + } + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + if (copy_to_user(arg, &req, sizeof(req))) { + err = -EFAULT; + } + } + break; + } + default: + printk("ERROR: Un-supported redirdev ioctl command: %u\n", + (unsigned int)(req.cmd)); + send_sig(SIGSEGV, current, 1); // TODO: Remove + err = -EOPNOTSUPP; + break; + }//switch + + /* printk("Returning err: %i\n", err); */ + return err; +}/* ioctl handler */ + + +#ifdef RDD_CONFIG_PROC_FS + +static int read_rdd_glbl(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int ret = -1; + char *p = page; + int mx_len = (4096 - (p - page)); + + if (! *eof ) { + struct redirdev* rdd; + int cnt; + unsigned long flags; + + /* Global counts here... 
*/ + p += sprintf(p, "Redirect-devices: %i quotas-enabled: %i\ndev\ttx-dev\tquota\n", + atomic_read(&rdd_dev_counter), do_quotas); + + spin_lock_irqsave(&rdd_cfg_lock, flags); + rdd = rdds; + while (rdd) { + if (rdd->tx_dev) { + p += sprintf(p, "%s\t%s\t%i\n", + rdd->dev->name, rdd->tx_dev->name, rdd->quota); + } + else { + p += sprintf(p, " %s\t[%s]\t%i\n", + rdd->dev->name, rdd->tx_dev_name, rdd->quota); + } + + /* catch overflow */ + cnt = p - page; + if (cnt > (mx_len - 60)) { + if (mx_len - cnt >= 20) { + p += sprintf(p, "OUT_OF_SPACE!\n"); + } + break; + } + + rdd = rdd->next; + } + + ret = p - page; + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + } + return ret; +} /* read_rdd_glbl */ + +static int write_rdd_glbl(struct file *file, const char *buffer, + unsigned long count, void *data) { + char *p; + const char *end; + int ret=count; + int len; + char dev_name[2][IFNAMSIZ]; + char* tmps = NULL; + int tmp_rv; + char ss[50]; + end = buffer + count; + + snprintf(ss, 50, "redir proc cmd: %%.%lus", count); + + printk(ss, buffer); + + for (p= (char *) buffer; p< end ; ) { + if (iswhitespace(*p)) { + p++; + continue; + } + + memset(dev_name[0], 0 ,IFNAMSIZ); + memset(dev_name[1], 0 ,IFNAMSIZ); + + len = strlen("add_rdd "); + if (strncmp(p, "add_rdd ", len)==0) + { + p += len; + + if ( (p + IFNAMSIZ) <= end) + p += copy_next_word(dev_name[0], p, IFNAMSIZ); + else + p += copy_next_word(dev_name[0], p, end-p ); + + skip_whitespace(p); + + if ( (p + IFNAMSIZ) <= end) + p += copy_next_word(dev_name[1], p, IFNAMSIZ); + else + p += copy_next_word(dev_name[1], p, end-p ); + + skip_whitespace(p); + + /* This can fail, but not sure how to return failure + * to user-space here. + * NOTE: Does it's own internal locking. 
+ */ + redirdev_create(dev_name[0], dev_name[1]); + goto forend; + } + + len = strlen("remove_rdd "); + if (strncmp(p,"remove_rdd ", len)==0) { + p += len; + + if ( (p + IFNAMSIZ) <= end) + p += copy_next_word(dev_name[0], p, IFNAMSIZ); + else + p += copy_next_word(dev_name[0], p, end-p ); + + skip_whitespace(p); + + tmp_rv = redirdev_cleanup(dev_name[0], 0); + if (tmp_rv < 0) { + printk("redirdev: ERROR: Failed redirdev_cleanup, error: %d\n", tmp_rv); + } + + goto forend; + } + + len = strlen("debug_lvl "); + if (strncmp(p,"debug_lvl ",len)==0) + { + p += len; + + if ( (p + IFNAMSIZ) <= end) + p += copy_next_word(dev_name[0], p, IFNAMSIZ); + else + p += copy_next_word(dev_name[0], p, end-p ); + + skip_whitespace(p); + + debug_lvl = simple_strtoul(dev_name[0], &tmps, 10); + goto forend; + } + + len = strlen("do_quotas "); + if (strncmp(p,"do_quotas ",len)==0) { + p += len; + + if ( (p + IFNAMSIZ) <= end) + p += copy_next_word(dev_name[0], p, IFNAMSIZ); + else + p += copy_next_word(dev_name[0], p, end-p ); + + skip_whitespace(p); + + do_quotas = simple_strtoul(dev_name[0], &tmps, 10); + goto forend; + } + + printk("ERROR: Unsupported command\n"); + + forend: + p++; + } + + return ret; +} /* write_rdd_glbl */ + +#endif + + +static int __init redirdev_init(void) { + int err; + printk(KERN_INFO "Redirect-Network-Device: 1.0 \n"); + + rdds = NULL; + + redirdev_ioctl_set(redirdev_ioctl_deviceless_stub); + +#ifdef RDD_CONFIG_PROC_FS + + rdd_proc_dir = proc_mkdir(RDD_PROC_DIR, proc_net); + if (rdd_proc_dir) { + rdd_proc_cfg = create_proc_read_entry(RDD_PROC_CFG, S_IRUGO, rdd_proc_dir, + read_rdd_glbl, NULL); + if (rdd_proc_cfg) { + rdd_proc_cfg->write_proc = write_rdd_glbl; + rdd_proc_cfg->owner = THIS_MODULE; + } + } +#endif + + /* Register us to receive netdevice events */ + err = register_netdevice_notifier(&redirdev_notifier_block); + if (err < 0) { + printk("ERROR: redirdev: Failed to register netdevice notifier callback!\n"); + } + + return 0; +} + +static void 
redirdev_module_cleanup(void) { + char nm[IFNAMSIZ+1]; + unsigned long flags; + + redirdev_ioctl_set(NULL); + + spin_lock_irqsave(&rdd_cfg_lock, flags); + /* destroy all redirect devices */ + while (rdds) { + strncpy(nm, rdds->dev->name, IFNAMSIZ); + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + if (redirdev_cleanup(nm, 1) < 0) { + printk("redirdev: ERROR: Failed redir_cleanup in redir_module_cleanup\n"); + + } + spin_lock_irqsave(&rdd_cfg_lock, flags); + } + spin_unlock_irqrestore(&rdd_cfg_lock, flags); + + /* Un-register us from receiving netdevice events */ + unregister_netdevice_notifier(&redirdev_notifier_block); + +#ifdef RDD_CONFIG_PROC_FS + if (rdd_proc_cfg) { + remove_proc_entry(RDD_PROC_CFG, rdd_proc_dir); + rdd_proc_cfg = NULL; + } + if (rdd_proc_dir) { + remove_proc_entry(RDD_PROC_DIR, proc_net); + rdd_proc_dir = NULL; + } +#endif + +}/* redirdev_cleanup */ + + +module_init(redirdev_init); +module_exit(redirdev_module_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/redir/redirdev.h b/net/redir/redirdev.h new file mode 100644 index 0000000..223a00d --- /dev/null +++ b/net/redir/redirdev.h @@ -0,0 +1,41 @@ +/* -*- linux-c -*- + +# (C) Copyright 2005 +# Ben Greear +# Released under the GPL version 2 +*/ + +#ifndef REDIRDEV_KERNEL_H_FILE__ +#define REDIRDEV_KERNEL_H_FILE__ + + +/* Proc file related */ +#define RDD_MX_ARG_LEN 80 + +#ifdef CONFIG_PROC_FS + +/* To use or not to use the PROC-FS */ +#define RDD_CONFIG_PROC_FS + +#endif + + +/*********************************************************/ +/* types */ +/*********************************************************/ +struct redirdev { + /* Can be NULL if not yet associated */ + struct net_device* tx_dev; /* Call rx on this device when a packet + * is _transmitted_ on this redirect + * device. 
+ */ + struct net_device* dev; /* the device struct this belongs too */ + struct redirdev *next; + char tx_dev_name[IFNAMSIZ]; + struct net_device_stats statistics; + int wants_to_run; /* Should we be running if we can? */ + u32 quota; /* Used for crude rate limitation. 0xFFFFFFFF means run forever */ +}; + +#endif + diff --git a/net/socket.c b/net/socket.c index 8e5be74..e628826 100644 --- a/net/socket.c +++ b/net/socket.c @@ -813,6 +813,30 @@ void vlan_ioctl_set(int (*hook) (void __user *)) EXPORT_SYMBOL(vlan_ioctl_set); +static DEFINE_MUTEX(macvlan_ioctl_mutex); +static int (*macvlan_ioctl_hook)(void __user*); + +void macvlan_ioctl_set(int (*hook)(void __user*)) +{ + mutex_lock(&macvlan_ioctl_mutex); + macvlan_ioctl_hook = hook; + mutex_unlock(&macvlan_ioctl_mutex); +} +EXPORT_SYMBOL(macvlan_ioctl_set); + + +static DEFINE_MUTEX(redirdev_ioctl_mutex); +static int (*redirdev_ioctl_hook)(void __user*); + +void redirdev_ioctl_set(int (*hook)(void __user*)) +{ + mutex_lock(&redirdev_ioctl_mutex); + redirdev_ioctl_hook = hook; + mutex_unlock(&redirdev_ioctl_mutex); +} +EXPORT_SYMBOL(redirdev_ioctl_set); + + static DEFINE_MUTEX(dlci_ioctl_mutex); static int (*dlci_ioctl_hook) (unsigned int, void __user *); @@ -882,6 +906,28 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = vlan_ioctl_hook(argp); mutex_unlock(&vlan_ioctl_mutex); break; + case SIOCGIFMACVLAN: + case SIOCSIFMACVLAN: + err = -ENOPKG; + if (!macvlan_ioctl_hook) + request_module("macvlan"); + + mutex_lock(&macvlan_ioctl_mutex); + if (macvlan_ioctl_hook) + err = macvlan_ioctl_hook(argp); + mutex_unlock(&macvlan_ioctl_mutex); + break; + case SIOCGIFREDIRDEV: + case SIOCSIFREDIRDEV: + err = -ENOPKG; + if (!redirdev_ioctl_hook) + request_module("redirdev"); + + mutex_lock(&redirdev_ioctl_mutex); + if (redirdev_ioctl_hook) + err = redirdev_ioctl_hook(argp); + mutex_unlock(&redirdev_ioctl_mutex); + break; case SIOCADDDLCI: case SIOCDELDLCI: err = -ENOPKG;