From 90834e19a0babaa6fdddd197c56977898b7a74b3 Mon Sep 17 00:00:00 2001 From: Syrone Wong Date: Sat, 9 Apr 2022 01:31:10 +0800 Subject: [PATCH] nft-fullcone: init OpenWrt package Signed-off-by: Syrone Wong --- Makefile | 45 ++ src/Kconfig | 18 + src/Lindent | 29 + src/Makefile | 18 + src/nf_nat_fullcone.c | 1611 ++++++++++++++++++++++++++++++++++++++++ src/nf_nat_fullcone.h | 156 ++++ src/nft_ext_fullcone.c | 458 ++++++++++++ 7 files changed, 2335 insertions(+) create mode 100644 Makefile create mode 100644 src/Kconfig create mode 100755 src/Lindent create mode 100644 src/Makefile create mode 100644 src/nf_nat_fullcone.c create mode 100644 src/nf_nat_fullcone.h create mode 100644 src/nft_ext_fullcone.c diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5df6b63 --- /dev/null +++ b/Makefile @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (c) 2018 Chion Tang +# Original xt_FULLCONENAT and related iptables extension author +# Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB +# Added IPv6 support for xt_FULLCONENAT and ip6tables extension +# Ported to recent kernel versions +# Copyright (c) 2022 Syrone Wong +# Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module + +include $(TOPDIR)/rules.mk +include $(INCLUDE_DIR)/kernel.mk + +PKG_NAME:=nft-fullcone +PKG_RELEASE:=1 + +include $(INCLUDE_DIR)/package.mk + +define KernelPackage/nft-fullcone + SUBMENU:=Netfilter Extensions + DEPENDS:=@IPV6 +kmod-nft-core +kmod-nf-conntrack +kmod-nf-conntrack6 + TITLE:=nftables fullcone expression support + FILES:= $(PKG_BUILD_DIR)/nft_fullcone.ko + KCONFIG:= CONFIG_NFT_FULLCONE=y CONFIG_NF_NAT=y CONFIG_NF_NAT_IPV6=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_CHAIN_EVENTS=y + PROVIDES:=$(PKG_NAME) + AUTOLOAD:=$(call AutoProbe,nft_fullcone) +endef + +define KernelPackage/nft-fullcone/Description +Kernel module adds the fullcone expression that you can use +to perform NAT in the RFC3489-compatible full cone SNAT flavour. +Currently only UDP traffic is supported for full-cone NAT. +For other protos FULLCONENAT is equivalent to MASQUERADE. +endef + +# make use of all CPUs +define Build/Compile + +$(MAKE) $(PKG_JOBS) $(KERNEL_MAKEOPTS) \ + M="$(PKG_BUILD_DIR)" \ + EXTRA_CFLAGS="$(BUILDFLAGS)" \ + $(if $(CONFIG_IPv6),EXTRA_CFLAGS+="-DCONFIG_SFE_ECM",) \ + modules + +endef + +$(eval $(call KernelPackage,nft-fullcone)) diff --git a/src/Kconfig b/src/Kconfig new file mode 100644 index 0000000..2c82c25 --- /dev/null +++ b/src/Kconfig @@ -0,0 +1,18 @@ +# +# Nftables/netfilter fullcone expression support +# +config NFT_FULLCONE + depends on NF_CONNTRACK + depends on NF_NAT + tristate "Netfilter nf_tables fullcone support" + help + This options adds the "fullcone" expression that you can use + to perform NAT in the RFC3489-compatible full cone SNAT flavour. + Currently only UDP traffic is supported for full-cone NAT. + For other protos FULLCONENAT is equivalent to MASQUERADE. + + To compile this code as a module, choose M here: the module will be + called nft_fullcone. + + If unsure, say N. 
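+
+	  A minimal ruleset sketch, assuming the matching nftables userspace
+	  patch that provides the 'fullcone' statement (the rule syntax below
+	  is an illustration only and is not part of this kernel module;
+	  "wan" is a placeholder interface name):
+
+	    table ip nat {
+	      chain srcnat {
+	        type nat hook postrouting priority srcnat; policy accept;
+	        oifname "wan" meta l4proto udp fullcone
+	      }
+	      chain dstnat {
+	        type nat hook prerouting priority dstnat; policy accept;
+	        iifname "wan" meta l4proto udp fullcone
+	      }
+	    }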
+ diff --git a/src/Lindent b/src/Lindent new file mode 100755 index 0000000..7393661 --- /dev/null +++ b/src/Lindent @@ -0,0 +1,29 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# original +#PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1" +# use wide screen in 21th century +PARAM="-npro -kr -i8 -ts8 -sob -l120 -ss -ncs -cp1" + +RES=$(indent --version | cut -d' ' -f3) +if [ "$RES" = "" ]; then + exit 1 +fi +V1=$(echo $RES | cut -d'.' -f1) +V2=$(echo $RES | cut -d'.' -f2) +V3=$(echo $RES | cut -d'.' -f3) + +if [ $V1 -gt 2 ]; then + PARAM="$PARAM -il0" +elif [ $V1 -eq 2 ]; then + if [ $V2 -gt 2 ]; then + PARAM="$PARAM -il0" + elif [ $V2 -eq 2 ]; then + if [ $V3 -ge 10 ]; then + PARAM="$PARAM -il0" + fi + fi +fi + +indent $PARAM "$@" diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..8427ebb --- /dev/null +++ b/src/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for Nftables/netfilter fullcone expression support. +# + +# module name, should not have the same name as src file names +TARGET = nft_fullcone + +obj-m += $(TARGET).o + +$(TARGET)-objs := \ + nf_nat_fullcone.o \ + nft_ext_fullcone.o + +# product +ccflags-y += -Werror -Wall + +# develop +#ccflags-y += -Wall -Wno-unused-function diff --git a/src/nf_nat_fullcone.c b/src/nf_nat_fullcone.c new file mode 100644 index 0000000..b60e31a --- /dev/null +++ b/src/nf_nat_fullcone.c @@ -0,0 +1,1611 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Nftables NAT extension: fullcone expression support library + * + * Copyright (c) 2018 Chion Tang + * Original xt_FULLCONENAT and related iptables extension author + * Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB + * Added IPv6 support for xt_FULLCONENAT and ip6tables extension + * Ported to recent kernel versions + * Copyright (c) 2022 Syrone Wong + * Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module + */ + +#define pr_fmt(fmt) "fullcone " KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +#include +#endif + +#include +#include + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +#include +#include +#include +#endif + +#include "nf_nat_fullcone.h" + +/* + * FULLCONE_HKEY generates u32 hash value + * Modified from net/netfilter/ipset/ip_set_hash_gen.h + * dataptr: a pointer + * datatypelen: sizeof(struct blah) or sizeof(u32) + * initval: initial value + * htable_bits: hashtable bits + */ +#define FULLCONE_HKEY(dataptr, datatypelen, initval, htable_bits) \ +({ \ + const u32 *__k = (const u32 *)(dataptr); \ + u32 __l = (datatypelen) / sizeof(u32); \ + \ + BUILD_BUG_ON((datatypelen) % sizeof(u32) != 0); \ + \ + jhash2(__k, __l, (initval)) & jhash_mask((htable_bits)); \ +}) + +#define HASHTABLE_BUCKET_BITS 10 + +/* static variables */ + +static DEFINE_HASHTABLE(mapping_table_by_ext_port, HASHTABLE_BUCKET_BITS); +static DEFINE_HASHTABLE(mapping_table_by_int_src, HASHTABLE_BUCKET_BITS); + +static DEFINE_SPINLOCK(fullconenat_lock); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static DEFINE_HASHTABLE(mapping6_table_by_ext_port, HASHTABLE_BUCKET_BITS); +static DEFINE_HASHTABLE(mapping6_table_by_int_src, HASHTABLE_BUCKET_BITS); + +static DEFINE_SPINLOCK(fullconenat6_lock); +#endif + +static LIST_HEAD(dying_tuple_list); +static DEFINE_SPINLOCK(dying_tuple_list_lock); + +/* static variables 
end */ + +/* forward declaration */ + +#if IS_ENABLED(CONFIG_IPV6) +static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, + const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); +#endif + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +/* non-atomic: can only be called serially within lock zones. */ +static char *fullcone_nf_ct_stringify_tuple6(const struct nf_conntrack_tuple + *t); +#endif +/* non-atomic: can only be called serially within lock zones. */ +static char *nf_ct_stringify_tuple(const struct nf_conntrack_tuple *t); + +static __be32 get_device_ip(const struct net_device *dev); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static struct nat_mapping6 *allocate_mapping6(const union nf_inet_addr + *int_addr, + const uint16_t int_port, + const uint16_t port, const union nf_inet_addr *addr); +#endif +static struct nat_mapping *allocate_mapping(const __be32 int_addr, + const uint16_t int_port, const uint16_t port, const __be32 addr); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static void add_original_tuple_to_mapping6(struct nat_mapping6 *mapping, const struct nf_conntrack_tuple + *original_tuple); +#endif +static void add_original_tuple_to_mapping(struct nat_mapping *mapping, const struct nf_conntrack_tuple + *original_tuple); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static struct nat_mapping6 *get_mapping6_by_int_src(const union nf_inet_addr + *src_ip, const uint16_t src_port, const union nf_inet_addr + *ext_ip); +#endif + +static struct nat_mapping *get_mapping_by_int_src(const __be32 src_ip, const uint16_t src_port, const __be32 ext_ip); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static struct nat_mapping6 *get_mapping6_by_int_src_inrange(const union + nf_inet_addr + *src_ip, const uint16_t src_port, const union + nf_inet_addr + *min_ip, const union + nf_inet_addr + *max_ip); +#endif +static struct nat_mapping *get_mapping_by_int_src_inrange(const __be32 src_ip, + const uint16_t + src_port, const __be32 min_ip, const __be32 max_ip); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static void kill_mapping6(struct nat_mapping6 *mapping); +#endif +static void kill_mapping(struct nat_mapping *mapping); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + +/* check if a mapping is valid. + * possibly delete and free an invalid mapping. + * the mapping should not be used anymore after check_mapping6() returns 0. */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const struct nf_conntrack_zone *zone); +#else +static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const u16 zone); +#endif + +#endif + +/* check if a mapping is valid. + * possibly delete and free an invalid mapping. + * the mapping should not be used anymore after check_mapping() returns 0. 
*/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static int check_mapping(struct nat_mapping *mapping, struct net *net, const struct nf_conntrack_zone *zone); +#else +static int check_mapping(struct nat_mapping *mapping, struct net *net, const u16 zone); +#endif + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr + *ext_ip, struct net *net, const struct + nf_conntrack_zone *zone); +#else +static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr + *ext_ip, struct net *net, const u16 zone); +#endif +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, const __be32 ext_ip, struct net *net, const struct + nf_conntrack_zone *zone); +#else +static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, + const __be32 ext_ip, struct net *net, const u16 zone); +#endif + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static uint16_t find_appropriate_port6(struct net *net, + const struct nf_conntrack_zone *zone, + const uint16_t original_port, const union nf_inet_addr *ext_ip, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range); + +#else + struct nf_nat_range *range); + +#endif + +#else +static uint16_t find_appropriate_port6(struct net *net, const u16 zone, + const uint16_t original_port, const union nf_inet_addr *ext_ip, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range); + +#else + struct nf_nat_range *range); + +#endif +#endif + +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static uint16_t find_appropriate_port(struct net *net, + const struct nf_conntrack_zone *zone, + const uint16_t original_port, const __be32 ext_ip, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range); +#else + struct nf_nat_range *range); +#endif + +#else +static uint16_t find_appropriate_port(struct net *net, const u16 zone, + const uint16_t original_port, const __be32 ext_ip, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range); +#else + struct nf_nat_range *range); +#endif +#endif + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static void find_leastused_ip6(const struct nf_conntrack_zone *zone, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const union nf_inet_addr *src, + const union nf_inet_addr *dst, union nf_inet_addr *var_ipp); +#else +static void find_leastused_ip6(const u16 zone, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const union nf_inet_addr *src, + const union nf_inet_addr *dst, union nf_inet_addr *var_ipp); +#endif +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static __be32 find_leastused_ip(const struct nf_conntrack_zone *zone, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const __be32 src, const __be32 dst); +#else +static 
__be32 find_leastused_ip(const u16 zone, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const __be32 src, const __be32 dst); +#endif + +/* forward declaration end */ + +/* non-atomic part */ + +static char tuple_tmp_string[512]; + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +/* non-atomic: can only be called serially within lock zones. */ +static char *fullcone_nf_ct_stringify_tuple6(const struct nf_conntrack_tuple *t) +{ + snprintf(tuple_tmp_string, sizeof(tuple_tmp_string), + "[%pI6c]:%hu -> [%pI6c]:%hu", &t->src.u3.ip6, + be16_to_cpu(t->src.u.all), &t->dst.u3.ip6, be16_to_cpu(t->dst.u.all)); + return tuple_tmp_string; +} +#endif +/* non-atomic: can only be called serially within lock zones. */ +static char *nf_ct_stringify_tuple(const struct nf_conntrack_tuple *t) +{ + snprintf(tuple_tmp_string, sizeof(tuple_tmp_string), + "%pI4:%hu -> %pI4:%hu", &t->src.u3.ip, + be16_to_cpu(t->src.u.all), &t->dst.u3.ip, be16_to_cpu(t->dst.u.all)); + return tuple_tmp_string; +} + +/* non-atomic part end */ + +void nf_nat_fullcone_dying_tuple_list_add(struct list_head *new_dying) +{ + spin_lock_bh(&dying_tuple_list_lock); + list_add(new_dying, &dying_tuple_list); + spin_unlock_bh(&dying_tuple_list_lock); +} + +EXPORT_SYMBOL_GPL(nf_nat_fullcone_dying_tuple_list_add); + +static __be32 get_device_ip(const struct net_device *dev) +{ + struct in_device *in_dev; + struct in_ifaddr *if_info; + __be32 result; + + if (dev == NULL) { + return 0; + } + + rcu_read_lock(); + in_dev = dev->ip_ptr; + if (in_dev == NULL) { + rcu_read_unlock(); + return 0; + } + if_info = in_dev->ifa_list; + if (if_info) { + result = if_info->ifa_local; + rcu_read_unlock(); + return result; + } else { + rcu_read_unlock(); + return 0; + } +} + +void nf_nat_fullcone_handle_dying_tuples(void) +{ + struct list_head *iter, *tmp, *iter_2, *tmp_2; + struct tuple_list *item; + struct nf_conntrack_tuple *ct_tuple; + struct nat_mapping *mapping; + __be32 ip, ext_ip; + uint16_t port; + struct nat_mapping_original_tuple *original_tuple_item; +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + struct nat_mapping6 *mapping6; + union nf_inet_addr *ip6, *ext_ip6; + spin_lock_bh(&fullconenat6_lock); +#endif + + spin_lock_bh(&fullconenat_lock); + spin_lock_bh(&dying_tuple_list_lock); + + list_for_each_safe(iter, tmp, &dying_tuple_list) { + item = list_entry(iter, struct tuple_list, list); + + /* we dont know the conntrack direction for now so we try in both ways. 
*/ + ct_tuple = &(item->tuple_original); +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + if (ct_tuple->src.l3num == PF_INET6) { + ip6 = &(ct_tuple->src).u3; + port = be16_to_cpu((ct_tuple->src).u.udp.port); + ext_ip6 = &item->tuple_reply.dst.u3; + mapping6 = get_mapping6_by_int_src(ip6, port, ext_ip6); + if (mapping6 == NULL) { + ext_ip6 = &(ct_tuple->dst).u3; + ct_tuple = &(item->tuple_reply); + ip6 = &(ct_tuple->src).u3; + port = be16_to_cpu((ct_tuple->src).u.udp.port); + mapping6 = get_mapping6_by_int_src(ip6, port, ext_ip6); + if (mapping6 != NULL) { + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): INBOUND dying conntrack at ext port %d\n", + mapping6->port); + } + } else { + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): OUTBOUND dying conntrack at ext port %d\n", + mapping6->port); + } + + if (mapping6 == NULL) { + goto next; + } + + /* look for the corresponding out-dated tuple and free it */ + list_for_each_safe(iter_2, tmp_2, &mapping6->original_tuple_list) { + original_tuple_item = list_entry(iter_2, struct + nat_mapping_original_tuple, node); + + if (nf_ct_tuple_equal(&original_tuple_item->tuple, &(item->tuple_original))) { + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): tuple %s expired. free this tuple.\n", + fullcone_nf_ct_stringify_tuple6(&original_tuple_item->tuple)); + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + (mapping6->refer_count)--; + } + } + + /* then kill the mapping if needed */ + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): refer_count for mapping at ext_port %d is now %d\n", + mapping6->port, mapping6->refer_count); + if (mapping6->refer_count <= 0) { + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): kill expired mapping at ext port %d\n", + mapping6->port); + kill_mapping6(mapping6); + } + goto next; + } + if (unlikely(ct_tuple->src.l3num != PF_INET)) +#else + if (ct_tuple->src.l3num != PF_INET) +#endif + goto next; + + ip = (ct_tuple->src).u3.ip; + port = be16_to_cpu((ct_tuple->src).u.udp.port); + ext_ip = item->tuple_reply.dst.u3.ip; + mapping = get_mapping_by_int_src(ip, port, ext_ip); + if (mapping == NULL) { + ext_ip = (ct_tuple->dst).u3.ip; + ct_tuple = &(item->tuple_reply); + ip = (ct_tuple->src).u3.ip; + port = be16_to_cpu((ct_tuple->src).u.udp.port); + mapping = get_mapping_by_int_src(ip, port, ext_ip); + if (mapping != NULL) { + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): INBOUND dying conntrack at ext port %d\n", + mapping->port); + } + } else { + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): OUTBOUND dying conntrack at ext port %d\n", + mapping->port); + } + + if (mapping == NULL) { + goto next; + } + + /* look for the corresponding out-dated tuple and free it */ + list_for_each_safe(iter_2, tmp_2, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter_2, struct nat_mapping_original_tuple, node); + + if (nf_ct_tuple_equal(&original_tuple_item->tuple, &(item->tuple_original))) { + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): tuple %s expired. 
free this tuple.\n", + nf_ct_stringify_tuple(&original_tuple_item->tuple)); + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + (mapping->refer_count)--; + } + } + + /* then kill the mapping if needed */ + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): refer_count for mapping at ext_port %d is now %d\n", + mapping->port, mapping->refer_count); + if (mapping->refer_count <= 0) { + pr_debug + ("nf_nat_fullcone_handle_dying_tuples(): kill expired mapping at ext port %d\n", + mapping->port); + kill_mapping(mapping); + } + +next: + list_del(&item->list); + kfree(item); + } + + spin_unlock_bh(&dying_tuple_list_lock); + spin_unlock_bh(&fullconenat_lock); +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + spin_unlock_bh(&fullconenat6_lock); +#endif +} + +EXPORT_SYMBOL_GPL(nf_nat_fullcone_handle_dying_tuples); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static struct nat_mapping6 *allocate_mapping6(const union nf_inet_addr + *int_addr, + const uint16_t int_port, + const uint16_t port, const union nf_inet_addr *addr) +{ + struct nat_mapping6 *p_new; + u32 hash_src; + + p_new = kmalloc(sizeof(struct nat_mapping6), GFP_ATOMIC); + if (p_new == NULL) { + pr_err("kmalloc() for allocate_mapping6 failed.\n"); + return NULL; + } + p_new->addr = *addr; + p_new->port = port; + p_new->int_addr = *int_addr; + p_new->int_port = int_port; + p_new->refer_count = 0; + (p_new->original_tuple_list).next = &(p_new->original_tuple_list); + (p_new->original_tuple_list).prev = &(p_new->original_tuple_list); + + hash_src = FULLCONE_HKEY(int_addr, sizeof(union nf_inet_addr), (u32) int_port, HASHTABLE_BUCKET_BITS); + //hash_src = jhash2((u32 *) int_addr->all, 4, (u32) int_port); + + hash_add(mapping6_table_by_ext_port, &p_new->node_by_ext_port, port); + hash_add(mapping6_table_by_int_src, &p_new->node_by_int_src, hash_src); + + pr_debug("new mapping allocated for [%pI6c]:%d ==> [%pI6c]:%d\n", + &p_new->int_addr, p_new->int_port, &p_new->addr, p_new->port); + + return p_new; +} +#endif +static struct nat_mapping *allocate_mapping(const __be32 int_addr, + const uint16_t int_port, const uint16_t port, const __be32 addr) +{ + struct nat_mapping *p_new; + u32 hash_src; + + p_new = kmalloc(sizeof(struct nat_mapping), GFP_ATOMIC); + if (p_new == NULL) { + pr_err("kmalloc() for allocate_mapping failed.\n"); + return NULL; + } + p_new->addr = addr; + p_new->port = port; + p_new->int_addr = int_addr; + p_new->int_port = int_port; + p_new->refer_count = 0; + (p_new->original_tuple_list).next = &(p_new->original_tuple_list); + (p_new->original_tuple_list).prev = &(p_new->original_tuple_list); + + hash_src = FULLCONE_HKEY(&int_addr, sizeof(__be32), (u32) int_port, HASHTABLE_BUCKET_BITS); + //hash_src = HASH_2(int_addr, (u32) int_port); + + hash_add(mapping_table_by_ext_port, &p_new->node_by_ext_port, port); + hash_add(mapping_table_by_int_src, &p_new->node_by_int_src, hash_src); + + pr_debug("new mapping allocated for %pI4:%d ==> %pI4:%d\n", + &p_new->int_addr, p_new->int_port, &p_new->addr, p_new->port); + + return p_new; +} + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static void add_original_tuple_to_mapping6(struct nat_mapping6 *mapping, const struct nf_conntrack_tuple + *original_tuple) +{ + struct nat_mapping_original_tuple *item = kmalloc(sizeof(struct nat_mapping_original_tuple), GFP_ATOMIC); + if 
(item == NULL) { + pr_err("kmalloc() for add_original_tuple_to_mapping6 failed.\n"); + return; + } + memcpy(&item->tuple, original_tuple, sizeof(struct nf_conntrack_tuple)); + list_add(&item->node, &mapping->original_tuple_list); + (mapping->refer_count)++; +} +#endif +static void add_original_tuple_to_mapping(struct nat_mapping *mapping, const struct nf_conntrack_tuple + *original_tuple) +{ + struct nat_mapping_original_tuple *item = kmalloc(sizeof(struct nat_mapping_original_tuple), GFP_ATOMIC); + if (item == NULL) { + pr_err("kmalloc() for add_original_tuple_to_mapping failed.\n"); + return; + } + memcpy(&item->tuple, original_tuple, sizeof(struct nf_conntrack_tuple)); + list_add(&item->node, &mapping->original_tuple_list); + (mapping->refer_count)++; +} + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static struct nat_mapping6 *get_mapping6_by_int_src(const union nf_inet_addr + *src_ip, const uint16_t src_port, const union nf_inet_addr + *ext_ip) +{ + struct nat_mapping6 *p_current; + u32 hash_src = FULLCONE_HKEY(src_ip, sizeof(union nf_inet_addr), (u32) src_port, HASHTABLE_BUCKET_BITS); + //u32 hash_src = jhash2((u32 *) src_ip->all, 4, (u32) src_port); + + hash_for_each_possible(mapping6_table_by_int_src, p_current, node_by_int_src, hash_src) { + if (nf_inet_addr_cmp(&p_current->int_addr, src_ip) + && p_current->int_port == src_port && nf_inet_addr_cmp(&p_current->addr, ext_ip)) { + return p_current; + } + } + + return NULL; +} +#endif + +static struct nat_mapping *get_mapping_by_int_src(const __be32 src_ip, const uint16_t src_port, const __be32 ext_ip) +{ + struct nat_mapping *p_current; + u32 hash_src = FULLCONE_HKEY(&src_ip, sizeof(__be32), (u32) src_port, HASHTABLE_BUCKET_BITS); + //u32 hash_src = HASH_2(src_ip, (u32) src_port); + + hash_for_each_possible(mapping_table_by_int_src, p_current, node_by_int_src, hash_src) { + if (p_current->int_addr == src_ip && p_current->int_port == src_port && p_current->addr == ext_ip) { + return p_current; + } + } + + return NULL; +} + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static struct nat_mapping6 *get_mapping6_by_int_src_inrange(const union + nf_inet_addr + *src_ip, const uint16_t src_port, const union + nf_inet_addr + *min_ip, const union + nf_inet_addr + *max_ip) +{ + struct nat_mapping6 *p_current; + + u32 hash_src = FULLCONE_HKEY(src_ip, sizeof(union nf_inet_addr), (u32) src_port, HASHTABLE_BUCKET_BITS); + //u32 hash_src = jhash2((u32 *) src_ip->all, 4, (u32) src_port); + + hash_for_each_possible(mapping6_table_by_int_src, p_current, node_by_int_src, hash_src) { + if (nf_inet_addr_cmp(&p_current->int_addr, src_ip) + && p_current->int_port == src_port + && memcmp(&p_current->addr, min_ip, + sizeof(union nf_inet_addr)) >= 0 + && memcmp(&p_current->addr, max_ip, sizeof(union nf_inet_addr)) <= 0) { + return p_current; + } + } + + return NULL; +} +#endif +static struct nat_mapping *get_mapping_by_int_src_inrange(const __be32 src_ip, + const uint16_t + src_port, const __be32 min_ip, const __be32 max_ip) +{ + struct nat_mapping *p_current; + u32 hash_src = FULLCONE_HKEY(&src_ip, sizeof(__be32), (u32) src_port, HASHTABLE_BUCKET_BITS); + //u32 hash_src = HASH_2(src_ip, (u32) src_port); + + hash_for_each_possible(mapping_table_by_int_src, p_current, node_by_int_src, hash_src) { + if (p_current->int_addr == src_ip + && p_current->int_port == src_port + && memcmp(&p_current->addr, &min_ip, sizeof(__be32)) >= 
0 + && memcmp(&p_current->addr, &max_ip, sizeof(__be32)) <= 0) { + return p_current; + } + } + + return NULL; +} + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +static void kill_mapping6(struct nat_mapping6 *mapping) +{ + struct list_head *iter, *tmp; + struct nat_mapping_original_tuple *original_tuple_item; + + if (mapping == NULL) { + return; + } + + list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + } + + hash_del(&mapping->node_by_ext_port); + hash_del(&mapping->node_by_int_src); + kfree(mapping); +} +#endif +static void kill_mapping(struct nat_mapping *mapping) +{ + struct list_head *iter, *tmp; + struct nat_mapping_original_tuple *original_tuple_item; + + if (mapping == NULL) { + return; + } + + list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + } + + hash_del(&mapping->node_by_ext_port); + hash_del(&mapping->node_by_int_src); + kfree(mapping); +} + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + +/* check if a mapping is valid. + * possibly delete and free an invalid mapping. + * the mapping should not be used anymore after check_mapping6() returns 0. */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const struct nf_conntrack_zone *zone) +{ +#else +static int check_mapping6(struct nat_mapping6 *mapping, struct net *net, const u16 zone) +{ +#endif + struct list_head *iter, *tmp; + struct nat_mapping_original_tuple *original_tuple_item; + struct nf_conntrack_tuple_hash *tuple_hash; + struct nf_conn *ct; + + /* for dying/unconfirmed conntrack tuples, an IPCT_DESTROY event may NOT be fired. + * so we manually kill one of those tuples once we acquire one. */ + + list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); + + tuple_hash = nf_conntrack_find_get(net, zone, &original_tuple_item->tuple); + + if (tuple_hash == NULL) { + pr_debug + ("check_mapping6(): tuple %s dying/unconfirmed. free this tuple.\n", + fullcone_nf_ct_stringify_tuple6(&original_tuple_item->tuple)); + + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + (mapping->refer_count)--; + } else { + ct = nf_ct_tuplehash_to_ctrack(tuple_hash); + if (likely(ct != NULL)) + nf_ct_put(ct); + } + + } + + /* kill the mapping if need */ + pr_debug + ("check_mapping6() refer_count for mapping at ext_port %d is now %d\n", + mapping->port, mapping->refer_count); + if (mapping->refer_count <= 0) { + pr_debug("check_mapping6(): kill dying/unconfirmed mapping at ext port %d\n", mapping->port); + kill_mapping6(mapping); + return 0; + } else { + return 1; + } +} + +#endif + +/* check if a mapping is valid. + * possibly delete and free an invalid mapping. + * the mapping should not be used anymore after check_mapping() returns 0. 
*/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static int check_mapping(struct nat_mapping *mapping, struct net *net, const struct nf_conntrack_zone *zone) +{ +#else +static int check_mapping(struct nat_mapping *mapping, struct net *net, const u16 zone) +{ +#endif + struct list_head *iter, *tmp; + struct nat_mapping_original_tuple *original_tuple_item; + struct nf_conntrack_tuple_hash *tuple_hash; + struct nf_conn *ct; + + /* for dying/unconfirmed conntrack tuples, an IPCT_DESTROY event may NOT be fired. + * so we manually kill one of those tuples once we acquire one. */ + + list_for_each_safe(iter, tmp, &mapping->original_tuple_list) { + original_tuple_item = list_entry(iter, struct nat_mapping_original_tuple, node); + + tuple_hash = nf_conntrack_find_get(net, zone, &original_tuple_item->tuple); + + if (tuple_hash == NULL) { + pr_debug + ("check_mapping(): tuple %s dying/unconfirmed. free this tuple.\n", + nf_ct_stringify_tuple(&original_tuple_item->tuple)); + + list_del(&original_tuple_item->node); + kfree(original_tuple_item); + (mapping->refer_count)--; + } else { + ct = nf_ct_tuplehash_to_ctrack(tuple_hash); + if (likely(ct != NULL)) + nf_ct_put(ct); + } + + } + + /* kill the mapping if need */ + pr_debug + ("check_mapping() refer_count for mapping at ext_port %d is now %d\n", mapping->port, mapping->refer_count); + if (mapping->refer_count <= 0) { + pr_debug("check_mapping(): kill dying/unconfirmed mapping at ext port %d\n", mapping->port); + kill_mapping(mapping); + return 0; + } else { + return 1; + } +} + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr + *ext_ip, struct net *net, const struct + nf_conntrack_zone *zone) +{ +#else +static struct nat_mapping6 *get_mapping6_by_ext_port(const uint16_t port, const union nf_inet_addr + *ext_ip, struct net *net, const u16 zone) +{ +#endif + struct nat_mapping6 *p_current; + struct hlist_node *tmp; + + hash_for_each_possible_safe(mapping6_table_by_ext_port, p_current, tmp, node_by_ext_port, port) { + if (p_current->port == port && check_mapping6(p_current, net, zone) + && nf_inet_addr_cmp(&p_current->addr, ext_ip)) { + return p_current; + } + } + + return NULL; +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, const __be32 ext_ip, struct net *net, const struct + nf_conntrack_zone *zone) +{ +#else +static struct nat_mapping *get_mapping_by_ext_port(const uint16_t port, + const __be32 ext_ip, struct net *net, const u16 zone) +{ +#endif + struct nat_mapping *p_current; + struct hlist_node *tmp; + + hash_for_each_possible_safe(mapping_table_by_ext_port, p_current, tmp, node_by_ext_port, port) { + if (p_current->port == port && check_mapping(p_current, net, zone) + && p_current->addr == ext_ip) { + return p_current; + } + } + + return NULL; +} + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static uint16_t find_appropriate_port6(struct net *net, + const struct nf_conntrack_zone *zone, + const uint16_t original_port, const union nf_inet_addr *ext_ip, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range) +#else + struct nf_nat_range *range) +#endif +#else +static uint16_t 
find_appropriate_port6(struct net *net, const u16 zone, + const uint16_t original_port, const union nf_inet_addr *ext_ip, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range) +#else + struct nf_nat_range *range) +#endif +#endif +{ + uint16_t min, start, selected, range_size, i; + struct nat_mapping6 *mapping = NULL; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + // nf_nat_range2 specific + memset(&range->base_proto, 0, sizeof(range->base_proto)); +#endif + if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { + min = be16_to_cpu((range->min_proto).udp.port); + range_size = be16_to_cpu((range->max_proto).udp.port) - min + 1; + } else { + /* minimum port is 1024. same behavior as default linux NAT. */ + min = 1024; + range_size = 65535 - min + 1; + } + + if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM) + || (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY)) { + /* for now we do the same thing for both --random and --random-fully */ + + /* select a random starting point */ + start = (uint16_t) (prandom_u32() % (u32) range_size); + } else { + + if ((original_port >= min && original_port <= min + range_size - 1) + || !(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + /* 1. try to preserve the port if it's available */ + mapping = get_mapping6_by_ext_port(original_port, ext_ip, net, zone); + if (mapping == NULL) { + return original_port; + } + } + + /* otherwise, we start from zero */ + start = 0; + } + + for (i = 0; i < range_size; i++) { + /* 2. try to find an available port */ + selected = min + ((start + i) % range_size); + mapping = get_mapping6_by_ext_port(selected, ext_ip, net, zone); + if (mapping == NULL) { + return selected; + } + } + + /* 3. at least we tried. override a previous mapping. */ + selected = min + start; + mapping = get_mapping6_by_ext_port(selected, ext_ip, net, zone); + kill_mapping6(mapping); + + return selected; +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static uint16_t find_appropriate_port(struct net *net, + const struct nf_conntrack_zone *zone, + const uint16_t original_port, const __be32 ext_ip, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range) +#else + struct nf_nat_range *range) +#endif +#else +static uint16_t find_appropriate_port(struct net *net, const u16 zone, + const uint16_t original_port, const __be32 ext_ip, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range) +#else + struct nf_nat_range *range) +#endif +#endif +{ + uint16_t min, start, selected, range_size, i; + struct nat_mapping *mapping = NULL; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + // nf_nat_range2 specific + memset(&range->base_proto, 0, sizeof(range->base_proto)); +#endif + if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { + min = be16_to_cpu((range->min_proto).udp.port); + range_size = be16_to_cpu((range->max_proto).udp.port) - min + 1; + } else { + /* minimum port is 1024. same behavior as default linux NAT. */ + min = 1024; + range_size = 65535 - min + 1; + } + + if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM) + || (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY)) { + /* for now we do the same thing for both --random and --random-fully */ + + /* select a random starting point */ + start = (uint16_t) (prandom_u32() % (u32) range_size); + } else { + + if ((original_port >= min && original_port <= min + range_size - 1) + || !(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + /* 1. 
try to preserve the port if it's available */ + mapping = get_mapping_by_ext_port(original_port, ext_ip, net, zone); + if (mapping == NULL) { + return original_port; + } + } + + /* otherwise, we start from zero */ + start = 0; + } + + for (i = 0; i < range_size; i++) { + /* 2. try to find an available port */ + selected = min + ((start + i) % range_size); + mapping = get_mapping_by_ext_port(selected, ext_ip, net, zone); + if (mapping == NULL) { + return selected; + } + } + + /* 3. at least we tried. override a previous mapping. */ + selected = min + start; + mapping = get_mapping_by_ext_port(selected, ext_ip, net, zone); + kill_mapping(mapping); + + return selected; +} + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static void find_leastused_ip6(const struct nf_conntrack_zone *zone, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const union nf_inet_addr *src, + const union nf_inet_addr *dst, union nf_inet_addr *var_ipp) +#else +static void find_leastused_ip6(const u16 zone, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const union nf_inet_addr *src, + const union nf_inet_addr *dst, union nf_inet_addr *var_ipp) +#endif +{ + unsigned int i; + /* Host order */ + u32 minip, maxip, j, dist; + bool full_range; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + // nf_nat_range2 specific + memset(&(range->base_proto), 0, sizeof(range->base_proto)); +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) + j = FULLCONE_HKEY(src, sizeof(union nf_inet_addr), + range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone->id, HASHTABLE_BUCKET_BITS); + //j = jhash2((u32 *) src, 4, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone->id); +#else + j = FULLCONE_HKEY(src, sizeof(union nf_inet_addr), + range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone, HASHTABLE_BUCKET_BITS); + //j = jhash2((u32 *) src, 4, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst->all[3] ^ zone); +#endif + + full_range = false; + for (i = 0; i <= 3; i++) { + /* If first bytes of the address are at the maximum, use the + * distance. Otherwise use the full range. 
*/ + if (!full_range) { + minip = ntohl(range->min_addr.all[i]); + maxip = ntohl(range->max_addr.all[i]); + dist = maxip - minip + 1; + } else { + minip = 0; + dist = ~0; + } + + var_ipp->all[i] = (__force __be32) htonl(minip + reciprocal_scale(j, dist)); + if (var_ipp->all[i] != range->max_addr.all[i]) + full_range = true; + + if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) + j ^= (__force u32) dst->all[i]; + } +} +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) +static __be32 find_leastused_ip(const struct nf_conntrack_zone *zone, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const __be32 src, const __be32 dst) +#else +static __be32 find_leastused_ip(const u16 zone, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const __be32 src, const __be32 dst) +#endif +{ + /* Host order */ + u32 minip, maxip, j, dist; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + // nf_nat_range2 specific + memset(&(range->base_proto), 0, sizeof(range->base_proto)); +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) + j = FULLCONE_HKEY(&src, sizeof(__be32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone->id, + HASHTABLE_BUCKET_BITS); + //j = jhash_1word((u32) src, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone->id); +#else + j = FULLCONE_HKEY(&src, sizeof(__be32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone, + HASHTABLE_BUCKET_BITS); + //j = jhash_1word((u32) src, range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : dst ^ zone); +#endif + + minip = ntohl(range->min_addr.ip); + maxip = ntohl(range->max_addr.ip); + dist = maxip - minip + 1; + + return (__be32) htonl(minip + reciprocal_scale(j, dist)); +} + +void nf_nat_fullcone_destroy_mappings(void) +{ + struct nat_mapping *p_current; +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + struct nat_mapping6 *p6_current; +#endif + struct hlist_node *tmp; + int i; + + spin_lock_bh(&fullconenat_lock); + + hash_for_each_safe(mapping_table_by_ext_port, i, tmp, p_current, node_by_ext_port) { + kill_mapping(p_current); + } + + spin_unlock_bh(&fullconenat_lock); + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) + spin_lock_bh(&fullconenat6_lock); + + hash_for_each_safe(mapping6_table_by_ext_port, i, tmp, p6_current, node_by_ext_port) { + kill_mapping6(p6_current); + } + + spin_unlock_bh(&fullconenat6_lock); +#endif +} + +EXPORT_SYMBOL_GPL(nf_nat_fullcone_destroy_mappings); + +/* + * nfproto choices + * enum { + NFPROTO_INET = 1, + NFPROTO_IPV4 = 2, + NFPROTO_IPV6 = 10, +}; + */ +static unsigned int nf_nat_handle_prerouting(u8 nfproto, struct sk_buff *skb, unsigned int hooknum, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *newrange) +#else + struct nf_nat_range *newrange) +#endif +{ + unsigned int ret; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) + const struct nf_conntrack_zone *zone; +#else + u16 zone; +#endif + struct net *net; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conntrack_tuple *ct_tuple_origin; + + uint16_t port, original_port; + uint8_t protonum; + +/* NFPROTO specific def */ + struct nat_mapping *mapping; + struct nat_mapping6 *mapping_6; + + __be32 ip; + union nf_inet_addr *ip_6; + /* NFPROTO specific def end */ + + WARN_ON(!(nfproto == NFPROTO_IPV4 || nfproto 
== NFPROTO_IPV6)); + + /* NFPROTO specific init */ + mapping = NULL; + mapping_6 = NULL; + + ip = 0; + ip_6 = NULL; + /* NFPROTO specific init end */ + + original_port = 0; + ret = NFT_CONTINUE; // BUG: use XT_CONTINUE for Xtables + + ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(ct); + zone = nf_ct_zone(ct); + + ct_tuple_origin = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + + protonum = (ct_tuple_origin->dst).protonum; + if (protonum != IPPROTO_UDP) { + // Currently only UDP traffic is supported for full-cone NAT. + // For other protos FULLCONENAT is equivalent to MASQUERADE. + return ret; + } + + if (nfproto == NFPROTO_IPV4) { + ip = (ct_tuple_origin->dst).u3.ip; + } else if (nfproto == NFPROTO_IPV6) { + ip_6 = &(ct_tuple_origin->dst).u3; + } + + port = be16_to_cpu((ct_tuple_origin->dst).u.udp.port); + + if (nfproto == NFPROTO_IPV4) { + spin_lock_bh(&fullconenat_lock); + } else if (nfproto == NFPROTO_IPV6) { + spin_lock_bh(&fullconenat6_lock); + } + + /* find an active mapping based on the inbound port */ + if (nfproto == NFPROTO_IPV4) { + mapping = get_mapping_by_ext_port(port, ip, net, zone); + } else if (nfproto == NFPROTO_IPV6) { + mapping_6 = get_mapping6_by_ext_port(port, ip_6, net, zone); + } + + if (nfproto == NFPROTO_IPV4) { + if (mapping == NULL) { + goto unlock; + } + } else if (nfproto == NFPROTO_IPV6) { + if (mapping_6 == NULL) { + goto unlock; + } + } + + newrange->flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; + if (nfproto == NFPROTO_IPV4) { + newrange->min_addr.ip = mapping->int_addr; + newrange->max_addr.ip = mapping->int_addr; + } else if (nfproto == NFPROTO_IPV6) { + newrange->min_addr = mapping_6->int_addr; + newrange->max_addr = mapping_6->int_addr; + } + + newrange->min_proto.udp.port = cpu_to_be16(mapping->int_port); + newrange->max_proto = newrange->min_proto; + + if (nfproto == NFPROTO_IPV4) { + pr_debug(" %s ==> %pI4:%d\n", + nf_ct_stringify_tuple(ct_tuple_origin), &mapping->int_addr, mapping->int_port); + } else if (nfproto == NFPROTO_IPV6) { + pr_debug(" %s ==> [%pI6c]:%d\n", + fullcone_nf_ct_stringify_tuple6(ct_tuple_origin), &mapping_6->int_addr, mapping_6->int_port); + } + + ret = nf_nat_setup_info(ct, newrange, HOOK2MANIP(hooknum)); + + if (ret == NF_ACCEPT) { + if (nfproto == NFPROTO_IPV4) { + add_original_tuple_to_mapping(mapping, ct_tuple_origin); + pr_debug + ("INBOUND: refer_count for mapping at ext_port %d is now %d\n", + mapping->port, mapping->refer_count); + } else if (nfproto == NFPROTO_IPV6) { + add_original_tuple_to_mapping6(mapping_6, ct_tuple_origin); + pr_debug + ("INBOUND: refer_count for mapping_6 at ext_port %d is now %d\n", + mapping_6->port, mapping_6->refer_count); + } + + } + +unlock: + if (nfproto == NFPROTO_IPV4) { + spin_unlock_bh(&fullconenat_lock); + } else if (nfproto == NFPROTO_IPV6) { + spin_unlock_bh(&fullconenat6_lock); + } + + return ret; + +} + +static unsigned int nf_nat_handle_postrouting(u8 nfproto, struct sk_buff *skb, unsigned int hooknum, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, struct nf_nat_range2 *newrange, +#else + struct nf_nat_range *range, struct nf_nat_range *newrange, +#endif + const struct net_device *out) +{ + unsigned int ret; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) + const struct nf_conntrack_zone *zone; +#else + u16 zone; +#endif + struct net *net; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + struct nf_conntrack_tuple *ct_tuple, *ct_tuple_origin; + uint16_t port, original_port, want_port; + 
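/* port: external port actually assigned, read back from the reply
+	 * tuple once nf_nat_setup_info() has run; original_port: the flow's
+	 * internal (pre-NAT) source port; want_port: candidate external
+	 * port chosen by find_appropriate_port()/find_appropriate_port6() */
+	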
uint8_t protonum; + bool is_src_mapping_active; + + /* NFPROTO specific def */ + struct nat_mapping *mapping, *src_mapping; + struct nat_mapping6 *mapping_6, *src_mapping_6; + + __be32 ip; + union nf_inet_addr *ip_6; + /* NFPROTO specific def end */ + + WARN_ON(!(nfproto == NFPROTO_IPV4 || nfproto == NFPROTO_IPV6)); + + /* NFPROTO specific init */ + mapping = NULL; + src_mapping = NULL; + mapping_6 = NULL; + src_mapping_6 = NULL; + + ip = 0; + ip_6 = NULL; + /* NFPROTO specific init end */ + + original_port = 0; + ret = NFT_CONTINUE; // BUG: use XT_CONTINUE for Xtables + + ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(ct); + zone = nf_ct_zone(ct); + + ct_tuple_origin = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + protonum = (ct_tuple_origin->dst).protonum; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) { + if (nfproto == NFPROTO_IPV4) { + newrange->min_addr.ip = range->min_addr.ip; + newrange->max_addr.ip = range->max_addr.ip; + } else if (nfproto == NFPROTO_IPV6) { + newrange->min_addr = range->min_addr; + newrange->max_addr = range->max_addr; + } + + } else { + if (nfproto == NFPROTO_IPV4) { + newrange->min_addr.ip = get_device_ip(skb->dev); + if (unlikely(!newrange->min_addr.ip)) + return NF_DROP; + newrange->max_addr.ip = newrange->min_addr.ip; + } else if (nfproto == NFPROTO_IPV6) { + if (unlikely + (nat_ipv6_dev_get_saddr + (nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &(newrange->min_addr.in6)) < 0)) + return NF_DROP; + newrange->max_addr = newrange->min_addr; + + } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0) + nat = nf_ct_nat_ext_add(ct); +#else + nat = nfct_nat(ct); +#endif + if (likely(nat)) + nat->masq_index = out->ifindex; + + } + + if (protonum == IPPROTO_UDP) { + if (nfproto == NFPROTO_IPV4) { + ip = (ct_tuple_origin->src).u3.ip; + } else if (nfproto == NFPROTO_IPV6) { + ip_6 = &(ct_tuple_origin->src).u3; + } + + original_port = be16_to_cpu((ct_tuple_origin->src).u.udp.port); + + if (nfproto == NFPROTO_IPV4) { + spin_lock_bh(&fullconenat_lock); + } else if (nfproto == NFPROTO_IPV6) { + spin_lock_bh(&fullconenat6_lock); + } + + if (nfproto == NFPROTO_IPV4) { + if (newrange->min_addr.ip != newrange->max_addr.ip) + src_mapping = + get_mapping_by_int_src_inrange(ip, + original_port, + newrange->min_addr.ip, newrange->max_addr.ip); + else + src_mapping = get_mapping_by_int_src(ip, original_port, newrange->min_addr.ip); + } else if (nfproto == NFPROTO_IPV6) { + if (!nf_inet_addr_cmp(&newrange->min_addr, &newrange->max_addr)) + src_mapping_6 = + get_mapping6_by_int_src_inrange(ip_6, + original_port, + &newrange->min_addr, &newrange->max_addr); + else + src_mapping_6 = get_mapping6_by_int_src(ip_6, original_port, &newrange->min_addr); + } + + if (nfproto == NFPROTO_IPV4) { + is_src_mapping_active = src_mapping != NULL && check_mapping(src_mapping, net, zone); + } else if (nfproto == NFPROTO_IPV6) { + is_src_mapping_active = src_mapping_6 != NULL && check_mapping6(src_mapping_6, net, zone); + } + + if (is_src_mapping_active) { + + /* outbound nat: if a previously established mapping is active, + * we will reuse that mapping. 
*/ + + newrange->flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; + if (nfproto == NFPROTO_IPV4) { + newrange->min_proto.udp.port = cpu_to_be16(src_mapping->port); + } else if (nfproto == NFPROTO_IPV6) { + newrange->min_proto.udp.port = cpu_to_be16(src_mapping_6->port); + } + + newrange->max_proto = newrange->min_proto; + + if (nfproto == NFPROTO_IPV4) { + if (newrange->min_addr.ip != newrange->max_addr.ip) { + newrange->min_addr.ip = src_mapping->addr; + newrange->max_addr.ip = newrange->min_addr.ip; + } + } else if (nfproto == NFPROTO_IPV6) { + if (!nf_inet_addr_cmp(&newrange->min_addr, &newrange->max_addr)) { + newrange->min_addr = src_mapping_6->addr; + newrange->max_addr = newrange->min_addr; + } + } + + } else { + + /* if not, we find a new external IP:port to map to. + * the SNAT may fail so we should re-check the mapped port later. */ + + if (nfproto == NFPROTO_IPV4) { + if (newrange->min_addr.ip != newrange->max_addr.ip) { + newrange->min_addr.ip = + find_leastused_ip(zone, range, ip, (ct_tuple_origin->dst).u3.ip); + newrange->max_addr.ip = newrange->min_addr.ip; + } + want_port = + find_appropriate_port(net, zone, original_port, newrange->min_addr.ip, range); + } else if (nfproto == NFPROTO_IPV6) { + + if (!nf_inet_addr_cmp(&newrange->min_addr, &newrange->max_addr)) { + find_leastused_ip6(zone, range, ip_6, + &(ct_tuple_origin->dst).u3, &newrange->min_addr); + newrange->max_addr = newrange->min_addr; + } + + want_port = + find_appropriate_port6(net, zone, original_port, &newrange->min_addr, range); + } + + newrange->flags = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED; + newrange->min_proto.udp.port = cpu_to_be16(want_port); + newrange->max_proto = newrange->min_proto; + + if (nfproto == NFPROTO_IPV4) { + src_mapping = NULL; + } else if (nfproto == NFPROTO_IPV6) { + src_mapping_6 = NULL; + } + + } + } + + /* do SNAT now */ + ret = nf_nat_setup_info(ct, newrange, HOOK2MANIP(hooknum)); + + if (protonum != IPPROTO_UDP) { + /* non-UDP packets, bailout */ + goto out; + } + if (ret != NF_ACCEPT) { + /* failed SNAT, bailout */ + goto unlock; + } + + /* the reply tuple contains the mapped port. */ + ct_tuple = &(ct->tuplehash[IP_CT_DIR_REPLY].tuple); + /* this is the resulted mapped port. 
*/ + port = be16_to_cpu((ct_tuple->dst).u.udp.port); + + if (nfproto == NFPROTO_IPV4) { + pr_debug(" %s ==> %d\n", nf_ct_stringify_tuple(ct_tuple_origin), port); + } else if (nfproto == NFPROTO_IPV6) { + pr_debug(" %s ==> %d\n", fullcone_nf_ct_stringify_tuple6(ct_tuple_origin), port); + } + + /* save the mapping information into our mapping table */ + + if (nfproto == NFPROTO_IPV4) { + mapping = src_mapping; + if (mapping == NULL) { + mapping = allocate_mapping(ip, original_port, port, (ct_tuple->dst).u3.ip); + } + if (likely(mapping != NULL)) { + add_original_tuple_to_mapping(mapping, ct_tuple_origin); + pr_debug + (" OUTBOUND: refer_count for mapping at ext_port %d is now %d\n", + mapping->port, mapping->refer_count); + } + } else if (nfproto == NFPROTO_IPV6) { + mapping_6 = src_mapping_6; + if (mapping_6 == NULL) { + mapping_6 = allocate_mapping6(ip_6, original_port, port, &(ct_tuple->dst).u3); + } + if (likely(mapping_6 != NULL)) { + add_original_tuple_to_mapping6(mapping_6, ct_tuple_origin); + pr_debug + ("OUTBOUND: refer_count for mapping at ext_port %d is now %d\n", + mapping_6->port, mapping_6->refer_count); + } + } + +unlock: + if (nfproto == NFPROTO_IPV4) { + spin_unlock_bh(&fullconenat_lock); + } else if (nfproto == NFPROTO_IPV6) { + spin_unlock_bh(&fullconenat6_lock); + } + +out: + return ret; +} + +unsigned int nf_nat_fullcone_ipv4(struct sk_buff *skb, unsigned int hooknum, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const struct net_device *out) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 newrange; +#else + struct nf_nat_range newrange; +#endif + + WARN_ON(!(hooknum == NF_INET_POST_ROUTING || hooknum == NF_INET_PRE_ROUTING)); + + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + switch (hooknum) { + case NF_INET_PRE_ROUTING: + /* inbound packets */ + return nf_nat_handle_prerouting(NFPROTO_IPV4, skb, hooknum, &newrange); + case NF_INET_POST_ROUTING: + /* outbound packets */ + return nf_nat_handle_postrouting(NFPROTO_IPV4, skb, hooknum, range, &newrange, out); + } + + WARN_ON(1); + // logical error + return 5; +} + +EXPORT_SYMBOL_GPL(nf_nat_fullcone_ipv4); + +#if IS_ENABLED(CONFIG_IPV6) +static int +nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, + const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr) +{ +#ifdef CONFIG_IPV6_MODULE + const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); + + if (!v6_ops) + return -EHOSTUNREACH; + + return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr); +#else + return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr); +#endif +} +#endif + +unsigned int nf_nat_fullcone_ipv6(struct sk_buff *skb, unsigned int hooknum, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const struct net_device *out) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 newrange; +#else + struct nf_nat_range newrange; +#endif + + WARN_ON(!(hooknum == NF_INET_POST_ROUTING || hooknum == NF_INET_PRE_ROUTING)); + + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + 
newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + switch (hooknum) { + case NF_INET_PRE_ROUTING: + /* inbound packets */ + return nf_nat_handle_prerouting(NFPROTO_IPV6, skb, hooknum, &newrange); + case NF_INET_POST_ROUTING: + /* outbound packets */ + return nf_nat_handle_postrouting(NFPROTO_IPV6, skb, hooknum, range, &newrange, out); + } + + WARN_ON(1); + // logical error + return 5; +} + +EXPORT_SYMBOL_GPL(nf_nat_fullcone_ipv6); diff --git a/src/nf_nat_fullcone.h b/src/nf_nat_fullcone.h new file mode 100644 index 0000000..289ce5c --- /dev/null +++ b/src/nf_nat_fullcone.h @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Nftables NAT extension: fullcone expression support library header + * + * Copyright (c) 2018 Chion Tang + * Original xt_FULLCONENAT and related iptables extension author + * Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB + * Added IPv6 support for xt_FULLCONENAT and ip6tables extension + * Ported to recent kernel versions + * Copyright (c) 2022 Syrone Wong + * Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module + */ +#ifndef _NF_NAT_FULLCONE_H_ +#define _NF_NAT_FULLCONE_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef NF_NAT_RANGE_PROTO_RANDOM_FULLY +#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) +static inline int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + return 0; +} + +static inline void nf_ct_netns_put(struct net *net, u8 nfproto) +{ +} +#endif + +/** + * enum nft_fullcone_attributes - nf_tables fullcone expression netlink attributes + * + * @NFTA_FULLCONE_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) + * @NFTA_FULLCONE_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) + * @NFTA_FULLCONE_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + */ +enum nft_fullcone_attributes { + NFTA_FULLCONE_UNSPEC, + NFTA_FULLCONE_REG_PROTO_MIN, + NFTA_FULLCONE_REG_PROTO_MAX, + NFTA_FULLCONE_FLAGS, + __NFTA_FULLCONE_MAX +}; +#define NFTA_FULLCONE_MAX (__NFTA_FULLCONE_MAX - 1) + +/* fullcone specific data structures */ + +struct nat_mapping_original_tuple { + struct nf_conntrack_tuple tuple; + + struct list_head node; +}; + +struct nat_mapping { + uint16_t port; /* external source port */ + __be32 addr; /* external source ip address */ + + __be32 int_addr; /* internal source ip address */ + uint16_t int_port; /* internal source port */ + + int refer_count; /* how many references linked to this mapping + * aka. length of original_tuple_list */ + + struct list_head original_tuple_list; + + struct hlist_node node_by_ext_port; + struct hlist_node node_by_int_src; + +}; + +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) || (IS_ENABLED(CONFIG_IPV6) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)) +struct nat_mapping6 { + uint16_t port; /* external source port */ + union nf_inet_addr addr; /* external source ip address */ + + union nf_inet_addr int_addr; /* internal source ip address */ + uint16_t int_port; /* internal source port */ + + int refer_count; /* how many references linked to this mapping + * aka. 
length of original_tuple_list */ + + struct list_head original_tuple_list; + + struct hlist_node node_by_ext_port; + struct hlist_node node_by_int_src; + +}; +#endif + +struct tuple_list { + struct nf_conntrack_tuple tuple_original; + struct nf_conntrack_tuple tuple_reply; + struct list_head list; +}; + +/* fullcone specific data structures end */ + +// NOTE: declaration listed here must use EXPORT_SYMBOL_* + +unsigned int nf_nat_fullcone_ipv4(struct sk_buff *skb, unsigned int hooknum, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const struct net_device *out); + +unsigned int nf_nat_fullcone_ipv6(struct sk_buff *skb, unsigned int hooknum, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range, +#else + struct nf_nat_range *range, +#endif + const struct net_device *out); + +void nf_nat_fullcone_handle_dying_tuples(void); +void nf_nat_fullcone_destroy_mappings(void); +void nf_nat_fullcone_dying_tuple_list_add(struct list_head *new_dying); + +/* + * For [FUTURE] usage + * + * from https://elixir.bootlin.com/linux/v5.15.32/source/net/netfilter/xt_nat.c#L37 +static void xt_nat_convert_range(struct nf_nat_range2 *dst, + const struct nf_nat_ipv4_range *src) +{ + memset(&dst->min_addr, 0, sizeof(dst->min_addr)); + memset(&dst->max_addr, 0, sizeof(dst->max_addr)); + // base_proto is nf_nat_range2 specific + memset(&dst->base_proto, 0, sizeof(dst->base_proto)); + + dst->flags = src->flags; + dst->min_addr.ip = src->min_ip; + dst->max_addr.ip = src->max_ip; + dst->min_proto = src->min; + dst->max_proto = src->max; +} + * + */ + +#endif /*_NF_NAT_FULLCONE_H_ */ diff --git a/src/nft_ext_fullcone.c b/src/nft_ext_fullcone.c new file mode 100644 index 0000000..c28947a --- /dev/null +++ b/src/nft_ext_fullcone.c @@ -0,0 +1,458 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Nftables NAT extension: fullcone expression support + * + * Copyright (c) 2018 Chion Tang + * Original xt_FULLCONENAT and related iptables extension author + * Copyright (c) 2019-2022 GitHub/llccd Twitter/@gNodeB + * Added IPv6 support for xt_FULLCONENAT and ip6tables extension + * Ported to recent kernel versions + * Copyright (c) 2022 Syrone Wong + * Massively rewrite the whole module, split the original code into library and nftables 'fullcone' expression module + */ +#define pr_fmt(fmt) "fullcone " KBUILD_MODNAME ": " fmt +#define NF_FULLCONE_WORKQUEUE_NAME "fullcone " KBUILD_MODNAME ": wq" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nf_nat_fullcone.h" + +static void nft_fullcone_set_regs(const struct nft_expr *expr, const struct nft_regs *regs, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range); +#else + struct nf_nat_range *range); +#endif + +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +struct notifier_block ct_event_notifier; +#else +struct nf_ct_event_notifier ct_event_notifier; +#endif +static DEFINE_MUTEX(nf_ct_net_event_lock); +int ct_event_notifier_registered = 0; + +int module_refer_count = 0; + +static void gc_worker(struct work_struct *work); +static struct workqueue_struct *wq __read_mostly = NULL; +static DECLARE_DELAYED_WORK(gc_worker_wk, gc_worker); + +static void gc_worker(struct work_struct *work) +{ + nf_nat_fullcone_handle_dying_tuples(); +} + +struct nft_fullcone { + u32 flags; + u8 sreg_proto_min; + u8 sreg_proto_max; +}; + +static const struct nla_policy nft_fullcone_policy[NFTA_FULLCONE_MAX + 1] = { + 
[NFTA_FULLCONE_FLAGS] = {.type = NLA_U32 }, + [NFTA_FULLCONE_REG_PROTO_MIN] = {.type = NLA_U32 }, + [NFTA_FULLCONE_REG_PROTO_MAX] = {.type = NLA_U32 }, +}; + +/* conntrack destroy event callback function */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int ct_event_cb(struct notifier_block *this, unsigned long events, void *ptr) +{ + struct nf_ct_event *item = ptr; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) +static int ct_event_cb(unsigned int events, const struct nf_ct_event *item) +{ +#else +static int ct_event_cb(unsigned int events, struct nf_ct_event *item) +{ +#endif + struct nf_conn *ct; + struct nf_conntrack_tuple *ct_tuple_reply, *ct_tuple_original; + uint8_t protonum; + struct tuple_list *dying_tuple_item; + + ct = item->ct; + /* we handle only conntrack destroy events */ + if (ct == NULL || !(events & (1 << IPCT_DESTROY))) { + return 0; + } + + ct_tuple_original = &(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + + ct_tuple_reply = &(ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + protonum = (ct_tuple_original->dst).protonum; + if (protonum != IPPROTO_UDP) { + return 0; + } + + dying_tuple_item = kmalloc(sizeof(struct tuple_list), GFP_ATOMIC); + + if (dying_tuple_item == NULL) { + pr_debug("warning: ct_event_cb(): kmalloc failed.\n"); + return 0; + } + + memcpy(&(dying_tuple_item->tuple_original), ct_tuple_original, sizeof(struct nf_conntrack_tuple)); + memcpy(&(dying_tuple_item->tuple_reply), ct_tuple_reply, sizeof(struct nf_conntrack_tuple)); + + nf_nat_fullcone_dying_tuple_list_add(&(dying_tuple_item->list)); + + if (wq != NULL) + queue_delayed_work(wq, &gc_worker_wk, msecs_to_jiffies(100)); + + return 0; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) +static int exp_event_cb(unsigned int events, const struct nf_exp_event *item) +{ + return 0; +} +#endif + +static int nft_fullcone_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) +{ + int err; + + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; + + // TODO: check hooks + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING)); +} + +static int nft_fullcone_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr *const tb[]) +{ + int err; + int register_ct_notifier_ret = 0; + + err = nf_ct_netns_get(ctx->net, ctx->family); + + mutex_lock(&nf_ct_net_event_lock); + + module_refer_count++; + + pr_debug("nft_fullcone_init(): module_refer_count is now %d\n", module_refer_count); + + if (module_refer_count == 1) { +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + ct_event_notifier.notifier_call = ct_event_cb; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + ct_event_notifier.ct_event = ct_event_cb; + ct_event_notifier.exp_event = exp_event_cb; +#else + ct_event_notifier.fcn = ct_event_cb; +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) + if (!READ_ONCE(ctx->net->ct.nf_conntrack_event_cb)) { + nf_conntrack_register_notifier(ctx->net, &ct_event_notifier); + } +#else + register_ct_notifier_ret = nf_conntrack_register_notifier(ctx->net, &ct_event_notifier); +#endif + + if (register_ct_notifier_ret) { + /* non-zero means failure */ + pr_warn("failed to register a conntrack notifier. 
Disable active GC for mappings.\n"); + } else { + ct_event_notifier_registered = 1; + pr_debug("nft_fullcone_init(): ct_event_notifier registered\n"); + } + + } + + mutex_unlock(&nf_ct_net_event_lock); + + return err; +} + +static int nft_fullcone_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_fullcone *priv = nft_expr_priv(expr); + + if (priv->flags != 0 && nla_put_be32(skb, NFTA_FULLCONE_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + + if (priv->sreg_proto_min) { + if (nft_dump_register(skb, NFTA_FULLCONE_REG_PROTO_MIN, + priv->sreg_proto_min) || + nft_dump_register(skb, NFTA_FULLCONE_REG_PROTO_MAX, priv->sreg_proto_max)) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -1; +} + +/* nft_fullcone_set_regs sets nft_regs from nft_expr fullcone specific private data */ +static void nft_fullcone_set_regs(const struct nft_expr *expr, const struct nft_regs *regs, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 *range +#else + struct nf_nat_range *range +#endif + ) +{ + // private data connected via nft_expr_type.ops <==> nft_expr_ops.type + // private data type from nft_expr_type.{policy,maxattr,ops} + // private data size from nft_expr_ops.size + struct nft_fullcone *priv = nft_expr_priv(expr); + range->flags = priv->flags; + if (priv->sreg_proto_min) { + range->min_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_min]); + range->max_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_max]); + } +} + +static void nft_fullcone_ipv4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 range; +#else + struct nf_nat_range range; +#endif + + memset(&range, 0, sizeof(range)); + nft_fullcone_set_regs(expr, regs, &range); + regs->verdict.code = nf_nat_fullcone_ipv4(pkt->skb, nft_hook(pkt), &range, nft_out(pkt)); +} + +static void nft_fullcone_common_destory(const struct nft_ctx *ctx) +{ + mutex_lock(&nf_ct_net_event_lock); + + module_refer_count--; + + pr_debug("nft_fullcone_common_destory(): module_refer_count is now %d\n", module_refer_count); + + if (module_refer_count == 0) { + if (ct_event_notifier_registered) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) && !defined(CONFIG_NF_CONNTRACK_CHAIN_EVENTS) + nf_conntrack_unregister_notifier(ctx->net); +#else + nf_conntrack_unregister_notifier(ctx->net, &ct_event_notifier); +#endif + ct_event_notifier_registered = 0; + + pr_debug("nft_fullcone_common_destory(): ct_event_notifier unregistered\n"); + + } + } + + mutex_unlock(&nf_ct_net_event_lock); +} + +static void nft_fullcone_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nft_fullcone_common_destory(ctx); + nf_ct_netns_put(ctx->net, NFPROTO_IPV4); +} + +static struct nft_expr_type nft_fullcone_ipv4_type; +static const struct nft_expr_ops nft_fullcone_ipv4_ops = { + .type = &nft_fullcone_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fullcone)), + .eval = nft_fullcone_ipv4_eval, + .init = nft_fullcone_init, + .destroy = nft_fullcone_ipv4_destroy, + .dump = nft_fullcone_dump, + .validate = nft_fullcone_validate, +}; + +static struct nft_expr_type nft_fullcone_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "fullcone", + .ops = &nft_fullcone_ipv4_ops, + .policy = nft_fullcone_policy, + .maxattr = NFTA_FULLCONE_MAX, + .owner = THIS_MODULE, +}; + +#ifdef CONFIG_NF_TABLES_IPV6 +static void 
nft_fullcone_ipv6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) + struct nf_nat_range2 range; +#else + struct nf_nat_range range; +#endif + + memset(&range, 0, sizeof(range)); + nft_fullcone_set_regs(expr, regs, &range); + regs->verdict.code = nf_nat_fullcone_ipv6(pkt->skb, nft_hook(pkt), &range, nft_out(pkt)); +} + +static void nft_fullcone_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nft_fullcone_common_destory(ctx); + nf_ct_netns_put(ctx->net, NFPROTO_IPV6); +} + +static struct nft_expr_type nft_fullcone_ipv6_type; +static const struct nft_expr_ops nft_fullcone_ipv6_ops = { + .type = &nft_fullcone_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fullcone)), + .eval = nft_fullcone_ipv6_eval, + .init = nft_fullcone_init, + .destroy = nft_fullcone_ipv6_destroy, + .dump = nft_fullcone_dump, + .validate = nft_fullcone_validate, +}; + +static struct nft_expr_type nft_fullcone_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "fullcone", + .ops = &nft_fullcone_ipv6_ops, + .policy = nft_fullcone_policy, + .maxattr = NFTA_FULLCONE_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_fullcone_module_init_ipv6(void) +{ + return nft_register_expr(&nft_fullcone_ipv6_type); +} + +static void nft_fullcone_module_exit_ipv6(void) +{ + nft_unregister_expr(&nft_fullcone_ipv6_type); +} +#else +static inline int nft_fullcone_module_init_ipv6(void) +{ + return 0; +} + +static inline void nft_fullcone_module_exit_ipv6(void) +{ +} +#endif + +#ifdef CONFIG_NF_TABLES_INET +static void nft_fullcone_inet_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + return nft_fullcone_ipv4_eval(expr, regs, pkt); + case NFPROTO_IPV6: + return nft_fullcone_ipv6_eval(expr, regs, pkt); + } + + WARN_ON_ONCE(1); +} + +static void nft_fullcone_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nft_fullcone_common_destory(ctx); + nf_ct_netns_put(ctx->net, NFPROTO_INET); +} + +static struct nft_expr_type nft_fullcone_inet_type; +static const struct nft_expr_ops nft_fullcone_inet_ops = { + .type = &nft_fullcone_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fullcone)), + .eval = nft_fullcone_inet_eval, + .init = nft_fullcone_init, + .destroy = nft_fullcone_inet_destroy, + .dump = nft_fullcone_dump, + .validate = nft_fullcone_validate, +}; + +static struct nft_expr_type nft_fullcone_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "fullcone", + .ops = &nft_fullcone_inet_ops, + .policy = nft_fullcone_policy, + .maxattr = NFTA_FULLCONE_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_fullcone_module_init_inet(void) +{ + return nft_register_expr(&nft_fullcone_inet_type); +} + +static void nft_fullcone_module_exit_inet(void) +{ + nft_unregister_expr(&nft_fullcone_inet_type); +} +#else +static inline int nft_fullcone_module_init_inet(void) +{ + return 0; +} + +static inline void nft_fullcone_module_exit_inet(void) +{ +} +#endif + +static int __init nft_fullcone_module_init(void) +{ + int ret; + + ret = nft_fullcone_module_init_ipv6(); + if (ret < 0) + return ret; + + ret = nft_fullcone_module_init_inet(); + if (ret < 0) { + nft_fullcone_module_exit_ipv6(); + return ret; + } + + ret = nft_register_expr(&nft_fullcone_ipv4_type); + if (ret < 0) { + nft_fullcone_module_exit_inet(); + nft_fullcone_module_exit_ipv6(); + return ret; + } + + wq = 
create_singlethread_workqueue(NF_FULLCONE_WORKQUEUE_NAME); + if (wq == NULL) { + pr_err("failed to create workqueue %s\n", NF_FULLCONE_WORKQUEUE_NAME); + } + + return ret; +} + +static void __exit nft_fullcone_module_exit(void) +{ + nft_fullcone_module_exit_ipv6(); + nft_fullcone_module_exit_inet(); + nft_unregister_expr(&nft_fullcone_ipv4_type); + + if (wq) { + cancel_delayed_work_sync(&gc_worker_wk); + flush_workqueue(wq); + destroy_workqueue(wq); + } + + nf_nat_fullcone_handle_dying_tuples(); + nf_nat_fullcone_destroy_mappings(); +} + +module_init(nft_fullcone_module_init); +module_exit(nft_fullcone_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Syrone Wong "); +MODULE_ALIAS_NFT_EXPR("fullcone"); +MODULE_DESCRIPTION("Netfilter nftables fullcone expression support of RFC3489 full cone NAT");
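
Usage sketch (not part of this patch): the kernel expression added above is only
reachable once a matching userspace nftables patch teaches nft a "fullcone"
statement; that userspace change is not included in this commit. Assuming such a
build, a minimal ruleset exercising both hooks the module validates
(NF_INET_PRE_ROUTING and NF_INET_POST_ROUTING) could look roughly like the
following, where the table/chain names and the "wan" interface are illustrative
assumptions only:

  table inet nat {
          chain srcnat {
                  type nat hook postrouting priority srcnat; policy accept;
                  oifname "wan" fullcone
          }
          chain dstnat {
                  type nat hook prerouting priority dstnat; policy accept;
                  iifname "wan" fullcone
          }
  }

As the package description notes, only UDP flows get true full-cone mappings;
other protocols hitting these rules behave like MASQUERADE.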