歡迎來到Linux教程網
Linux教程網
Linux教程網
Linux教程網
您现在的位置: Linux教程網 >> UnixLinux >  >> Linux綜合 >> Linux內核

Linux內核學習:netlink的內核實現原理

注:
     當用戶態進程發送數據時,調用sendmsg實現,其調用內核netlink_sendmsg函數完成,新建了sk_buff,然後給其cb私有緩存中保存了源地址信息,然後把數據拷貝到sk_buff中[nlmsghdr頭部已經附在數據部分前面,作為數據部分了]然後利用netlink_unicast發送出去
      而當內核態發送時,新建了一個sk_buff,頭部填寫了nlmsghdr[利用了nlmsg_put實現]結構信息,然後是數據部分,而且給其cb私有部分NETLINK_CB(skb_1)填寫了本地信息,然後利用netlink_unicast發送出去
                                             netlink結構體簡析
netlink的實現主要在/net/netlink/af_netlink.c文件中,結構如下:
1 內核中的netlink協議類型,AF_NETLINK為netlink協議族
#define NETLINK_ROUTE  0 /* Routing/device hook    */
#define NETLINK_UNUSED  1 /* Unused number    */
#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols  */
#define NETLINK_FIREWALL 3 /* Firewalling hook    */
#define NETLINK_INET_DIAG 4 /* INET socket monitoring   */
#define NETLINK_NFLOG  5 /* netfilter/iptables ULOG */
#define NETLINK_XFRM  6 /* ipsec */
#define NETLINK_SELINUX  7 /* SELinux event notifications */
#define NETLINK_ISCSI  8 /* Open-iSCSI */
#define NETLINK_AUDIT  9 /* auditing */
#define NETLINK_FIB_LOOKUP 10 /* FIB lookup */
#define NETLINK_CONNECTOR 11 /* kernel connector */
#define NETLINK_NETFILTER 12 /* netfilter subsystem */
#define NETLINK_IP6_FW  13 /* IPv6 firewall */
#define NETLINK_DNRTMSG  14 /* DECnet routing messages */
#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */
#define NETLINK_GENERIC  16 /* generic netlink */
/* leave room for NETLINK_DM (DM Events) */
#define NETLINK_SCSITRANSPORT 18 /* SCSI Transports */
#define NETLINK_ECRYPTFS 19
#define MAX_LINKS 32  /* upper bound on protocol numbers; also the size of nl_table */
2 netlink 地址格式
/* netlink address: the AF_NETLINK analogue of struct sockaddr_in */
struct sockaddr_nl
{
 sa_family_t nl_family; /* AF_NETLINK */
 unsigned short nl_pad;  /* zero  */
 __u32  nl_pid;  /* port ID */
        __u32  nl_groups; /* multicast groups mask */
};
3 netlink消息頭
/* header that prefixes every netlink message carried in an skb */
struct nlmsghdr
{
 __u32  nlmsg_len; /* Length of message including header */
 __u16  nlmsg_type; /* Message content */
 __u16  nlmsg_flags; /* Additional flags */
 __u32  nlmsg_seq; /* Sequence number */
 __u32  nlmsg_pid; /* Sending process port ID */
};
4 netlink 套接字結構
struct netlink_sock {
 /* struct sock has to be the first member of netlink_sock */
 struct sock  sk;
 u32   pid; //內核自己的pid,=0
 u32   dst_pid;
 u32   dst_group;對方的組
 u32   flags;
 u32   subscriptions;
 u32   ngroups;組數量
 unsigned long  *groups;組號
 unsigned long  state;
 wait_queue_head_t wait; 進程在接收數據包時等待隊列
 struct netlink_callback *cb;
 struct mutex  *cb_mutex;
 struct mutex  cb_def_mutex;
 void   (*netlink_rcv)(struct sk_buff *skb); //內核態接收到用戶態信息後的處理函數
 struct module  *module;
};
5 skb_buff結構中對應的netlink相關的信息
內核態存儲自己發送地址等信息,在數據報傳到用戶態後,用戶態可能要獲取其中信息
/*
 * Per-skb netlink metadata stored in skb->cb: the sender records its
 * own address info here so the receiver can recover it later.
 */
struct netlink_skb_parms
{
 struct ucred  creds;    /* Skb credentials */
 __u32   pid;  /* sender's port id */
 __u32   dst_group; /* destination multicast group */
 kernel_cap_t  eff_cap;
 __u32   loginuid; /* Login (audit) uid */
 __u32   sessionid; /* Session id (audit) */
 __u32   sid;  /* SELinux security id */
};
/* access the netlink metadata hidden inside skb->cb */
#define NETLINK_CB(skb)  (*(struct netlink_skb_parms*)&((skb)->cb))
6 內核中所有的netlink套接字存儲在一個全局的哈希表中,該結構定義如下
static struct netlink_table *nl_table; 其中每個協議對應一個哈希表,所有同一種協議的數據報散列在同一個哈希表中
下面為一種協議所連接的哈希表結構:
/* per-protocol table; nl_table holds one of these per netlink protocol */
struct netlink_table {
struct nl_pid_hash hash;   /* socks hashed by pid; effectively the client list */
struct hlist_head mc_list; /* list of multicast socks */
unsigned long *listeners;  /* listener bitmap */
unsigned int nl_nonroot;
unsigned int groups;  /* number of multicast groups for this protocol; multiple of 8, minimum 32 */
struct module *module;
int registered;
};
最大可有MAX_LINKS(32)個表,處理不同協議類型的netlink套接口, 注意由於是自身的通信, 本機同時作為服務器和客戶端, 服務端需要一個套接口對應, 每個客戶端也要有一個套接口對應, 多個客戶端的套接口形成一個鏈表.
/* pid-keyed hash of the socks belonging to one netlink protocol */
struct nl_pid_hash {
struct hlist_head *table; /* bucket array; a pid hashes to one chain of socks */
unsigned long rehash_time; /* interval between rehashes */
unsigned int mask;
unsigned int shift;
unsigned int entries;     /* number of socks currently linked in */
unsigned int max_shift;  /* maximum power-of-two table size */
u32 rnd;   /* random hash seed */
};
7 一些與netlink相關的宏
#define NLMSG_ALIGNTO 4
/* round len up to the next multiple of NLMSG_ALIGNTO */
#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) )
#define NLMSG_HDRLEN  ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr)))
#define NLMSG_LENGTH(len) ((len)+NLMSG_ALIGN(NLMSG_HDRLEN))
/* total aligned size of nlmsghdr plus len payload bytes */
#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len))
/* pointer to the payload that follows the nlmsghdr header */
#define NLMSG_DATA(nlh)  ((void*)(((char*)nlh) + NLMSG_LENGTH(0)))
#define NLMSG_NEXT(nlh,len)  ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \
      (struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len)))
#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \
      (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \
      (nlh)->nlmsg_len <= (len))
#define NLMSG_PAYLOAD(nlh,len) ((nlh)->nlmsg_len - NLMSG_SPACE((len)))
8 netlink的函數操作集合[掛載於socket->ops結構中]
static const struct proto_ops netlink_ops = {
 .family = PF_NETLINK,
 .owner = THIS_MODULE,
 .release = netlink_release,
 .bind =  netlink_bind,
 .connect = netlink_connect,
 .socketpair = sock_no_socketpair,
 .accept = sock_no_accept,
 .getname = netlink_getname,
 .poll =  datagram_poll,
 .ioctl = sock_no_ioctl,
 .listen = sock_no_listen,
 .shutdown = sock_no_shutdown,
 .setsockopt = netlink_setsockopt,
 .getsockopt = netlink_getsockopt,
 .sendmsg = netlink_sendmsg,   netlink 套接字實際的發送和接受函數
 .recvmsg = netlink_recvmsg,
 .mmap =  sock_no_mmap,
 .sendpage = sock_no_sendpage,
};
                                             函數跟蹤分析
1 內核中創建netlink函數
建立socket、sock結構並初始化
/*
 * We export these functions to other modules. They provide a
 * complete set of kernel non-blocking support for message
 * queueing.
 */

struct sock *
netlink_kernel_create(struct net *net, int unit, unsigned int groups,
        void (*input)(struct sk_buff *skb),struct mutex *cb_mutex, struct module *module)
{
 struct socket *sock;
 struct sock *sk;
 struct netlink_sock *nlk;
 unsigned long *listeners = NULL;

 BUG_ON(!nl_table);

 if (unit < 0 || unit >= MAX_LINKS)
  return NULL;

 if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) //創建socket結構
  return NULL;

 /*
  * We have to just have a reference on the net from sk, but don't
  * get_net it. Besides, we cannot get and then put the net here.
  * So we create one inside init_net and the move it to net.
  */

 if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)  創建sock結構,並初始化
  goto out_sock_release_nosk;

 sk = sock->sk;
 sk_change_net(sk, net);

 if (groups < 32)
  groups = 32;

 listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head),
       GFP_KERNEL);
 if (!listeners)
  goto out_sock_release;

 sk->sk_data_ready = netlink_data_ready; //什麼都不做
 if (input)
  nlk_sk(sk)->netlink_rcv = input; //設置內核態的接受函數

 if (netlink_insert(sk, net, 0))
  goto out_sock_release;

 nlk = nlk_sk(sk);取得sock嵌入的netlink_sock結構體
 nlk->flags |= NETLINK_KERNEL_SOCKET;

 netlink_table_grab();
 if (!nl_table[unit].registered) {
  nl_table[unit].groups = groups;
  nl_table[unit].listeners = listeners;
  nl_table[unit].cb_mutex = cb_mutex;
  nl_table[unit].module = module;
  nl_table[unit].registered = 1; 更新netlink_table結構體信息,每中協議對應一個netlink_table結構
 } else {
  kfree(listeners);
  nl_table[unit].registered++;
 }
 netlink_table_ungrab();
 return sk;

out_sock_release:
 kfree(listeners);
 netlink_kernel_release(sk);
 return NULL;

out_sock_release_nosk:
 sock_release(sock);
 return NULL;
}
EXPORT_SYMBOL(netlink_kernel_create);
1.1 創建socket結構
/*
 * sock_create_lite - allocate a bare struct socket of the given type,
 * without binding it to a file descriptor.
 * On success returns 0 and stores the socket in *res.
 */
int sock_create_lite(int family, int type, int protocol, struct socket **res)
{
 int err;
 struct socket *sock = NULL;

 err = security_socket_create(family, type, protocol, 1); /* LSM hook; no-op by default */
 if (err)
  goto out;

 sock = sock_alloc();
 if (!sock) {
  err = -ENOMEM;
  goto out;
 }

 sock->type = type;
 err = security_socket_post_create(sock, family, type, protocol, 1); /* LSM hook; no-op by default */
 if (err)
  goto out_release;

out:
 *res = sock;
 return err;
out_release:
 sock_release(sock);
 sock = NULL;
 goto out;
}
1.2 創建sock結構並初始化
/*
 * __netlink_create - allocate the struct sock for @sock and initialise
 * the embedded netlink_sock state (callback mutex, wait queue).
 */
static int __netlink_create(struct net *net, struct socket *sock,
       struct mutex *cb_mutex, int protocol)
{
 struct sock *sk;
 struct netlink_sock *nlk;

 sock->ops = &netlink_ops;

 sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); /* netlink_proto carries no state of interest here */
 if (!sk)
  return -ENOMEM;

 sock_init_data(sock, sk); /* generic sk/sock initialisation and cross-linking */

 nlk = nlk_sk(sk); /* the netlink_sock embedding this sock */
 if (cb_mutex)
  nlk->cb_mutex = cb_mutex;
 else {
  nlk->cb_mutex = &nlk->cb_def_mutex;
  mutex_init(nlk->cb_mutex);
 }
 init_waitqueue_head(&nlk->wait); /* senders sleep here when the receiver is full */

 sk->sk_destruct = netlink_sock_destruct;
 sk->sk_protocol = protocol;
 return 0;
}
1.2.1 sock結構和socket結構初始化過程
/*
 * sock_init_data - generic initialisation shared by all socket types:
 * queues, timers, default buffer sizes and default event callbacks.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
 skb_queue_head_init(&sk->sk_receive_queue);
 skb_queue_head_init(&sk->sk_write_queue);
 skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
 skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

 sk->sk_send_head = NULL;

 init_timer(&sk->sk_timer);

 sk->sk_allocation = GFP_KERNEL;
 sk->sk_rcvbuf  = sysctl_rmem_default;
 sk->sk_sndbuf  = sysctl_wmem_default;
 sk->sk_state  = TCP_CLOSE;
 sk_set_socket(sk, sock);

 sock_set_flag(sk, SOCK_ZAPPED);

 if (sock) {
  /* cross-link the sock and the socket */
  sk->sk_type = sock->type;
  sk->sk_sleep = &sock->wait;
  sock->sk = sk;
 } else
  sk->sk_sleep = NULL;

 rwlock_init(&sk->sk_dst_lock);
 rwlock_init(&sk->sk_callback_lock);
 lockdep_set_class_and_name(&sk->sk_callback_lock,
   af_callback_keys + sk->sk_family,
   af_family_clock_key_strings[sk->sk_family]);

 /* default callbacks; protocols such as netlink override some of these */
 sk->sk_state_change = sock_def_wakeup;
 sk->sk_data_ready = sock_def_readable;
 sk->sk_write_space = sock_def_write_space;
 sk->sk_error_report = sock_def_error_report;
 sk->sk_destruct  = sock_def_destruct;

 sk->sk_sndmsg_page = NULL;
 sk->sk_sndmsg_off = 0;

 sk->sk_peercred.pid  = 0;
 sk->sk_peercred.uid = -1;
 sk->sk_peercred.gid = -1;
 sk->sk_write_pending = 0;
 sk->sk_rcvlowat  = 1;
 sk->sk_rcvtimeo  = MAX_SCHEDULE_TIMEOUT;
 sk->sk_sndtimeo  = MAX_SCHEDULE_TIMEOUT;

 sk->sk_stamp = ktime_set(-1L, 0);

 /*
  * Before updating sk_refcnt, we must commit prior changes to memory
  * (Documentation/RCU/rculist_nulls.txt for details)
  */
 smp_wmb();
 atomic_set(&sk->sk_refcnt, 1);
 atomic_set(&sk->sk_drops, 0);
}
2 新建sk_buff結構,當內核態向用戶態發送信息時,必須得建新的sk_buff結構
/*
 * alloc_skb - allocate a plain sk_buff: not from the fclone cache and
 * with no NUMA node preference (node = -1).
 */
static inline struct sk_buff *alloc_skb(unsigned int size,
     gfp_t priority)
{
 return __alloc_skb(size, priority, 0, -1);
}
/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @fclone: allocate from fclone cache instead of head cache
 *  and allocate a cloned (child) skb
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of size bytes. The object has a reference count of one.
 * The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
       int fclone, int node)
{
 struct kmem_cache *cache;
 struct skb_shared_info *shinfo;
 struct sk_buff *skb;
 u8 *data;

 cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;

 /* Get the HEAD */
 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
 if (!skb)
  goto out;

 size = SKB_DATA_ALIGN(size);
 data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
   gfp_mask, node); /* start of the freshly allocated data area */
 if (!data)
  goto nodata;

 /*
  * Only clear those fields we need to clear, not those that we will
  * actually initialise below. Hence, don't put any more fields after
  * the tail pointer in struct sk_buff!
  */
 memset(skb, 0, offsetof(struct sk_buff, tail));
 skb->truesize = size + sizeof(struct sk_buff);
 atomic_set(&skb->users, 1);
 skb->head = data;
 skb->data = data;
 skb_reset_tail_pointer(skb); /* initially skb->tail == skb->data */
 skb->end = skb->tail + size;
 kmemcheck_annotate_bitfield(skb, flags1);
 kmemcheck_annotate_bitfield(skb, flags2);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
 skb->mac_header = ~0U;
#endif

 /* make sure we initialize shinfo sequentially */
 shinfo = skb_shinfo(skb); /* shared info lives at the end of the data area */
 atomic_set(&shinfo->dataref, 1);
 shinfo->nr_frags  = 0;
 shinfo->gso_size = 0;
 shinfo->gso_segs = 0;
 shinfo->gso_type = 0;
 shinfo->ip6_frag_id = 0;
 shinfo->tx_flags.flags = 0;
 skb_frag_list_init(skb);
 memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));

 if (fclone) {
  /* the fclone cache holds a parent/child pair plus a refcount */
  struct sk_buff *child = skb + 1;
  atomic_t *fclone_ref = (atomic_t *) (child + 1);

  kmemcheck_annotate_bitfield(child, flags1);
  kmemcheck_annotate_bitfield(child, flags2);
  skb->fclone = SKB_FCLONE_ORIG;
  atomic_set(fclone_ref, 1);

  child->fclone = SKB_FCLONE_UNAVAILABLE;
 }
out:
 return skb;
nodata:
 kmem_cache_free(cache, skb);
 skb = NULL;
 goto out;
}
3 釋放以使用完畢的sk_buff空間
/**
 * kfree_skb - free an sk_buff
 * @skb: buffer to free
 *
 * Drop a reference to the buffer and free it if the usage count has
 * hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
 if (unlikely(!skb))
  return;
 if (likely(atomic_read(&skb->users) == 1))
  smp_rmb(); /* last reference: skip the atomic decrement, only order reads before freeing */
 else if (likely(!atomic_dec_and_test(&skb->users)))
  return; /* other references remain; do not free */
 trace_kfree_skb(skb, __builtin_return_address(0));
 __kfree_skb(skb);
}
4 kernel向user發送的sk_buff信息中格式為: nlmsghdr+內容,所以在內核態分配好並初始化好sk_buff之後,可以利用nlmsg_put函數來填寫其nlmsghdr頭部[只是一些相關協議的信息,與地址等無關]
/**
 * nlmsg_put - Add a new netlink message to an skb
 * @skb: socket buffer to store message in
 * @pid: netlink process id; 0 when the kernel is the sender
 * @seq: sequence number of message
 * @type: message type
 * @payload: number of payload bytes the message will carry
 * @flags: message flags
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the message header and payload.
 */
static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq,
      int type, int payload, int flags)
{
 int needed = nlmsg_total_size(payload);

 if (unlikely(skb_tailroom(skb) < needed))
  return NULL;

 return __nlmsg_put(skb, pid, seq, type, payload, flags);
}
內核中填寫nlmsghdr頭部信息
/*
 * __nlmsg_put - reserve room in @skb and fill in the nlmsghdr fields.
 * Returns a pointer to the header just written.
 */
static __inline__ struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, int type, int len, int flags)
{
 struct nlmsghdr *nlh;
 int size = NLMSG_LENGTH(len); /* len plus the 4-byte-aligned nlmsghdr */

 nlh = (struct nlmsghdr*)skb_put(skb, NLMSG_ALIGN(size)); /* advances tail; returns the old tail */
 nlh->nlmsg_type = type;
 nlh->nlmsg_len = size;
 nlh->nlmsg_flags = flags;
 nlh->nlmsg_pid = pid;
 nlh->nlmsg_seq = seq;
 if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
  memset(NLMSG_DATA(nlh) + len, 0, NLMSG_ALIGN(size) - size); /* zero the alignment padding */
 return nlh;
}
/*
 * skb_put - extend the data area of @skb by @len bytes at the tail.
 * Returns a pointer to the first of the newly added bytes; panics when
 * the buffer would overrun its end.
 */
unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
{
 unsigned char *old_tail = skb_tail_pointer(skb);

 SKB_LINEAR_ASSERT(skb);
 skb->len  += len;
 skb->tail += len;
 if (unlikely(skb->tail > skb->end))
  skb_over_panic(skb, len, __builtin_return_address(0));
 return old_tail;
}
5 內核態向用戶態發送信息的函數,pid指向用戶態的進程
/*
 * netlink_unicast - deliver @skb from @ssk to the socket bound to @pid.
 * The destination may be a user-space socket or another kernel socket.
 * Returns the number of bytes delivered or a negative error.
 */
int netlink_unicast(struct sock *ssk, struct sk_buff *skb,u32 pid, int nonblock)
{
 struct sock *sk;
 int err;
 long timeo;

 skb = netlink_trim(skb, gfp_any());

 timeo = sock_sndtimeo(ssk, nonblock);
retry:
 /*
  * Look up the destination sock for this protocol/pid in the global
  * hash table; the peer may live in user space or in the kernel.
  */
 sk = netlink_getsockbypid(ssk, pid);
 if (IS_ERR(sk)) {
  kfree_skb(skb);
  return PTR_ERR(sk);
 }
 if (netlink_is_kernel(sk)) /* kernel peer: hand the skb to its input handler */
  return netlink_unicast_kernel(sk, skb);

 if (sk_filter(sk, skb)) {
  err = skb->len;
  kfree_skb(skb);
  sock_put(sk);
  return err;
 }

 err = netlink_attachskb(sk, skb, &timeo, ssk);
 if (err == 1)
  goto retry; /* waited for receive-buffer space; look the socket up again */
 if (err)
  return err;

 /* queue on the peer's receive queue and wake it via sk_data_ready */
 return netlink_sendskb(sk, skb);
}
EXPORT_SYMBOL(netlink_unicast);
5-0 netlink_getsockbypid
/*
 * netlink_getsockbypid - resolve @pid to a sock of the same protocol as
 * @ssk. Returns the sock with a reference held, or ERR_PTR(-ECONNREFUSED).
 */
static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid)
{
 struct netlink_sock *nlk;
 struct sock *sock;

 sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, pid);
 if (sock == NULL)
  return ERR_PTR(-ECONNREFUSED);

 /* a connected destination only accepts traffic from its own peer */
 nlk = nlk_sk(sock);
 if (sock->sk_state == NETLINK_CONNECTED &&
     nlk->dst_pid != nlk_sk(ssk)->pid) {
  sock_put(sock);
  return ERR_PTR(-ECONNREFUSED);
 }
 return sock;
}
查找主函數:
/*
 * netlink_lookup - find the sock bound to @pid within @protocol's hash
 * table; takes a reference on it before returning, or returns NULL.
 */
static inline struct sock *netlink_lookup(struct net *net, int protocol, /* protocol number, possibly user-defined */
       u32 pid)
{
 struct nl_pid_hash *hash = &nl_table[protocol].hash; /* per-protocol hash table head */
 struct hlist_head *head;
 struct sock *sk;
 struct hlist_node *node;

 read_lock(&nl_table_lock);
 head = nl_pid_hashfn(hash, pid); /* bucket chain selected by hashing pid */
 sk_for_each(sk, node, head) { /* scan the chain for a matching net+pid */
  if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->pid == pid)) {
   sock_hold(sk);
   goto found;
  }
 }
 sk = NULL;
found:
 read_unlock(&nl_table_lock);
 return sk;
}
5-1  將sock和sk_buff綁定在一起
/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not send to the destination, just all
 * all error checks are performed and memory in the queue is reserved.
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
        long *timeo, struct sock *ssk)
{
 struct netlink_sock *nlk;

 nlk = nlk_sk(sk);
如果sock的接受緩沖區小於sock中接受隊列已提交的字節數,或者還有數據數據未被上層處理
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||test_bit(0, &nlk->state)) {
  DECLARE_WAITQUEUE(wait, current); 聲明等待隊列
  if (!*timeo) {
   if (!ssk || netlink_is_kernel(ssk))
       netlink_overrun(sk);
   sock_put(sk);
   kfree_skb(skb);
   return -EAGAIN;
  }

  __set_current_state(TASK_INTERRUPTIBLE);
  add_wait_queue(&nlk->wait, &wait);  設置自身狀態,等待睡眠

  if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
       test_bit(0, &nlk->state)) &&
      !sock_flag(sk, SOCK_DEAD))
   *timeo = schedule_timeout(*timeo);

  __set_current_state(TASK_RUNNING);
  remove_wait_queue(&nlk->wait, &wait);
  sock_put(sk);遞減sk引用次數

  if (signal_pending(current)) {
   kfree_skb(skb);
   return sock_intr_errno(*timeo);
  }
  return 1; //重新將sk_buff放在客戶端sock上
 }
 skb_set_owner_r(skb, sk);
 return 0;
}
改變sock->sk_rmem_alloc大小,以及skb->sk變量
/*
 * skb_set_owner_r - make @sk the owner of @skb (after detaching any
 * previous owner) and charge skb->truesize to @sk's receive accounting.
 */
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
 skb_orphan(skb);
 skb->sk = sk;
 skb->destructor = sock_rfree;
 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
 sk_mem_charge(sk, skb->truesize);
}
5.2 內核態將數據發送出去
/*
 * netlink_sendskb - queue @skb on @sk's receive queue, notify the
 * receiver and drop our reference. Returns the queued length.
 */
int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
 int len = skb->len;

 skb_queue_tail(&sk->sk_receive_queue, skb);
 sk->sk_data_ready(sk, len); /* wake any process waiting to receive */
 sock_put(sk);
 return len;
}
6 用戶態接受消息
此時內核態將數據發送到用戶態對應的sock的sk_receive_queue中,並且喚醒睡眠的進程
/*
 * netlink_recvmsg - recvmsg() entry for netlink sockets: dequeue one
 * datagram from sk_receive_queue, copy its payload into msg->msg_iov
 * and, when requested, fill msg_name with the sender's address.
 */
static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
      struct msghdr *msg, size_t len,
      int flags)
{
 struct sock_iocb *siocb = kiocb_to_siocb(kiocb); /* socket i/o control block */
 struct scm_cookie scm;
 struct sock *sk = sock->sk; /* the receiving socket */
 struct netlink_sock *nlk = nlk_sk(sk);
 int noblock = flags&MSG_DONTWAIT;
 size_t copied;
 struct sk_buff *skb, *data_skb;
 int err;

 if (flags&MSG_OOB)
  return -EOPNOTSUPP;

 copied = 0;

 skb = skb_recv_datagram(sk, flags, noblock, &err); /* take one datagram off the queue */
 if (skb == NULL)
  goto out;

 data_skb = skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
 if (unlikely(skb_shinfo(skb)->frag_list)) {
  /*
   * If this skb has a frag_list, then here that means that we
   * will have to use the frag_list skb's data for compat tasks
   * and the regular skb's data for normal (non-compat) tasks.
   *
   * If we need to send the compat skb, assign it to the
   * 'data_skb' variable so that it will be used below for data
   * copying. We keep 'skb' for everything else, including
   * freeing both later.
   */
  if (flags & MSG_CMSG_COMPAT)
   data_skb = skb_shinfo(skb)->frag_list;
 }
#endif

 msg->msg_namelen = 0;

 copied = data_skb->len; /* bytes available in this datagram */
 if (len < copied) { /* caller's buffer is smaller: truncate the remainder */
  msg->msg_flags |= MSG_TRUNC;
  copied = len;
 }

 skb_reset_transport_header(data_skb);
 err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); /* copy payload into msg_iov */

 if (msg->msg_name) { /* caller wants the sender's netlink address */
  struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name;
  addr->nl_family = AF_NETLINK;
  addr->nl_pad    = 0;
  addr->nl_pid = NETLINK_CB(skb).pid; /* sender's port id */
  addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); /* destination group mask */
  msg->msg_namelen = sizeof(*addr);
 }

 /* if requested, pass packet info to user space as a control message */
 if (nlk->flags & NETLINK_RECV_PKTINFO)
  netlink_cmsg_recv_pktinfo(msg, skb);

 if (NULL == siocb->scm) {
  memset(&scm, 0, sizeof(scm));
  siocb->scm = &scm;
 }
 siocb->scm->creds = *NETLINK_CREDS(skb);
 if (flags & MSG_TRUNC)
  copied = data_skb->len;

 skb_free_datagram(sk, skb); /* done with this datagram */

 if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
  netlink_dump(sk);

 scm_recv(sock, msg, siocb->scm, flags);
out:
 netlink_rcv_wake(sk); /* queue drained a bit; wake blocked senders */
 return err ? : copied;
}
6.1 從sock->sk_receive_buf中摘除一個sk_buff結構
從sock等待隊列中摘除一個sk_buff結構:
/*
 * __skb_recv_datagram - take one sk_buff from @sk's receive queue,
 * optionally peeking (MSG_PEEK) instead of unlinking; blocks up to the
 * socket's receive timeout when the queue is empty.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
        int *peeked, int *err)
{
 struct sk_buff *skb;
 long timeo;
 /*
  * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
  */
 int error = sock_error(sk);

 if (error)
  goto no_packet;

 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

 do {
  /* Again only user level code calls this function, so nothing
   * interrupt level will suddenly eat the receive_queue.
   *
   * Look at current nfs client by the way...
   * However, this function was correct in any case. 8)
   */
  unsigned long cpu_flags;

  spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
  skb = skb_peek(&sk->sk_receive_queue);
  if (skb) {
   *peeked = skb->peeked;
   if (flags & MSG_PEEK) {
    /* peek: leave the skb queued, just take a reference */
    skb->peeked = 1;
    atomic_inc(&skb->users);
   } else
    __skb_unlink(skb, &sk->sk_receive_queue);
  }
  spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);

  if (skb)
   return skb;

  /* User doesn't want to wait */
  error = -EAGAIN;
  if (!timeo)
   goto no_packet;

 } while (!wait_for_packet(sk, err, &timeo));

 return NULL;

no_packet:
 *err = error;
 return NULL;
}
摘除過程:
/*
 * __skb_unlink - remove @skb from @list: splice its neighbours together
 * and clear the skb's own links. Caller holds the queue lock.
 */
static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
 struct sk_buff *after = skb->next;
 struct sk_buff *before = skb->prev;

 list->qlen--;
 before->next = after;
 after->prev = before;
 skb->next = skb->prev = NULL;
}
6.2 將該sk_buff內容轉換為msghdr結構
內核態采用netlink_unicast函數發送數據,數據結構存儲為:nlmsghdr+data,內核發送即以這種方式組sk_buff
而用戶態采用sendmsg等格式,發送數據方式不是sk_buff,而是msghdr格式,該格式定義如下:
/*
 * As we do 4.4BSD message passing we use a 4.4BSD message passing
 * system, not 4.3. Thus msg_accrights(len) are now missing. They
 * belong in an obscure libc emulation or the bin.
 */
struct msghdr {  位於linux/socket.h頭文件中
 void * msg_name; /* Socket name   */  一般存儲對方(發送方的地址信息,即netlink_nl信息)
 int  msg_namelen; /* Length of name  */
 struct iovec * msg_iov; /* Data blocks   */ 存儲來自對方的實際數據信息
 __kernel_size_t msg_iovlen; /* Number of blocks  */
 void  * msg_control; /* Per protocol magic (eg BSD file descriptor passing) */
 __kernel_size_t msg_controllen; /* Length of cmsg list */
 unsigned msg_flags;
};
該數據分組結構單元定義如下:
/* one scatter/gather element: a user-space base pointer plus a length */
struct iovec
{
 void __user *      iov_base; /* BSD uses caddr_t (1003.1g requires void *) */
 __kernel_size_t   iov_len; /* Must be size_t (1003.1g) */
};
轉換函數如下定義:
/**
 * skb_copy_datagram_iovec - Copy a datagram to an iovec.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying from
 * @to: io vector to copy to
 * @len: amount of data to copy from buffer to iovec
 *
 * Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
       struct iovec *to, int len)
{
 int start = skb_headlen(skb);
 int i, copy = start - offset;
 struct sk_buff *frag_iter;

 trace_skb_copy_datagram_iovec(skb, len);

 /* Copy header. */
 if (copy > 0) {
  if (copy > len)
   copy = len;
  if (memcpy_toiovec(to, skb->data + offset, copy))
   goto fault;
  if ((len -= copy) == 0)
   return 0;
  offset += copy;
 }

 /* Copy paged appendix. Hmm... why does this look so complicated? */
 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  int end;

  WARN_ON(start > offset + len);

  end = start + skb_shinfo(skb)->frags[i].size;
  if ((copy = end - offset) > 0) {
   int err;
   u8  *vaddr;
   skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
   struct page *page = frag->page;

   if (copy > len)
    copy = len;
   /* map the fragment page before copying out of it */
   vaddr = kmap(page);
   err = memcpy_toiovec(to, vaddr + frag->page_offset +
          offset - start, copy);
   kunmap(page);
   if (err)
    goto fault;
   if (!(len -= copy))
    return 0;
   offset += copy;
  }
  start = end;
 }

 /* finally, recurse into any chained fragment skbs */
 skb_walk_frags(skb, frag_iter) {
  int end;

  WARN_ON(start > offset + len);

  end = start + frag_iter->len;
  if ((copy = end - offset) > 0) {
   if (copy > len)
    copy = len;
   if (skb_copy_datagram_iovec(frag_iter,
         offset - start,
         to, copy))
    goto fault;
   if ((len -= copy) == 0)
    return 0;
   offset += copy;
  }
  start = end;
 }
 if (!len)
  return 0;

fault:
 return -EFAULT;
}
7 內核態接收用戶態發送的數據
/*
 * netlink_sendmsg - sendmsg() entry for netlink sockets: copy the
 * caller's msghdr payload into a freshly allocated sk_buff, record the
 * sender info in NETLINK_CB(skb), then deliver by unicast (plus
 * broadcast when a destination group is set).
 */
static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
      struct msghdr *msg, size_t len)
{
 struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
 struct sock *sk = sock->sk;
 struct netlink_sock *nlk = nlk_sk(sk);
 struct sockaddr_nl *addr = msg->msg_name; /* destination address, if supplied */
 u32 dst_pid;
 u32 dst_group;
 struct sk_buff *skb;
 int err;
 struct scm_cookie scm;

 if (msg->msg_flags&MSG_OOB)
  return -EOPNOTSUPP;

 if (NULL == siocb->scm)
  siocb->scm = &scm;
 err = scm_send(sock, msg, siocb->scm);
 if (err < 0)
  return err;

 if (msg->msg_namelen) {
  /* explicit destination in msg_name */
  if (addr->nl_family != AF_NETLINK)
   return -EINVAL;
  dst_pid = addr->nl_pid;
  dst_group = ffs(addr->nl_groups);
  if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
   return -EPERM;
 } else {
  /* fall back to the connected peer */
  dst_pid = nlk->dst_pid;
  dst_group = nlk->dst_group;
 }

 if (!nlk->pid) {
  err = netlink_autobind(sock); /* not yet bound: pick a port id */
  if (err)
   goto out;
 }

 err = -EMSGSIZE;
 if (len > sk->sk_sndbuf - 32)
  goto out;
 err = -ENOBUFS;
 skb = alloc_skb(len, GFP_KERNEL); /* sk_buff that will carry the msghdr payload */
 if (skb == NULL)
  goto out;

 NETLINK_CB(skb).pid = nlk->pid; /* record the sender's port id */
 NETLINK_CB(skb).dst_group = dst_group;
 NETLINK_CB(skb).loginuid = audit_get_loginuid(current);
 NETLINK_CB(skb).sessionid = audit_get_sessionid(current);
 security_task_getsecid(current, &(NETLINK_CB(skb).sid));
 memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));

 /* What can I do? Netlink is asynchronous, so that
    we will have to save current capabilities to
    check them, when this message will be delivered
    to corresponding kernel module.   --ANK (980802)
  */

 err = -EFAULT;
 if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { /* copy payload into the skb */
  kfree_skb(skb);
  goto out;
 }

 err = security_netlink_send(sk, skb);
 if (err) {
  kfree_skb(skb);
  goto out;
 }

 if (dst_group) {
  atomic_inc(&skb->users);
  netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
 }
 err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);

out:
 return err;
}

Copyright © Linux教程網 All Rights Reserved