您现在的位置： Linux教程網 >> UnixLinux > >> Linux基礎 >> 關於Linux

容器之namespace

1. 介紹

簡單玩了下Linux kernel為容器技術提供的基礎設施之一namespace(另一個是cgroups)，包括uts/user/pid/mnt/ipc/net六個(3.13.0的內核). 這東西主要用來做資源的隔離，我感覺本質上是全局資源的映射，映射之間獨立了自然隔離了。主要涉及到的東西是:

clone setns unshare /proc/pid/ns, /proc/pid/uid_map, /proc/pid/gid_map等

2. 測試流程及代碼

下面是一些簡單的例子，主要測試uts/pid/user/mnt四個namespace的效果，測試代碼主要用到三個進程，一個是clone系統調用執行/bin/bash後的進程，也是生成新的子namespace的初始進程，然後是打開/proc/pid/ns下的namespace鏈接文件，用setns將第二個可執行文件的進程加入/bin/bash的進程的namespace(容器)，並讓其fork出一個子進程，測試pid namespace的差異。值得注意的幾個點:

- 不同版本的內核setns和unshare對namespace的支持不一樣，較老的內核可能只支持ipc/net/uts三個namespace
- 某個進程創建後其pid namespace就固定了，使用setns和unshare改變後，其本身的pid namespace不會改變，只有fork出的子進程的pid namespace改變
- 用setns加入的pid namespace必須是此進程當前pid namespace的後代namespace
- 用setns添加mnt namespace應該放在其他namespace之後，否則可能出現無法打開/proc/pid/ns/…的錯誤

// 代碼1: 開一些新的namespace(啟動新容器)
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

/*
 * Report a failed call and terminate: perror() prints msg followed by the
 * text for the current errno, then the process exits with EXIT_FAILURE.
 * Wrapped in do/while(0) so it behaves as a single statement.
 */
#define errExit(msg)      \
  do {                    \
    perror(msg);          \
    exit(EXIT_FAILURE);   \
  } while (0)

/*
 * Start function for the cloned child: replaces the child's process image
 * with /bin/bash.
 *
 * arg: unused; present only because clone() requires this signature.
 *
 * Does not return on success. If the exec fails, errExit() reports the
 * error and terminates the child, so the final return is never reached in
 * practice and exists only to satisfy the signature.
 */
static int childFunc(void *arg)
{
  const char *binary = "/bin/bash";
  char *const argv[] = { "/bin/bash", NULL };

  (void) arg;

  /* execv() passes on the caller's environment, so no envp array is needed. */
  execv(binary, argv);

  /* execv() returns only when it has failed. */
  errExit("execve error");
  return -1;
}

#define STACK_SIZE (1024 * 1024)    /* Stack size for cloned child */

/*
 * Runs childFunc in new UTS, mount, PID, user and IPC namespaces via
 * clone(), prints the child's pid, and waits for the child to terminate.
 *
 * Exits with EXIT_SUCCESS once the child has been reaped; any failing call
 * ends the program through errExit().
 */
int main(int argc, char *argv[])
{
  char *stack;
  char *stackTop;
  pid_t pid;

  (void) argc;
  (void) argv;

  stack = malloc(STACK_SIZE);
  if (stack == NULL)
    errExit("malloc");
  stackTop = stack + STACK_SIZE;  /* Assume stack grows downward */

  /* OR in CLONE_NEWNET as well to give the child its own network namespace. */
  pid = clone(childFunc, stackTop,
              CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWUSER |
              CLONE_NEWIPC | SIGCHLD, NULL);
  if (pid == -1)
    errExit("clone");
  printf("clone() returned %ld\n", (long) pid);

  if (waitpid(pid, NULL, 0) == -1)
    errExit("waitpid");
  printf("child has terminated\n");

  /* The child has been reaped, so its stack is no longer in use. */
  free(stack);
  exit(EXIT_SUCCESS);
}

// 代碼2: 使用setns加入新進程
#define _GNU_SOURCE  // ?
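#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <unistd.h>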

// mainly setns and unshare system calls

/* int setns(int fd, int nstype); */

// 不同版本內核/proc/pid/ns下namespace文件情況
/*
   CLONE_NEWCGROUP (since Linux 4.6)
   fd must refer to a cgroup namespace.

   CLONE_NEWIPC (since Linux 3.0)
   fd must refer to an IPC namespace.

   CLONE_NEWNET (since Linux 3.0)
   fd must refer to a network namespace.

   CLONE_NEWNS (since Linux 3.8)
   fd must refer to a mount namespace.

   CLONE_NEWPID (since Linux 3.8)
   fd must refer to a descendant PID namespace.

   CLONE_NEWUSER (since Linux 3.8)
   fd must refer to a user namespace.

   CLONE_NEWUTS (since Linux 3.0)
   fd must refer to a UTS namespace.
   */

/* // 特殊的pid namespace 
   CLONE_NEWPID behaves somewhat differently from the other nstype
values: reassociating the calling thread with a PID namespace changes
only the PID namespace that child processes of the caller will be
created in; it does not change the PID namespace of the caller
itself.  Reassociating with a PID namespace is allowed only if the
PID namespace specified by fd is a descendant (child, grandchild,
etc.)  of the PID namespace of the caller.  For further details on
PID namespaces, see pid_namespaces(7).
*/


/*
int unshare(int flags);
CLONE_FILES | CLONE_FS | CLONE_NEWCGROUP | CLONE_NEWIPC | CLONE_NEWNET 
| CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWUSER | CLONE_NEWUTS | CLONE_SYSVSEM
*/



/* Size of the buffers that hold "/proc/<pid>/ns/<name>" paths. */
#define MAX_PROCPATH_LEN 1024

/*
 * Print msg together with the source file and line of the call site to
 * stderr, then terminate the process with EXIT_FAILURE.
 */
#define errorExit(msg) \
  do { fprintf(stderr, "%s in file %s in line %d\n", (msg), __FILE__, __LINE__);\
    exit(EXIT_FAILURE); } while (0)

/* Print the calling process's pid, uid, gid and hostname to stdout. */
void printInfo(void);
/* Open the namespace file at path and join that namespace with setns(). */
int openAndSetns(const char *path);

/*
 * Joins the namespaces of the process whose pid is given in argv[1], one
 * namespace at a time, printing this process's pid/uid/gid/hostname after
 * each step. It then forks so the child's view can be printed as well.
 *
 * The child sleeps forever and the parent blocks in waitpid() on it, so
 * both processes stay alive for as long as the child does.
 *
 * Returns 0 after printing a usage line when no pid argument is given, and
 * 0 once the child has been reaped. Failures terminate the program through
 * errorExit().
 */
int main(int argc, char *argv[])
{
  /*
   * label: text shown in the progress banner.
   * file:  entry name under /proc/<pid>/ns.
   *
   * pid: setns() on a pid namespace only affects children created
   * afterwards; this process itself stays in its original pid namespace.
   *
   * mnt is deliberately last: once the mount namespace has changed, the
   * /proc/<pid>/ns files may no longer be reachable.
   */
  static const struct {
    const char *label;
    const char *file;
  } steps[] = {
    { "uts",   "uts"  },
    { "user",  "user" },
    { "pid",   "pid"  },
    { "ipc",   "ipc"  },
    { "net",   "net"  },
    { "mount", "mnt"  },
  };
  char path[MAX_PROCPATH_LEN];
  size_t i;
  pid_t child;

  if (argc < 2) {
    fprintf(stdout, "usage : execname pid(find namespaces of this process)\n");
    return 0;
  }
  printInfo();

  for (i = 0; i < sizeof steps / sizeof steps[0]; i++) {
    int len;

    fprintf(stdout, "---- setns for %s ----\n", steps[i].label);
    len = snprintf(path, sizeof path, "/proc/%s/ns/%s", argv[1], steps[i].file);
    if (len < 0 || (size_t) len >= sizeof path)
      errorExit("namespace path too long");
    openAndSetns(path);
    printInfo();
  }

  /*
   * Flush before fork(): otherwise text still sitting in the stdio buffer
   * (e.g. when stdout is a pipe or file) is copied into the child and
   * written twice.
   */
  fflush(stdout);

  /* Fork to observe the pid namespace the child ends up in. */
  child = fork();
  if (child == -1) {
    errorExit("failed to fork");
  } else if (child == 0) {
    fprintf(stdout, "********\n");
    fprintf(stdout, "in child process\n");
    printInfo();
    fprintf(stdout, "********\n");
    fflush(stdout);
    for (;;) {
      sleep(5);
    }
  }

  fprintf(stdout, "child pid : %d\n", (int) child);
  fflush(stdout);

  /* Blocks while the child is alive; reaps it if it is ever killed. */
  waitpid(child, NULL, 0);
  return 0;
}

/*
 * Prints the calling process's pid, real uid, real gid and hostname
 * (the nodename reported by uname()) to stdout, one per line.
 *
 * If uname() fails, the error is reported with perror() and the hostname
 * line is omitted rather than printing an uninitialized buffer.
 */
void printInfo(void)
{
  struct utsname uts;

  /* pid_t/uid_t/gid_t have no printf conversion of their own, so cast
   * explicitly; uid_t and gid_t are unsigned. */
  fprintf(stdout, "pid : %ld\n", (long) getpid());
  fprintf(stdout, "uid : %lu\n", (unsigned long) getuid());
  fprintf(stdout, "gid : %lu\n", (unsigned long) getgid());

  if (uname(&uts) == -1) {
    perror("uname");
    return;
  }
  fprintf(stdout, "hostname : %s\n", uts.nodename);
}

/*
 * Opens the namespace file at path (e.g. /proc/<pid>/ns/uts) and moves the
 * calling process into that namespace with setns(fd, 0); nstype 0 accepts
 * any namespace type.
 *
 * path: path of the namespace file to join.
 *
 * Returns 0 on success. If open() or setns() fails, the errno text is
 * printed to stderr and the process is terminated through errorExit().
 */
int openAndSetns(const char *path)
{
  int fd = open(path, O_RDONLY);
  if (fd == -1) {
    fprintf(stderr, "%s\n", strerror(errno));
    errorExit("failed to open fd");
  }

  if (setns(fd, 0) == -1) {
    fprintf(stderr, "%s\n", strerror(errno));
    errorExit("failed to setns");
  }

  /* The descriptor is only needed for the setns() call itself; close it
   * so repeated calls do not leak one fd each. */
  close(fd);
  return 0;
}

3. 測試效果

user的效果 : 通過/proc/pid/uid_map和/proc/pid/gid_map設置container外用戶id和容器內用戶id的映射關係(把這放前面是因為後面hostname和mount需要權限…)
這裡寫圖片描述

uts的效果 : 改變container中的hostname不會影響container外面的hostname
這裡寫圖片描述 (http://www.2cto.com/uploadfile/Collfiles/20160611/20160611095436693.png)

pid和mnt的效果 : container中進程id被重新映射，在container中重新掛載/proc filesystem不會影響容器外的/proc
這裡寫圖片描述

setns的測試

依次為init進程，container init進程(6個namespace的flag都指定了)，新加入container的進程以及其fork出的子進程的namespace情況，可以看到container init進程與init進程的namespace完全不同了，新加入container的進程除了pid與init相同外，其他namespace與container init進程相同，而新加入container的進程fork出的子進程的namespace則與container init進程完全相同
這裡寫圖片描述