
A brief look at memcached

Installation

memcached is a fairly mature and stable caching middleware, mainly used to speed up data access through caching and to take load off the database.

# terminal a
yum install automake libevent-devel -y
git clone https://github.com/memcached/memcached.git
cd memcached
./autogen.sh
./configure
make
yum install yum-utils
vim /etc/yum.repos.d/CentOS-Debuginfo.repo
# change enable=0 to enable=1
debuginfo-install glibc-2.28-127.el8.x86_64 libevent-2.1.8-5.el8.x86_64 openssl-libs-1.1.1g-12.el8_3.x86_64 sssd-client-2.3.0-9.el8.x86_64 zlib-1.2.11-16.el8_2.x86_64
gdb --args ./memcached-debug -uroot
b memcached.c:dispatch_bin_command
# terminal b
telnet 127.0.0.1 11211
set runoob 0 3600 9
memcached
get runoob
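
For reference, a successful session in terminal b replies roughly like this (STORED for the set, then VALUE/END for the get; these are the standard text-protocol responses):

set runoob 0 3600 9
memcached
STORED
get runoob
VALUE runoob 0 9
memcached
END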

The last few steps, installing the debuginfo packages, took a while; the main culprit was that enable had not been changed to 1.

A quick debugging session

memcached stores key-value pairs in the structure below, which carries a next pointer for the chained hash table plus prev and next pointers for the LRU lists:

/**
 * Structure for storing items within memcached.
 */
typedef struct _stritem {
    /* Protected by LRU locks */
    struct _stritem *next;   /* next item in the LRU queue */
    struct _stritem *prev;   /* previous item in the LRU queue */
    /* Rest are protected by an item lock */
    struct _stritem *h_next; /* next item in the hash chain */
    rel_time_t time;         /* least recent access */
    rel_time_t exptime;      /* expire time */
    int nbytes;              /* size of data */
    unsigned short refcount; /* reference count of this item */
    uint8_t nsuffix;         /* length of the " flags length" suffix */
    uint8_t it_flags;        /* ITEM_* flags */
    uint8_t slabs_clsid;     /* which slab class we're in */
    uint8_t nkey;            /* key length */
    union {
        uint64_t cas;
        char end;
    } data[];
    /* if it_flags & ITEM_CAS we have 8 bytes CAS */
    /* then null-terminated key */
    /* then " flags length\r\n" (no terminating null) */
    /* then data with terminating \r\n (no terminating null; it's binary!) */
} item;
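
The key, suffix, and value all live in the variable-length tail after data[]; the ITEM_* macros in memcached.h compute their offsets, roughly like this (simplified from the source, ignoring chunked items):

/* offset helpers, simplified from memcached.h */
#define ITEM_key(item)    (((char*)&((item)->data)) \
        + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
#define ITEM_suffix(item) ((char*)&((item)->data) + (item)->nkey + 1 \
        + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
#define ITEM_data(item)   ((char*)&((item)->data) + (item)->nkey + 1 \
        + (item)->nsuffix \
        + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))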

Executing a get produces the following call stack:

#0  assoc_find (key=key@entry=0x7ffff0010e54 "runoob", nkey=nkey@entry=6, hv=hv@entry=2211139334) at assoc.c:88
#1 0x000000000041d31c in do_item_get (key=key@entry=0x7ffff0010e54 "runoob", nkey=nkey@entry=6, hv=hv@entry=2211139334,
c=c@entry=0x7ffff0010c40, do_update=do_update@entry=true) at items.c:955
#2 0x0000000000421664 in item_get (key=key@entry=0x7ffff0010e54 "runoob", nkey=nkey@entry=6, c=c@entry=0x7ffff0010c40,
do_update=do_update@entry=true) at thread.c:564
#3 0x000000000040a55c in limited_get (should_touch=false, exptime=0, c=0x7ffff0010c40, nkey=6, key=<optimized out>)
at memcached.c:3545
#4 process_get_command (c=c@entry=0x7ffff0010c40, tokens=tokens@entry=0x7ffff530cc00, return_cas=return_cas@entry=false,
should_touch=should_touch@entry=false, ntokens=<optimized out>) at memcached.c:3839
#5 0x0000000000411706 in process_command (c=c@entry=0x7ffff0010c40, command=0x7ffff0010e50 "get") at memcached.c:4656
#6 0x0000000000414248 in try_read_command (c=0x7ffff0010c40) at memcached.c:5062
#7 drive_machine (c=0x7ffff0010c40) at memcached.c:5498
#8 0x00000000004161a1 in event_handler (fd=<optimized out>, which=<optimized out>, arg=0x7ffff0010c40) at memcached.c:5780
#9 0x00007ffff7950ff1 in event_persist_closure (ev=<optimized out>, base=0x669b10) at event.c:1580
#10 event_process_active_single_queue (base=base@entry=0x669b10, activeq=0x669f40,
max_to_process=max_to_process@entry=2147483647, endtime=endtime@entry=0x0) at event.c:1639
#11 0x00007ffff7951787 in event_process_active (base=0x669b10) at event.c:1738
#12 event_base_loop (base=0x669b10, flags=flags@entry=0) at event.c:1961
#13 0x0000000000420f37 in worker_libevent (arg=0x663560) at thread.c:387
#14 0x00007ffff771414a in start_thread (arg=<optimized out>) at pthread_create.c:479
#15 0x00007ffff7445f23 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Executing a set eventually calls this function in items.c:

int do_item_link(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
    it->it_flags |= ITEM_LINKED;
    it->time = current_time;

    STATS_LOCK();
    stats_state.curr_bytes += ITEM_ntotal(it);
    stats_state.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    /* Allocate a new CAS ID on link. */
    ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
    assoc_insert(it, hv);
    item_link_q(it);
    refcount_incr(it);
    item_stats_sizes_add(it);

    return 1;
}

assoc_insert inserts the item into the hash table, and item_link_q links the item into its LRU list, roughly as pictured below:
[figure: memcached hash table and LRU linked list]
The four cache types (also called lru_segments) move between states according to the following state machine:
[figure: memcached LRU state machine]
Note that lru_maintainer_thread must be enabled for this multi-segment LRU policy to run; the details are in items.c:lru_maintainer_thread.
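
The segment is folded into the LRU list id, so every slab class gets one list per segment; a short sketch with the constants from the memcached source (values may differ across versions):

/* LRU segment ids live in the upper bits of the list id */
#define HOT_LRU    0
#define WARM_LRU  64
#define COLD_LRU 128
#define TEMP_LRU 192

/* an item's list id is its slab class OR'd with the segment,
 * e.g. the COLD list of slab class 5 is 5 | COLD_LRU == 133 */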
On the regular allocation path, memcached first tries to reclaim memory from the COLD_LRU segment:

item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
    item *it = NULL;
    int i;
    /* If no memory is available, attempt a direct LRU juggle/eviction */
    /* This is a race in order to simplify lru_pull_tail; in cases where
     * locked items are on the tail, you want them to fall out and cause
     * occasional OOM's, rather than internally work around them.
     * This also gives one fewer code path for slab alloc/free
     */
    for (i = 0; i < 10; i++) {
        uint64_t total_bytes;
        /* Try to reclaim memory first */
        if (!settings.lru_segmented) {
            lru_pull_tail(id, COLD_LRU, 0, 0, 0, NULL);
        }
        it = slabs_alloc(ntotal, id, &total_bytes, 0);

        if (settings.temp_lru)
            total_bytes -= temp_lru_size(id);

        if (it == NULL) {
            // evict from the tail of the COLD LRU
            if (lru_pull_tail(id, COLD_LRU, total_bytes, LRU_PULL_EVICT, 0, NULL) <= 0) {
                if (settings.lru_segmented) {
                    // fall back to the tail of the HOT LRU
                    lru_pull_tail(id, HOT_LRU, total_bytes, 0, 0, NULL);
                } else {
                    break;
                }
            }
        } else {
            break;
        }
    }

    if (i > 0) {
        pthread_mutex_lock(&lru_locks[id]);
        itemstats[id].direct_reclaims += i;
        pthread_mutex_unlock(&lru_locks[id]);
    }

    return it;
}

Core ideas

Each allocation is mapped to a slab class by its requested size. The starting chunk size is 92 bytes (in this build), the maximum is 512KB, each class is 1.25x the previous one, and there are 64 slab classes in total. For example, a 50-byte allocation is served by the 92-byte slab class. The structure looks like this:

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];

typedef struct {
    unsigned int size;      /* sizes of items */
    unsigned int perslab;   /* how many items per slab */

    void *slots;            /* list of free item ptrs */
    unsigned int sl_curr;   /* total free items in list */

    unsigned int slabs;     /* how many slabs were allocated for this class */

    void **slab_list;       /* array of pointers to the slab pages of this class */
    unsigned int list_size; /* capacity of the slab_list array */

    size_t requested;       /* the number of requested bytes */
} slabclass_t;
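
Mapping a requested size to a class is a linear scan up the size ladder; a simplified sketch of slabs.c:slabs_clsid (statistics and edge cases trimmed):

unsigned int slabs_clsid(const size_t size) {
    int res = POWER_SMALLEST;           /* the smallest valid class id */

    if (size == 0 || size > settings.item_size_max)
        return 0;                       /* 0 means no fitting class */
    while (size > slabclass[res].size)  /* walk up until the chunk fits */
        if (res++ == power_largest)
            return power_largest;       /* largest class takes the rest */
    return res;
}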

Note that slab_list is an array of pointers to the slab pages owned by this class: a single slab page cannot satisfy all requests of a given size, so the class appends pages as it grows.
The slabclass array is initialized at startup, which mainly fills in size and perslab:

void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes) {
    // ...
    int i = POWER_SMALLEST - 1;

    mem_limit = limit;

    while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) {
        if (slab_sizes != NULL) {
            if (slab_sizes[i-1] == 0)
                break;
            size = slab_sizes[i-1];
        } else if (size >= settings.slab_chunk_size_max / factor) {
            break;
        }
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.slab_page_size / slabclass[i].size;
        if (slab_sizes == NULL)
            size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    power_largest = i;
    slabclass[power_largest].size = settings.slab_chunk_size_max;
    slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max;
    // ...
}
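
To see the ladder this loop produces, here is a small standalone sketch, not memcached code; it assumes a 96-byte first chunk (the exact starting size depends on sizeof(item) and the -n option), the default 1.25 factor, 8-byte alignment, and 1MB pages:

#include <stdio.h>

/* standalone sketch with assumed defaults, tune to your build */
#define CHUNK_ALIGN_BYTES 8
#define PAGE_SIZE (1024 * 1024)          /* default slab page size */

int main(void) {
    double factor = 1.25;                /* default growth factor */
    unsigned int size = 96;              /* assumed first chunk size */
    int i = 1;
    while (size <= 512 * 1024) {         /* up to the 512KB max chunk */
        if (size % CHUNK_ALIGN_BYTES)    /* keep chunks 8-byte aligned */
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
        printf("class %2d: chunk %7u perslab %5u\n",
               i++, size, PAGE_SIZE / size);
        size *= factor;
    }
    return 0;
}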

When memory is actually needed for an item, memcached first mallocs a page (1MB by default), slices it into chunks of the class's size, and puts the chunks on the class's slots freelist; later allocations are served straight from that list.

static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int x;
    for (x = 0; x < p->perslab; x++) {
        do_slabs_free(ptr, 0, id);
        ptr += p->size;
    }
}
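
The slots freelist reuses the item's prev/next pointers. A simplified sketch of the push that do_slabs_free performs for each chunk, and the matching pop on allocation (statistics and chunked-item handling omitted; the function names here are illustrative):

static void freelist_push(slabclass_t *p, item *it) {
    it->it_flags = ITEM_SLABBED;   /* chunk is free, owned by the slab */
    it->slabs_clsid = 0;
    it->prev = NULL;
    it->next = p->slots;           /* push onto the head of the list */
    if (it->next) it->next->prev = it;
    p->slots = it;
    p->sl_curr++;
}

static item *freelist_pop(slabclass_t *p) {
    item *it = p->slots;
    if (it == NULL) return NULL;   /* class has no free chunks */
    p->slots = it->next;
    if (it->next) it->next->prev = NULL;
    it->it_flags &= ~ITEM_SLABBED; /* chunk is live again */
    p->sl_curr--;
    return it;
}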

[figure: slabclass memory layout]

Benchmark comparison

The benchmark uses a script written by the Redis author, which keeps the Redis and memcached benchmarks as similar as possible:

./redis-benchmark -t set,get -n 1000000 -r 10000000000000000 -d 16
./mc-benchmark -n 1000000 -r 10000000000000000 -d 16

Benchmarking with a client on the same machine, Redis reads and writes slightly faster.
For set, Redis took 27.90s with a p995 of about 2ms and a max of 13ms; memcached took 32.35s with a p995 of 5ms and a max of 41ms.
For get, Redis took 29.74s with a p995 of about 2ms and a max of 13ms; memcached took 32.25s with a p995 of 4ms and a max of 15ms.

Summary

  1. The build depends on shared system libraries; unlike Redis, which vendors its dependencies under deps so a plain make suffices, memcached is more cumbersome to compile.
  2. Memory is managed by the application itself, slab-style, one page at a time; this is work the dynamic allocator would normally do, and doing it in the application leaves some memory fragmentation.
  3. The hash algorithm is murmur_hash.
  4. memcached supports only plain key-value, never persists to disk (unlike Redis with its RDB and AOF), and has no notion of a cluster.
  5. Because it is multi-threaded, mutexes appear all over the code.
  6. Socket connections are managed with a state machine, which keeps the logic clear:
    enum conn_states {
        conn_listening,  /**< the socket which listens for connections */
        conn_new_cmd,    /**< Prepare connection for next command */
        conn_waiting,    /**< waiting for a readable socket */
        conn_read,       /**< reading in a command line */
        conn_parse_cmd,  /**< try to parse a command from the input buffer */
        conn_write,      /**< writing out a simple response */
        conn_nread,      /**< reading in a fixed number of bytes */
        conn_swallow,    /**< swallowing unnecessary bytes w/o storing */
        conn_closing,    /**< closing this connection */
        conn_mwrite,     /**< writing out many items sequentially */
        conn_closed,     /**< connection is closed */
        conn_watch,      /**< held by the logger thread as a watcher */
        conn_max_state   /**< Max state value (used for assertion) */
    };
  7. The LRU mechanism: see items.c:lru_pull_tail. The lru crawler involves a lot of code and is not covered here; it appears the segmented hot/cold LRU only kicks in when lru_maintainer_thread is enabled.
  8. Every insert places the new item at the head of its hash bucket (a minimal sketch follows the signature below):
    int assoc_insert(item *it, const uint32_t hv)
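
    A minimal sketch of the non-expanding path (hashpower, hashmask, and primary_hashtable are the real names in assoc.c; the expansion handling is omitted):

    int assoc_insert(item *it, const uint32_t hv) {
        unsigned int bucket = hv & hashmask(hashpower); /* low bits of the hash pick the bucket */
        it->h_next = primary_hashtable[bucket];         /* old head becomes the second entry */
        primary_hashtable[bucket] = it;                 /* the new item is the new head */
        hash_items++;
        return 1;
    }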

Reference

  1. memcached
  2. memcached benchmark
  3. mc-benchmark
  4. Redis vs Memcached
  5. Replacing the cache replacement algorithm in memcached
  6. memcached commands
  7. [Progressive analysis of the memcached source: LRU and assoc expansion](http://www.web-lovers.com/memcached-source-lru-and-assoc-expand.html)
  8. memcached source code analysis
  9. Memcached source code analysis
  10. memcache source analysis: the item