0% found this document useful (0 votes)
6 views32 pages

Redis源代码分析

The document provides an overview of the architecture and components of Redis, a key-value store. It details various data structures such as lists, strings, and dictionaries, along with their associated functions and memory management techniques. Additionally, it covers event handling mechanisms within Redis, including file and time events.

Uploaded by

bocerin283
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
6 views32 pages

Redis源代码分析

The document provides an overview of the architecture and components of Redis, a key-value store. It details various data structures such as lists, strings, and dictionaries, along with their associated functions and memory management techniques. Additionally, it covers event handling mechanisms within Redis, including file and time events.

Uploaded by

bocerin283
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 32

Redis · Arch_Platform

Redis

(huwu)

(yuhanzou)

Jun 17, 2011

1 / 32
Redis · Arch_Platform

Jun 17, 2011

2 / 32
Redis · Arch_Platform

1. Redis ! 4

2. ! 4
2.1. (adlist.h/adlist.c)! 4

2.2. (sds.h/sds.c)! 5

2.3. (dict.h/dict.c)! 6

2.4. (zmalloc.h/zmalloc.c)! 9

3. ! 11
3.1. (ae.h/ae.c)! 11

3.2. (anet.h/anet.c)! 15

3.3. (networking.h/networking.c, redis.c/redis.h)! 15

3.4. ! 17

4. ! 19
4.1. ! 20

4.2. ! 23

5. ! 24
5.1. Snapshot! 24

5.2. AOF! 26

6. ! 27
6.1. ! 27

6.2. ! 30

6.3. ! 31

3 / 32
Redis · Arch_Platform

1. Redis
Redis(http://redis.io) Key-Value string
hash list set sorted set

2.

2.1. (adlist.h/adlist.c)
(list) Redis adlist.h adlist.c

listNode (next)
(prev) void*
/* One element of the doubly linked list: links to both neighbours plus an
 * untyped payload pointer. */
typedef struct listNode {
struct listNode *prev; /* link to the preceding node */
struct listNode *next; /* link to the following node */
void *value; /* payload; the list itself never interprets it */
} listNode;
list (head) (tail)
(dup) (free)
(match) (value) len
/* List header: both ends of the node chain, three optional per-list
 * callbacks applied to node values, and the current node count. */
typedef struct list {
listNode *head; /* first node */
listNode *tail; /* last node */
void *(*dup)(void *ptr); /* value copy callback (takes a value, returns a value) */
void (*free)(void *ptr); /* value release callback */
int (*match)(void *ptr, void *key); /* value-vs-key comparison callback */
unsigned int len; /* number of nodes */
} list;
listIter (next) direction
( AL_START_HEAD AL_START_TAIL
/* Iterator state for walking a list in either direction. */
typedef struct listIter {
listNode *next; /* node the next iteration step will yield */
int direction; /* AL_START_HEAD or AL_START_TAIL */
} listIter;

Redis list
(listCreate) Redis zmalloc()
(listRelease) (listDelNode) (list)
free Redis (value)
Redis zfree()
listGetIterator() listNext()
listReleaseIterator() listRewind() listRewindTail()
list
iter = listGetIterator(list, AL_START_HEAD); //
4 / 32
Redis · Arch_Platform

while((node = listNext(iter)) != NULL) {


DoItWithValue(listNodeValue(node)); // DoItWithValue
}
listReleaseIterator(iter);
listDup() dup
value listSearchKey() O(N)
match

2.2. (sds.h/sds.c)
Redis Redis key value
key Redis (Binary Safe)
256 (8bit)
[2] Redis value key

Redis sds.h sds.c


sds char*
typedef char *sds;
sdsnewlen() Redis sdshdr
len free sdshdr
sizeof(int)+sizeof(int)+sizeof(char*)+len+free
/* Header stored immediately in front of the character data of an sds
 * string. An sds value points at buf, so the header is recovered by
 * stepping back sizeof(struct sdshdr) bytes from it. */
struct sdshdr {
int len; /* string length, as returned by sdslen() */
int free; /* presumably the unused bytes still available in buf -- confirm in sds.c */
char buf[]; /* flexible array member holding the string bytes */
};
sdsnewlen(const void *init, size_t initlen) sizeof
(struct sdshdr)+initlen+1 *init buf buf
’\0’ sdsnewlen() sdshdr->buf
sdsnewlen(“qqmail”, 6) 64 Redis 24
6+1

6 0 qqmail

sdshdr->buf ”qqmail” buf


sdshdr sdslen()
/* Return the length stored in the header of the sds string s.
 * The struct sdshdr header lives directly before the bytes s points to,
 * so it is reached by moving back sizeof(struct sdshdr) bytes. */
size_t sdslen(const sds s) {
    const size_t header_bytes = sizeof(struct sdshdr);
    struct sdshdr *header = (void*)(s - header_bytes);

    return header->len;
}
(s-(sizeof(struct sdshdr)) sdshdr

sdshdr Redis buf


sds char*

5 / 32
Redis · Arch_Platform

sds
sds sdsnewlen(const void *init, size_t initlen);
sds sdsnew(const char *init);
sds sdsempty();
zmalloc() sdsempty() void
sdsfree(sds s) size_t sdslen(const sds s)
size_t sdsavail(sds s)
sds
sds sdsgrowzero(sds s, size_t len);
sds sdscatlen(sds s, void *t, size_t len);
sds sdscat(sds s, char *t);
sds sdscpylen(sds s, char *t, size_t len);
sds sdscpy(sds s, char *t);
sdsgrowzero() sds ’\0’ free
sdscat() t s sdscpy() t s s
sdsgrowzero()
sds

2.3. (dict.h/dict.c)
Redis 0/1

dict type
dictht ht[2] rehashidx
iterators
/* A dictionary: a type descriptor with the per-type callbacks, opaque
 * private data handed to those callbacks, and two hash tables so that
 * entries can be migrated from ht[0] to ht[1] incrementally. */
typedef struct dict {
dictType *type; /* callbacks: hashing, key/value dup, compare, destructors */
void *privdata; /* passed as first argument to the dictType callbacks */
dictht ht[2]; /* ht[0] is the live table, ht[1] the rehash target */
int rehashidx; /* rehashing not in progress if rehashidx == -1 */
int iterators; /* number of iterators currently running */
} dict;
dictht table
size 2 sizemask size-1
size used
/* A single hash table: an array of bucket chains plus its bookkeeping. */
typedef struct dictht {
dictEntry **table; /* bucket array; each slot is the head of an entry chain */
unsigned long size; /* number of buckets in table */
unsigned long sizemask; /* size-1; ANDed with a hash value to pick the bucket */
unsigned long used; /* number of entries currently stored */
} dictht;
dictType
Hash Key Value
Key Key Value
type
typedef struct dictType {
unsigned int (*hashFunction)(const void *key);
void *(*keyDup)(void *privdata, const void *key);

6 / 32
Redis · Arch_Platform

void *(*valDup)(void *privdata, const void *obj);


int (*keyCompare)(void *privdata, const void *key1, const void *key2);
void (*keyDestructor)(void *privdata, void *key);
void (*valDestructor)(void *privdata, void *obj);
} dictType;
extern dictType dictTypeHeapStringCopyKey;
extern dictType dictTypeHeapStrings;
extern dictType dictTypeHeapStringCopyKeyValue;
Redis Value
API

Redis dict_can_resize
dictEnableResize() dictDisableResize()
dictResize()
dictExpand()
dictRehashMilliseconds()
dictAdd()
_dictExpandIfNeeded() _dictExpandIfNeeded
size 0 0
used>=size can_resize==1 used/size 5
max(used, size) dictExpand()
if (dictIsRehashing(d)) return DICT_OK;
if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

if (d->ht[0].used >= d->ht[0].size &&


(dict_can_resize ||
d->ht[0].used/d->ht[0].size > dict_force_resize_ratio))
{
return dictExpand(d, ((d->ht[0].size > d->ht[0].used) ?
d->ht[0].size : d->ht[0].used)*2);
}
return DICT_OK;
dictExpand() size 2
ht[0] ht[0]
ht[1] ht[0] ht[1]
rehashidx=0 ht[0]->table[rehashidx]
dictht n; /* the new hashtable */
unsigned long realsize = _dictNextPower(size);

if (dictIsRehashing(d) || d->ht[0].used > size)


return DICT_ERR;

n.size = realsize;
n.sizemask = realsize-1;
n.table = zcalloc(realsize*sizeof(dictEntry*));
n.used = 0;

if (d->ht[0].table == NULL) {
d->ht[0] = n;

7 / 32
Redis · Arch_Platform

return DICT_OK;
}

d->ht[1] = n;
d->rehashidx = 0;
return DICT_OK;
dictIsRehashing() rehashidx
dictAdd()
_dictRehashStep() iterators==0
ht[0] ht[1]
dictRehash() 0 ht[0]
ht[1] rehashidx -1 1
ht[0]->table[rehashidx] ht[1] rehashidx++
used==0
ht[1] dictRehash()

/* Perform up to n steps of incremental rehashing. Each step moves one
 * complete bucket chain from ht[0] into ht[1].
 *
 * Returns 0 if no rehash is in progress, or if ht[0] is found empty at the
 * start of a step (its bucket array is then freed, ht[1] becomes ht[0] and
 * rehashidx is reset to -1). Returns 1 once n buckets have been moved. */
int dictRehash(dict *d, int n) {
    if (!dictIsRehashing(d)) return 0;

    while (n--) {
        dictht *from = &d->ht[0];
        dictht *to = &d->ht[1];
        dictEntry *cur, *upcoming;

        /* Old table drained: promote the new table and finish. */
        if (from->used == 0) {
            zfree(from->table);
            *from = *to;
            _dictReset(to);
            d->rehashidx = -1;
            return 0;
        }

        /* Skip empty buckets. There is no bound check here: the scan relies
         * on used != 0 implying a non-empty bucket at or after rehashidx. */
        while (from->table[d->rehashidx] == NULL) d->rehashidx++;

        /* Relink every entry of this chain at the head of its new bucket. */
        for (cur = from->table[d->rehashidx]; cur != NULL; cur = upcoming) {
            unsigned int slot;

            upcoming = cur->next;
            slot = dictHashKey(d, cur->key) & to->sizemask;
            cur->next = to->table[slot];
            to->table[slot] = cur;
            from->used--;
            to->used++;
        }
        from->table[d->rehashidx] = NULL;
        d->rehashidx++;
    }
    return 1;
}

8 / 32
Redis · Arch_Platform

dictGetIterator()
dictNext(dictIterator *)
0
ht[0] table[0]
table[1], table[2], ..., table[size-1]
ht[1]

2.4. (zmalloc.h/zmalloc.c)
Redis , zmalloc(),
zrealloc() zcalloc() zfree(), C malloc(), realloc()
calloc() free() zmalloc.h zmalloc.c
Redis
Redis (VM)
swap

malloc()
NULL
calloc() malloc()
NULL calloc() count size
calloc() 0
realloc()

NULL
free()

Redis TCMALLOC tc_malloc() TCMALLOC


malloc() TCmalloc google perftools

#if defined(USE_TCMALLOC)
#define malloc(size) tc_malloc(size)
zmalloc()zfree() Redis
size (PREFIX_SIZE)
Redis Redis
update_zmalloc_stat_alloc(size+PREFIX_SIZE, size)
zfree()
update_zmalloc_stat_free()
malloc_size()
Mac OS X 10.4 [3] malloc_size()
PREFIX_SIZE

9 / 32
Redis · Arch_Platform

/* Allocate size bytes and record the allocation in the memory statistics.
 *
 * With HAVE_MALLOC_SIZE the raw pointer is returned and the accounted size
 * comes from redis_malloc_size(). Otherwise the requested size is written
 * into the first PREFIX_SIZE bytes of the allocation and the caller gets the
 * address just past that prefix. zmalloc_oom() is called when malloc fails. */
void *zmalloc(size_t size) {
    void *raw = malloc(size+PREFIX_SIZE);
    void *user;

    if (raw == NULL) zmalloc_oom(size);
#ifdef HAVE_MALLOC_SIZE
    user = raw;
    update_zmalloc_stat_alloc(redis_malloc_size(raw),size);
#else
    *((size_t*)raw) = size; /* remember the requested size in the prefix */
    user = (char*)raw+PREFIX_SIZE;
    update_zmalloc_stat_alloc(size+PREFIX_SIZE,size);
#endif
    return user;
}
update_zmalloc_stat_alloc()
zmalloc_thread_safe
Redis thread_safe (VM)
dump

used_memory Redis malloc()


zmalloc_allocations[] size 256
256 zmalloc_allocations_for_size(size) size
zmalloc_used_memory() Redis
Redis
/* Record one allocation in the global statistics.
 *
 * __n    - number of bytes to account for; rounded up to a multiple of
 *          sizeof(long) before being added to used_memory.
 * __size - requested size, used to pick the histogram slot in
 *          zmalloc_allocations[]; every size >= ZMALLOC_MAX_ALLOC_STAT shares
 *          the slot at index ZMALLOC_MAX_ALLOC_STAT.
 *
 * When zmalloc_thread_safe is non-zero both counters are updated while
 * holding used_memory_mutex. Every line of the macro body must end with a
 * backslash, including the _stat_slot line, which was previously split in
 * two without one. Note that __size is evaluated more than once. */
#define update_zmalloc_stat_alloc(__n,__size) do { \
    size_t _n = (__n); \
    size_t _stat_slot = ((__size) < ZMALLOC_MAX_ALLOC_STAT) ? (__size) : ZMALLOC_MAX_ALLOC_STAT; \
    if (_n&(sizeof(long)-1)) _n += sizeof(long)-(_n&(sizeof(long)-1)); \
    if (zmalloc_thread_safe) { \
        pthread_mutex_lock(&used_memory_mutex); \
        used_memory += _n; \
        zmalloc_allocations[_stat_slot]++; \
        pthread_mutex_unlock(&used_memory_mutex); \
    } else { \
        used_memory += _n; \
        zmalloc_allocations[_stat_slot]++; \
    } \
} while(0)
zmalloc_enable_thread_safeness() Redis

zcalloc(size) zrealloc() zmalloc()


Redis zmalloc_get_rss()
Redis used_memory

Redis zstrdup(char
*) memcpy()

10 / 32
Redis · Arch_Platform

3.

3.1. (ae.h/ae.c)
Redis ( )

Redis ae_epoll.h/
ae_epoll.c epoll ae_select.h/ae_select.c select
ae_kqueue.h/ae_kqueue.c bsd kqueue

Redis FileEvent TimeEvent


(epoll/kqueue) Redis
aeEventLoop
/* State of one event loop: fixed-size tables for file events, a linked
 * list of time events, and the hooks used by the polling backend. */
typedef struct aeEventLoop {
int maxfd; /* presumably the highest fd currently registered -- confirm in ae.c */
long long timeEventNextId; /* id source for time events; see aeTimeEvent.id */
aeFileEvent events[AE_SETSIZE]; /* Registered events */
aeFiredEvent fired[AE_SETSIZE]; /* Fired events */
aeTimeEvent *timeEventHead; /* head of the time event list (linked via aeTimeEvent.next) */
int stop; /* stop flag for the loop; presumably set by aeStop() */
void *apidata; /* This is used for polling API specific data */
aeBeforeSleepProc *beforesleep; /* hook installed with aeSetBeforeSleepProc() */
} aeEventLoop;
maxfd events fired
FileEvent AE_SETSIZE Redis 1024*10
timeEventHead TimeEvent timeEventNextId
beforesleep hook stop

/* Registration record for one file descriptor: which conditions are being
 * watched and which handler serves each of them. */
typedef struct aeFileEvent {


int mask; /* one of AE_(READABLE|WRITABLE) */
aeFileProc *rfileProc; /* handler for the readable condition */
aeFileProc *wfileProc; /* handler for the writable condition */
void *clientData; /* opaque pointer supplied at registration time */
} aeFileEvent;
aeFileEvent socket mask
rfileProc wfileProc clientData
/* One timer. Timers are kept in a singly linked list rooted at
 * aeEventLoop.timeEventHead. */
typedef struct aeTimeEvent {
long long id; /* time event identifier. */
long when_sec; /* seconds */
long when_ms; /* milliseconds */
aeTimeProc *timeProc; /* called as timeProc(eventLoop, id, clientData) when the timer fires */
aeEventFinalizerProc *finalizerProc; /* presumably run when the timer is deleted -- confirm in ae.c */
void *clientData; /* opaque pointer handed to timeProc */
struct aeTimeEvent *next; /* next timer in the list */
} aeTimeEvent;
aeTimeEvent (timer)
timer id when_sec when_ms timeProc
finalizerProc “ ”
11 / 32
Redis · Arch_Platform

aeFiredEvent FileEvent fd mask

Redis API aeCreateEventLoop()


aeDeleteEventLoop(aeEventLoop*)
eventLoop Linux ae_epoll.c
aeApiCreate() Redis
AE_NONE AE_READABLE AE_WRITABLE FileEvent
eventLoop events AE_NONE aeStop(aeEventLoop*)

aeCreateFileEvent() aeDeleteFileEvent() FileEvent


Linux epoll EPOLLIN EPOLLOUT
Read Write eventLoop->apiData
aeCreateTimeEvent() aeDeleteTimeEvent() TimeEvent
aeMain(aeEventLoop*)
eventLoop->beforesleep() aeProcessEvents()
aeProcessEvents() timer
timer FileEvent Redis
(Linux:ae_epoll.c) aeApiPoll() FileEvent
mask (rfileProc/wfileProc) FileEvent
TimeEvent TimerEvent FileEvent
FileEvent processTimeEvents() TimeEvent
timer timeProc timeProc timer
timer timeProc 100 timer 100ms
100ms timer timeProc 100 timer 100ms

...
retval = te->timeProc(eventLoop, id, te->clientData);
processed++;
if (retval != AE_NOMORE) {
aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms);
} else {
aeDeleteTimeEvent(eventLoop, id);
}
...
Redis API aeWait
(int fd, int mask, long long milliseconds) fd
mask milliseconds select

Redis aeSetBeforeSleepProc()

Redis int main() initServer()


initServer() TimeEvent
12 / 32
Redis · Arch_Platform

serverCron 100ms Redis


aeMain()
/* Abridged excerpt: only the step relevant to the event loop is shown.
 * Creates the event loop and stores it in server.el. */
int initServer() {
// ...
server.el = aeCreateEventLoop();
// ...
}
/* Abridged excerpt of the program entry point, showing the event loop
 * life cycle: initServer() creates server.el, beforeSleep is installed as
 * the before-sleep hook, aeMain() runs the loop, and the loop is deleted
 * once aeMain() returns. */
int main(int argc, char **argv) {
// ...
initServer();
// ....
aeSetBeforeSleepProc(server.el,beforeSleep);
aeMain(server.el);
aeDeleteEventLoop(server.el);
}
TimeEvent FileEvent BeforeSleepProc
Redis beforeSleep() ready
aof
serverCron() Redis Redis
server.unixtime Redis
time(NULL)
server.lruclock
/* We take a cached value of the unix time in the global state because
* with virtual memory and aging there is to store the current time
* in objects at every object access, and accuracy is not needed.
* To access a global var is faster than calling time(NULL) */
server.unixtime = time(NULL);
/* We have just 22 bits per object for LRU information.
* So we use an (eventually wrapping) LRU clock with 10 seconds resolution.
* 2^22 bits with 10 seconds resoluton is more or less 1.5 years.
*
* Note that even if this will wrap after 1.5 years it's not a problem,
* everything will still work but just some object will appear younger
* to Redis. But for this to happen a given object should never be touched
* for 1.5 years.
*
* Note that you can change the resolution altering the
* REDIS_LRU_CLOCK_RESOLUTION define.
*/
updateLRUClock();
SIGTERM

/* We don't want to resize the hash tables while a bacground saving


* is in progress: the saving child is created using fork() that is
* implemented with a copy-on-write semantic in most modern systems, so
* if we resize the HT while there is the saving child at work actually
* a lot of memory movements in the parent will cause a lot of pages
* copied. */
if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
if (!(loops % 10)) tryResizeHashTables();
if (server.activerehashing) incrementallyRehash();
}

13 / 32
Redis · Arch_Platform

/* Check if a background saving or AOF rewrite in progress terminated */


if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
int statloc;
pid_t pid;

if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
if (pid == server.bgsavechildpid) {
backgroundSaveDoneHandler(statloc);
} else {
backgroundRewriteDoneHandler(statloc);
}
updateDictResizePolicy();
}
} else {
/* If there is not a background saving in progress check if
* we have to save now */
time_t now = time(NULL);
for (j = 0; j < server.saveparamslen; j++) {
struct saveparam *sp = server.saveparams+j;

if (server.dirty >= sp->changes &&


now-server.lastsave > sp->seconds) {
redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
sp->changes, sp->seconds);
rdbSaveBackground(server.dbfilename);
break;
}
}
}
master key slave DEL
/* Expire a few keys per cycle, only if this is a master.
* On slaves we wait for DEL operations synthesized by the master
* in order to guarantee a strict consistency. */
if (server.masterhost == NULL) activeExpireCycle();
Entity
Redis server.vm_max_threads

/* Swap a few keys on disk if we are over the memory limit and VM
* is enbled. Try to free objects from the free list first. */
if (vmCanSwapOut()) {
while (server.vm_enabled && zmalloc_used_memory() >
server.vm_max_memory)
{
int retval = (server.vm_max_threads == 0) ?
vmSwapOneObjectBlocking() :
vmSwapOneObjectThreaded();
if (retval == REDIS_ERR && !(loops % 300) &&
zmalloc_used_memory() >
(server.vm_max_memory+server.vm_max_memory/10))
{
redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by
more than 10%% but unable to swap more objects out!");
}
14 / 32
Redis · Arch_Platform

/* Note that when using threade I/O we free just one object,
* because anyway when the I/O thread in charge to swap this
* object out will finish, the handler of completed jobs
* will try to swap more objects if we are still out of memory. */
if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
}
}
slave 10 serverCron 1 master

/* Replication cron function -- used to reconnect to master and


* to detect transfer failures. */
if (!(loops % 10)) replicationCron();
// Todo slave

3.2. (anet.h/anet.c)
Redis anet.h/c :
int anetTcpConnect(char *err, char *addr, int port);
int anetTcpNonBlockConnect(char *err, char *addr, int port);
int anetUnixConnect(char *err, char *path);
int anetUnixNonBlockConnect(char *err, char *path);
int anetRead(int fd, char *buf, int count);
int anetResolve(char *err, char *host, char *ipbuf);
int anetTcpServer(char *err, int port, char *bindaddr);
int anetUnixServer(char *err, char *path);
int anetTcpAccept(char *err, int serversock, char *ip, int *port);
int anetUnixAccept(char *err, int serversock);
int anetWrite(int fd, char *buf, int count);
int anetNonBlock(char *err, int fd);
int anetTcpNoDelay(char *err, int fd);
int anetTcpKeepAlive(char *err, int fd);
anetTcpServer() socket() bind()
listen() socket fd anetUnixServer()

anetTcpConnect() anetTcpNonBlockConnect()
anetUnixConnect() anetUnixNonBlockConnect()

anetNonBlock() fd anetTcpNoDelay() TCP


Nagle anetTcpKeepAlive()
fd
anetRead() anetWrite() anetTcpAccept()
anetUnixAccept()
Redis API

3.3. (networking.h/networking.c, redis.c/redis.h)


Redis
Redis

15 / 32
Redis · Arch_Platform

main() initServerConfig()
command table(server.commands)
Redis dict Hash command
table commandTableDictType command dict
populateCommand() Redis redisCommand
Redis command table
main() initServer() anetTcpServer
()
/* Abridged excerpt (return type omitted in the excerpt): opens the TCP
 * listening socket and, when a descriptor was obtained (server.ipfd > 0),
 * registers acceptTcpHandler for AE_READABLE on it. A failed registration
 * is reported through oom(). */
initServer() {
// ....
server.ipfd = anetTcpServer(server.neterr,server.port,server.bindaddr);
// ...
if (server.ipfd > 0 && aeCreateFileEvent(server.el,server.ipfd,AE_READABLE,
acceptTcpHandler,NULL) == AE_ERR) oom("creating file event");
// ...
}
Redis aeCreateFileEvent()
READABLE acceptTcpHandler()
READABLE anetTcpAccept() fd fd
createClient() redisClient
fd buf buf Redis
epoll
Redis
Redis
Redis
Redis redisClient(redis.h)
createClient() freeClient()
createClient() fd READABLE
readQueryFromClient()
server.clients
/* With multiplexing we need to take per-clinet state.
* Clients are taken in a liked list. */
typedef struct redisClient {
int fd;
redisDb *db;
int dictid;
sds querybuf;
int argc;
robj **argv;
int reqtype;
int multibulklen; /* number of multi bulk arguments left to read */
long bulklen; /* length of bulk argument in multi bulk request */
list *reply;
int sentlen;
time_t lastinteraction; /* time of the last interaction, used for timeout
*/
int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ...
0 */

16 / 32
Redis · Arch_Platform

int slaveseldb; /* slave selected db, if this client is a slave */


int authenticated; /* when requirepass is non-NULL */
int replstate; /* replication state if this is a slave */
int repldbfd; /* replication DB file descriptor */
long repldboff; /* replication DB file offset */
off_t repldbsize; /* replication DB file size */
multiState mstate; /* MULTI/EXEC state */
blockingState bpop; /* blocking state */
list *io_keys; /* Keys this client is waiting to be loaded from the
* swap file in order to continue. */
list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */

/* Response buffer */
int bufpos;
char buf[REDIS_REPLY_CHUNK_BYTES];
} redisClient;
freeClient()
(server.maxclients)
Redis
REDIS_BLOCKED REDIS_IO_WAIT
REDIS_CLOSE_AFTER_REPLY

3.4.
Redis redisCommand
/* Signature of a command implementation: it receives the client issuing
 * the command. */
typedef void redisCommandProc(redisClient *c);
/* Signature of an optional per-command VM preload hook; it receives the
 * client, the command descriptor and the argument vector. */
typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int
argc, robj **argv);
/* Descriptor of one command in the command table. */
struct redisCommand {
char *name; /* command name, e.g. "get" */
redisCommandProc *proc; /* implementation, e.g. getCommand */
int arity; /* presumably the expected argument count -- confirm in redis.c */
int flags; /* command flags; meaning not shown in this excerpt */

redisVmPreloadProc *vm_preload_proc; /* custom preload hook; may be NULL */

int vm_firstkey; /* presumably the argv index of the first key to preload -- confirm */
int vm_lastkey; /* presumably the argv index of the last key to preload -- confirm */
int vm_keystep; /* presumably the step between key arguments -- confirm */
};
Redis readQueryFromClient() Input
REDIS_IOBUF_LEN 1024 redisClient
querybuf sdscatlen(c->querybuf,buf,nread)
processInputBuffer()
ready flags REDIS_BLOCKED REDIS_IO_WAIT
REDIS_CLOSE_AFTER_REPLY reqtype
querybuf[0]==’*’
MULTIBULK INLINE
(resetClient())

17 / 32
Redis · Arch_Platform

/* Consume the client's query buffer, parsing and executing one request per
 * iteration, for as long as the buffer is non-empty.
 *
 * Processing stops immediately when the client is flagged REDIS_BLOCKED,
 * REDIS_IO_WAIT or REDIS_CLOSE_AFTER_REPLY, and the loop ends when the
 * protocol parser does not return REDIS_OK. The request type is chosen
 * from the first byte of the buffer: '*' selects the multi bulk protocol,
 * anything else the inline protocol. */
void processInputBuffer(redisClient *c) {
    const int halt_flags =
        REDIS_BLOCKED | REDIS_IO_WAIT | REDIS_CLOSE_AFTER_REPLY;

    while (sdslen(c->querybuf)) {
        int parsed = REDIS_OK;

        if (c->flags & halt_flags) return;

        /* Decide the protocol once per request. */
        if (!c->reqtype)
            c->reqtype = (c->querybuf[0] == '*') ? REDIS_REQ_MULTIBULK
                                                 : REDIS_REQ_INLINE;

        if (c->reqtype == REDIS_REQ_INLINE)
            parsed = processInlineBuffer(c);
        else if (c->reqtype == REDIS_REQ_MULTIBULK)
            parsed = processMultibulkBuffer(c);
        else
            redisPanic("Unknown request type");

        if (parsed != REDIS_OK) break;

        /* An empty request is simply discarded; otherwise the client is
         * reset only when the command handler reports REDIS_OK. */
        if (c->argc == 0 || processCommand(c) == REDIS_OK)
            resetClient(c);
    }
}
INLINE (‘\r\n’)
processInlineBuffer() argc/argv argv
redisObject
MULTIBULK //TODO
processCommand() ”quit” Redis
lookupCommand() command table
(cmd->proc())
‘get’ readonlyCommandTable initServerConfig
() command table get
lookupCommand() redisCommand
getCommand()
struct redisCommand readonlyCommandTable[] = {
{"get",getCommand,2,0,NULL,1,1,1},
// ....
}
redis t_***.c t_string.c
getCommand()

18 / 32
Redis · Arch_Platform

4.
Redis Redis Redis
Redis 2.0
value
vm vm
Redis
Redis redisObject
/* The actual Redis Object */
/* NOTE(review): the maximum below is a 21-bit value while the lru field
 * further down is 22 bits wide -- confirm which of the two is intended. */
#define REDIS_LRU_CLOCK_MAX ((1<<21)-1) /* Max value of obj->lru */
#define REDIS_LRU_CLOCK_RESOLUTION 10 /* LRU clock resolution in seconds */
/* Header shared by every value: four bitfields totalling 32 bits, followed
 * by a reference count and a pointer to the payload. */
typedef struct redisObject {
unsigned type:4; /* value type */
unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
unsigned encoding:4; /* internal representation of the value */
unsigned lru:22; /* lru time (relative to server.lruclock) */
int refcount; /* reference count; see incrRefCount() */
void *ptr; /* payload pointer */
/* VM fields are only allocated if VM is active, otherwise the
 * object allocation function will just allocate
 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
 * Redis without VM active will not have any overhead. */
} robj;
type value string list set storage
Redis
encoding lru server.lruclock refcount

#define REDIS_VM_MEMORY 0 /* The object is on memory */


#define REDIS_VM_SWAPPED 1 /* The object is on disk */
#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
Redis vmPointer
/* The VM pointer structure - identifies an object in the swap file.
*
* This object is stored in place of the value
* object in the main key->value hash table representing a database.
* Note that the first fields (type, storage) are the same as the redisObject
* structure so that vmPointer strucuters can be accessed even when casted
* as redisObject structures.
*
* This is useful as we don't know if a value object is or not on disk, but we
* are always able to read obj->storage to check this. For vmPointer
* structures "type" is set to REDIS_VMPOINTER (even if without this field
* is still possible to check the kind of object from the value of 'storage').*/
/* Placeholder stored in place of a value that lives in the swap file.
 * The leading bitfields (type, storage) mirror those of redisObject and
 * the three bitfields together occupy 32 bits. */
typedef struct vmPointer {
unsigned type:4; /* same position and width as redisObject.type */
unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
unsigned notused:26; /* padding up to 32 bits */
unsigned int vtype; /* type of the object stored in the swap file */
off_t page; /* the page at which the object is stored on disk */
off_t usedpages; /* number of pages used on disk */
} vmpointer;

19 / 32
Redis · Arch_Platform

vmpointer 32 redisObject
vmPointer
off_t vm_page_size;
off_t vm_pages;
off_t vm_next_page; /* Next probably empty page */
off_t vm_near_pages; /* Number of pages allocated sequentially */
unsigned char *vm_bitmap; /* Bitmap of free/used pages */
Redis (Page) Redis
server.vm_bitmap 0 1 Redis

/* An I/O thread process an element taken from the io_jobs queue and
* put the result of the operation in the io_done list. While the
* job is being processed, it's put on io_processing queue. */
list *io_newjobs; /* List of VM I/O jobs yet to be processed */
list *io_processing; /* List of VM I/O jobs being processed */
list *io_processed; /* List of VM I/O jobs already processed */
Redis VM (job) io_newjobs
job processing processed job Job 3
LOAD PREPARE_SWAP DO_SWAP

/* VM threaded I/O request message */


#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
/* One unit of work for the VM I/O threads. */
typedef struct iojob {
int type; /* Request type, REDIS_IOJOB_* */
redisDb *db;/* Redis database */
robj *key; /* This I/O request is about swapping this key */
robj *id; /* Unique identifier of this job:
this is the object to swap for REDIS_IOJOB_*_SWAP, or the
vmpointer object for REDIS_IOJOB_LOAD. */
robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
* field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
off_t page; /* Swap page where to read/write the object */
off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
int canceled; /* True if this command was canceled by blocking side of VM */
pthread_t thread; /* ID of the thread processing this entry */
} iojob;
job Job queueIOJob()
server.io_newjobs Redis VM VM
job IOThreadEntryPoint job job
server.io_newjobs server.io_processing Redis
VM job Job

4.1.
getCommand() Redis Redis lookupKey()
dictEntry
dictEntry
dictEntry (REDIS_VM_MEMORY)
20 / 32
Redis · Arch_Platform

(REDIS_VM_SWAPPING) vmLoadObject()
dictEntry vmLoadObject()
processCommand()
if (server.vm_enabled && server.vm_max_threads > 0 &&
blockClientOnSwappedKeys(c,cmd)) return REDIS_ERR;
call(c,cmd);
blockClientOnSwappedKeys
()
REDIS_IO_WAIT blockClientOnSwappedKeys()
(vm_preload_proc)
waitForMultipleSwappedKeys() key
/* Start loading every swapped-out key the command is going to touch and,
 * if at least one key has to be waited for, suspend the client.
 *
 * The command's own vm_preload_proc is used when it has one; otherwise
 * waitForMultipleSwappedKeys() is called with the same arguments.
 *
 * Returns 1 when the client was blocked: REDIS_IO_WAIT is set, its
 * readable event is removed from the event loop and
 * server.vm_blocked_clients is incremented. Returns 0 when c->io_keys is
 * empty and the command can run right away. */
int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
    if (cmd->vm_preload_proc == NULL)
        waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
    else
        cmd->vm_preload_proc(c,cmd,c->argc,c->argv);

    /* No pending keys: nothing to wait for. */
    if (!listLength(c->io_keys)) return 0;

    /* If the client was blocked for at least one key, mark it as blocked. */
    c->flags |= REDIS_IO_WAIT;
    aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
    server.vm_blocked_clients++;
    return 1;
}
waitForMultipleSwappedKeys
waitForSwappedKey() cmd
dictEntry REDIS_VM_MEMORY
REDIS_VM_SWAPPING vmCancelThreadedIOJob(o)

c->io_keys swapping key


cmd key c->io_keys {c, key}
c->db->io_keys db server->db key

key job
job job vmpointer dictEntry
int waitForSwappedKey(redisClient *c, robj *key) {
struct dictEntry *de;
robj *o;
list *l;

/* If the key does not exist or is already in RAM we don't need to


* block the client at all. */
de = dictFind(c->db->dict,key->ptr);
if (de == NULL) return 0;
o = dictGetEntryVal(de);
if (o->storage == REDIS_VM_MEMORY) {

21 / 32
Redis · Arch_Platform

return 0;
} else if (o->storage == REDIS_VM_SWAPPING) {
/* We were swapping the key, undo it! */
vmCancelThreadedIOJob(o);
return 0;
}

/* OK: the key is either swapped, or being loaded just now. */

/* Add the key to the list of keys this client is waiting for.
* This maps clients to keys they are waiting for. */
listAddNodeTail(c->io_keys,key);
incrRefCount(key);

/* Add the client to the swapped keys => clients waiting map. */
de = dictFind(c->db->io_keys,key);
if (de == NULL) {
int retval;

/* For every key we take a list of clients blocked for it */


l = listCreate();
retval = dictAdd(c->db->io_keys,key,l);
incrRefCount(key);
redisAssert(retval == DICT_OK);
} else {
l = dictGetEntryVal(de);
}
listAddNodeTail(l,c);

/* Are we already loading the key from disk? If not create a job */
if (o->storage == REDIS_VM_SWAPPED) {
iojob *j;
vmpointer *vp = (vmpointer*)o;

o->storage = REDIS_VM_LOADING;
j = zmalloc(sizeof(*j));
j->type = REDIS_IOJOB_LOAD;
j->db = c->db;
j->id = (robj*)vp;
j->key = key;
incrRefCount(key);
j->page = vp->page;
j->val = NULL;
j->canceled = 0;
j->thread = (pthread_t) -1;
lockThreadedIO();
queueIOJob(j);
unlockThreadedIO();
}
return 1;
}
vm job queueIOJob
job server->io_newjobs IO
IO Job
void queueIOJob(iojob *j) {
redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
(void*)j, j->type, (char*)j->key->ptr);
listAddNodeTail(server.io_newjobs,j);

22 / 32
Redis · Arch_Platform

if (server.io_active_threads < server.vm_max_threads)


spawnIOThread();
}
IO IOThreadEntryPoint() server-
>io_newjobs job server->io_newjobs server-
>processing job REDIS_IOJOB_LOAD job
vmReadObjectFromSwap() job server-
>processed server->io_ready_pipe_write
server->processed job job io_ready_pipe

server->io_ready_pipe_read server->io_ready_pipe_write
vmInit() io_ready_pipe_read
vmThreadedIOCompletedJob job
vmThreadedIOCompletedJob job “ ”
REDIS_IOJOB_LOAD key vm
handleClientsBlockedOnSwappedKey key c-
>db->io_keys server->db->io_keys) {key, c} key
db->io_keys c dontWaitForSwappedKey()
db->io_keys c->io_keys key
key c->io_keys c key
server.io_ready_clients “ ”
server.io_ready_clients
READABLE Redis aeMain()
epoll beforeSleep() Redis
server.io_ready_clients ready READABLE

4.2.
Redis (Blocking Virtual
Memory) (Threaded Virtual Memory IO) vm.c

Redis AOF Snapshot


vm Redis serverCron() vm
/* Swap a few keys on disk if we are over the memory limit and VM
* is enbled. Try to free objects from the free list first. */
if (vmCanSwapOut()) {
while (server.vm_enabled && zmalloc_used_memory() >
server.vm_max_memory)
{
int retval = (server.vm_max_threads == 0) ?
vmSwapOneObjectBlocking() :
vmSwapOneObjectThreaded();
// ...
if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
}
}

23 / 32
Redis · Arch_Platform

vmSwapOneObjectBlocking() vmSwapOneObjectThreaded()
vmSwapOneObject() Redis
5 computeObjectSwappability()
1

REDIS_IOJOB_PREPARE_SWAP job Snapshot

REDIS_IOJOB_DO_SWAP job
Redis vm

REDIS_VM_MAX_RANDOM_JUMP/4
REDIS_VM_MAX_RANDOM_JUMP

5.
Redis ——snapshot aof dump
snapshot Redis dump aof
Redis dump

5.1. Snapshot
Redis Snapshot Redis N
M Snapshot save
bgsave
save Redis bgsave Redis
dump
saveCommand() bgsave
rdbSave()
bgsaveCommand() bgsave
rdbSaveBackground()
rdbSave()
rdbSaveBackground() waitEmptyIOJobsQueue()
vm IO vm IO
fork() server.bgsavechildpid pid
updateDictResizePolicy() Hash
vm vm swap rdbSave() vm
IO vm swap vm IO
int rdbSaveBackground(char *filename) {
pid_t childpid;

if (server.bgsavechildpid != -1) return REDIS_ERR;


if (server.vm_enabled) waitEmptyIOJobsQueue();
server.dirty_before_bgsave = server.dirty;
24 / 32
Redis · Arch_Platform

if ((childpid = fork()) == 0) {
/* Child */
if (server.vm_enabled) vmReopenSwapFile();
if (server.ipfd > 0) close(server.ipfd);
if (server.sofd > 0) close(server.sofd);
if (rdbSave(filename) == REDIS_OK) {
_exit(0);
} else {
_exit(1);
}
} else {
/* Parent */
if (childpid == -1) {
redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
strerror(errno));
return REDIS_ERR;
}
redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
server.bgsavechildpid = childpid;
updateDictResizePolicy();
return REDIS_OK;
}
return REDIS_OK; /* unreached */
}
rdbSave() waitEmptyIOJobsQueue() vm IO
vm IO vm swap vm
IO
tmp db server.db,
server.db+1, ..., server.db+server.dbnum dict
dictIterator db tmp
Redis rdbSaveLen()
32bit Redis 6bit 6bit
14bit 14bit 32bit REDIS_RDB_6BITLEN(00)
REDIS_RDB_14BITLEN(01) REDIS_RDB_32BITLEN(10)
Protobuf

int rdbSaveLen(FILE *fp, uint32_t len) {


unsigned char buf[2];
int nwritten;

if (len < (1<<6)) {


/* Save a 6 bit len */
buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
if (rdbWriteRaw(fp,buf,1) == -1) return -1;
nwritten = 1;
} else if (len < (1<<14)) {
/* Save a 14 bit len */
buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
buf[1] = len&0xFF;
if (rdbWriteRaw(fp,buf,2) == -1) return -1;
nwritten = 2;
} else {
/* Save a 32 bit len */
buf[0] = (REDIS_RDB_32BITLEN<<6);

25 / 32
Redis · Arch_Platform

if (rdbWriteRaw(fp,buf,1) == -1) return -1;


len = htonl(len);
if (rdbWriteRaw(fp,&len,4) == -1) return -1;
nwritten = 1+4;
}
return nwritten;
}
Redis type/key/value
vm
serverCron bgsave bgrewrite
AOF wait3()
backgroundSaveDoneHandler() backgroundRewriteDoneHandler()
snapshot Redis vm LRU
Redis Redis snapshot

5.2. AOF
Snapshot Redis AOF( )
commit log commit log
Redis commit log Redis
Buffer commit log
fsync() fsync
() fsync() fsync()

Redis commit log commit log commit


log bgrewriteaof
Snapshot
AOF
• feedAppendOnlyFile() AOF call()
Redis AOF Redis Keys
Key
call() Redis AOF
(server.dirty feedAppendOnlyFile() AOF
server.aofbuf rewriteaof
server.bgrewritebuf bgrewritebuf
/* Append to the AOF buffer. This will be flushed on disk just before
* of re-entering the event loop, so before the client will get a
* positive reply about the operation performed. */
server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));

/* If a background append only file rewriting is in progress we want to


* accumulate the differences between the child DB and the current one
* in a buffer, so that when the child process will do its work we
* can append the differences to the new append only file. */
if (server.bgrewritechildpid != -1)
server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
26 / 32
Redis · Arch_Platform

• rewriteAppendOnlyFile() aof
rewriteAppendOnlyFile() rewriteAppendOnlyFileBackground()
Redis AOF
(1) fork() rewrite rewriteAppendOnlyFile()
(2) rewrite rewriteAppendOnlyFile()
AOF
(3)rewrite
server.bgrewritebuf
(4) serverCron() wait3()
backgroundRewriteDoneHandler() server.bgrewritebuf
commit log server.aofbuf
(5) rewrite aof

Redis AOF
commit log
rewrite aofbuf
• flushAppendOnlyFile() buffer commit log
Redis beforeSleep() AOF buffer log
• loadAppendOnlyFile() AOF Redis AOF

Redis AOF client()


AOF commit log

6.
Redis http://www.redis.io/topics/
replication Redis

• Master Slave
• Slave Slave Masters Slaves
• Master Slave
Slave
• Slaves
• Slave Master

6.1.
Redis Slave Master Redis
server.replstate Master Slave

27 / 32
Redis · Arch_Platform

REDIS_REPL_NONE server.masterhost server.masterport Master


Master NULL
Slave server.replstate REDIS_REPL_NONE
/* Slave replication state - slave side.
 * These are the values compared against server.replstate, i.e. the state
 * of this instance's own link to its master. */
#define REDIS_REPL_NONE 0 /* No active replication */
#define REDIS_REPL_CONNECT 1 /* Must connect to master */
#define REDIS_REPL_TRANSFER 2 /* Receiving .rdb from master */
#define REDIS_REPL_CONNECTED 3 /* Connected to master */
Master Slave redisClient.replstate
Slave Master Master Slave

/* Slave replication state - from the point of view of master.
 * These values are kept per client (c->replstate / slave->replstate), so
 * the numeric overlap with the slave-side REDIS_REPL_CONNECTED (also 3)
 * refers to a different field.
 *
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
REDIS_REPL_WAIT_BGSAVE_START Master dump
Slave Master dump Slave
REDIS_REPL_WAIT_BGSAVE_END dump Slave
REDIS_REPL_SEND_BULK Slave
REDIS_REPL_ONLINE Master REDIS_REPL_ONLINE Slave

Slave server.replstate!=REDIS_REPL_CONNECTED
Slave Master c.replstate!=REDIS_REPL_ONLINE
serverCron() 10 replicationCron()
replicationCron() Slave Master

6.1.1. Master
Redis Master Slave Slave
Sync Master Master Master Slave
syncCommand()
sync Slave
sync Slave Master sync
Slave Master sync
Master bgsave bgsave
Slave sync dump Slave replstate
REDIS_REPL_WAIT_BGSAVE_END Slave replstate
REDIS_REPL_WAIT_BGSAVE_END dump Slave dump

28 / 32
Redis · Arch_Platform

Slave REDIS_REPL_WAIT_BGSAVE_START bgsave


bgsave Slave dump
bgsave bgsave Slave
REDIS_REPL_WAIT_BGSAVE_END
Slave Master server.slaves
/* Handler for a SYNC request issued by client `c` (master side).
 *
 * The request is silently ignored when the client already carries the
 * REDIS_SLAVE flag, and refused with an error reply when this server has a
 * master configured but is not in REDIS_REPL_CONNECTED state, or when the
 * client still has queued replies.
 *
 * Otherwise c->replstate is chosen according to the background save state:
 *   - a BGSAVE is running and another slave is already in
 *     WAIT_BGSAVE_END: copy that slave's reply list, WAIT_BGSAVE_END;
 *   - a BGSAVE is running but no such slave exists: WAIT_BGSAVE_START;
 *   - no BGSAVE is running: start one, WAIT_BGSAVE_END (or reply with an
 *     error and return if it cannot be started).
 * Finally the client is flagged REDIS_SLAVE and appended to server.slaves. */
void syncCommand(redisClient *c) {
/* Ignore SYNC if the client is already flagged as a slave. (The upstream
 * comment also mentions monitor mode; only REDIS_SLAVE is tested here.) */
if (c->flags & REDIS_SLAVE) return;

/* Refuse SYNC requests if we are a slave but the link with our master
* is not ok... */
if (server.masterhost && server.replstate != REDIS_REPL_CONNECTED) {
addReplyError(c,"Can't SYNC while not connected with my master");
return;
}

/* SYNC can't be issued when the server has pending data to send to
* the client about already issued commands. We need a fresh reply
* buffer registering the differences between the BGSAVE and the current
* dataset, so that we can copy to other slaves if needed.
*
* NOTE(review): the condition inspects c->reply, i.e. pending *output*,
* while the error text below says "pending input" - confirm whether the
* message wording is intentional. */
if (listLength(c->reply) != 0) {
addReplyError(c,"SYNC is invalid with pending input");
return;
}

redisLog(REDIS_NOTICE,"Slave ask for synchronization");


/* Here we need to check if there is a background saving operation
* in progress, or if it is required to start one */
if (server.bgsavechildpid != -1) {
/* Ok a background save is in progress. Let's check if it is a good
* one for replication, i.e. if there is another slave that is
* registering differences since the server forked to save */
redisClient *slave;
listNode *ln;
listIter li;

/* Scan server.slaves for the first slave in WAIT_BGSAVE_END state; on
* a match the loop breaks with `ln` non-NULL and `slave` pointing at it. */
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
slave = ln->value;
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
}
if (ln) {
/* Perfect, the server is already registering differences for
* another slave. Set the right state, and copy the buffer:
* our own reply list is released and replaced by a duplicate
* of that slave's reply list. */
listRelease(c->reply);
c->reply = listDup(slave->reply);
c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
} else {
/* No way, we need to wait for the next BGSAVE in order to
* register differences */
c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
}
} else {
/* Ok we don't have a BGSAVE in progress, let's start one */
redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");

29 / 32
Redis · Arch_Platform

if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
addReplyError(c,"Unable to perform background save");
return;
}
/* The save was just started for this client, so it waits for its end. */
c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
}
/* Initialise the per-slave fields, flag the client as a slave and
* register it at the tail of server.slaves.
* NOTE(review): repldbfd = -1 presumably means "dump file not opened
* yet" - confirm against the code that sends the bulk payload. */
c->repldbfd = -1;
c->flags |= REDIS_SLAVE;
c->slaveseldb = 0;
listAddNodeTail(server.slaves,c);
return;
}
Snapshot bgsave serverCron()
backgroundSaveDoneHandler()
updateSlavesWaitingBgsave() bgsave Slave
updateSlavesWaitingBgsave() server.slaves
REDIS_REPL_WAIT_BGSAVE_START bgsave dump
REDIS_REPL_WAIT_BGSAVE_START
Slave Slave REDIS_REPL_SEND_BULK Master Slave
WRITEABLE Slave sendBulkToSlave() Slave
Master sendBulkToSlave() socket
Slave WRITEABLE Slave REDIS_REPL_ONLINE
Slave WRITEABLE Slave Slave

6.1.2. Slave
Redis Slave Master Slave
REDIS_REPL_ONLINE replicationCron()
syncWithMaster() Master
syncWithMaster() Master sync
readSyncBulkPayload() Master dump Slave syncRead()
syncWrite() Master IO
Slave
replicationCron() Master dump
dump Slave replicationAbortSyncTransfer()

6.2.
call() Master replicationFeedSlaves()
Slaves Slave Master
serverCron() Master Slave Master
DEL

30 / 32
Redis · Arch_Platform

replicationCron() Master Slave


Slave replicationCron() Ping Slave Master
Slave Ping

6.3.
slaveof Redis Slave slaveof no one
Slave Master Slave Master slaveof ip port
Slave
Redis
server.masterhost

31 / 32
Redis · Arch_Platform

[1] Redis, http://redis.io


[2] Binary-safe, http://en.wikipedia.org/wiki/Binary_safe
[3] malloc_size(), Mac OS X Manual Page, http://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/malloc_size.3.html
[4] BeansDB, http://code.google.com/p/beansdb/
[5] Redis -aof(append only file) , http://lgone.com/html/y2010/757.html
[6] Redis , http://www.petermao.com/category/redis

32 / 32

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy