關於純真IP數據庫的格式。本文參考了Luma大蝦的文章和LumaQQ的IPSeeker類的代碼,用C語言編寫。本來是想寫個專門讀純真IP數據庫的類的,所以本文的代碼本來是個測試代碼,後來懶了就沒繼續做下去,所以代碼看上去比較亂,嘿嘿。
//file:getlist.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <assert.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
/* Path of the database file that main() opens. */
#define IP_DBPATH "/usr/local/LumaQQ/QQWry.dat"
/* Number of bytes main() reads from the start of the file as the header. */
#define HEAD_SIZE 8
/* Base address of the mmap()ed database; assigned in main(), read by read_record(). */
char* record_mmap;
/* printf format for four zero-padded, dot-separated decimal fields. */
#define IP_QUAD_FORMAT_STRING "%03u.%03u.%03u.%03u"
/*
 * Expands to four comma-separated printf arguments: the bytes at x[3], x[2],
 * x[1], x[0], in that order, i.e. a 4-byte little-endian value printed most
 * significant byte first. Intended for use with IP_QUAD_FORMAT_STRING.
 *
 * The argument is parenthesized before the cast so that an expression such as
 * `base + i` is converted to a byte pointer as a whole; each byte is cast to
 * unsigned so the argument type matches the %u conversions. Because the macro
 * expands to an argument list it cannot itself be wrapped in parentheses, and
 * x is evaluated four times, so it must not have side effects.
 */
#define IP_QUAD_LE(x) \
    (unsigned)((const unsigned char *)(x))[3], \
    (unsigned)((const unsigned char *)(x))[2], \
    (unsigned)((const unsigned char *)(x))[1], \
    (unsigned)((const unsigned char *)(x))[0]
/*
 * Fatal-error helper: print msg together with the text for the current errno
 * (via perror) on stderr, then terminate the process with exit status 1.
 * This function does not return.
 */
void oops(const char* msg)
{
    perror(msg);
    exit(1);
}
/*
 * Decode a 4-byte little-endian unsigned integer.
 *
 * data:    pointer to at least 4 readable bytes, least significant byte first.
 * returns: the decoded value, always in the range 0 .. 0xFFFFFFFF.
 *
 * Every byte is widened to unsigned long before it is shifted. Shifting the
 * promoted int `p[3] << 24` overflows int when the byte is >= 0x80, and the
 * resulting negative value would be sign-extended on return where
 * unsigned long is wider than 32 bits.
 */
unsigned long int_of_4byte_LE(const char* data)
{
    const unsigned char *p = (const unsigned char *)data;

    return  (unsigned long)p[0]
         | ((unsigned long)p[1] << 8)
         | ((unsigned long)p[2] << 16)
         | ((unsigned long)p[3] << 24);
}
/*
 * Decode a 3-byte little-endian unsigned integer.
 *
 * data:    pointer to at least 3 readable bytes, least significant byte first.
 * returns: the decoded value, in the range 0 .. 0xFFFFFF.
 */
unsigned long int_of_3byte_LE(const char* data)
{
    const unsigned char *bytes = (const unsigned char *)data;
    unsigned long value = 0;
    int i;

    /* Fold from the most significant byte (index 2) down to index 0. */
    for (i = 2; i >= 0; i--) {
        value = (value << 8) | bytes[i];
    }
    return value;
}
/*
 * Resolve the territory string that starts at p.
 *
 * While the byte at p is 0x01 or 0x02, the following 3 bytes are taken as a
 * little-endian offset from record_mmap and followed. An offset of 0 would
 * point at the file header rather than at a string, so it is reported as an
 * empty territory instead of being dereferenced.
 * NOTE(review): LumaQQ's IPSeeker treats a zero area offset as "unknown
 * area"; confirm an empty field is the desired output here.
 */
static const char *resolve_territory(const char *p)
{
    while (p[0] == 0x01 || p[0] == 0x02) {
        unsigned long target = int_of_3byte_LE(p + 1);

        if (target == 0) {
            return "";
        }
        p = record_mmap + target;
    }
    return p;
}

/*
 * Print one line for the record located `offset` bytes into record_mmap:
 *
 *     AAA.BBB.CCC.DDD \t country \t territory \n
 *
 * The first 4 bytes of the record are printed as a dotted quad (stored
 * little-endian, printed most significant byte first). The byte after them
 * selects how the two strings are located:
 *   0x01  - the next 3 bytes are an offset to where the record continues;
 *           such redirects are followed until another marker is seen.
 *   0x02  - the next 3 bytes are the offset of the country string; the
 *           territory follows right after those 4 bytes.
 *   other - the country string starts here, NUL-terminated, with the
 *           territory immediately after its terminator.
 * The territory itself may be redirected; see resolve_territory().
 *
 * No bounds checking is done against the size of the mapping; the caller is
 * responsible for passing an offset that lies inside it.
 */
void read_record(off_t offset)
{
    const char *p = record_mmap + offset;
    const char *country;
    const char *territory;

    printf(IP_QUAD_FORMAT_STRING, IP_QUAD_LE(p));
    p += 4;

    /* Whole-record redirects: chase them until we reach real data. */
    while (p[0] == 0x01) {
        p = record_mmap + int_of_3byte_LE(p + 1);
    }

    if (p[0] == 0x02) {
        country = record_mmap + int_of_3byte_LE(p + 1);
        territory = resolve_territory(p + 4);
    } else {
        country = p;
        territory = resolve_territory(strchr(p, '\0') + 1);
    }

    printf("\t%s\t%s\n", country, territory);
}
/*
 * Read consecutive 7-byte index entries from the descriptor `db`, starting at
 * its current file position and continuing to end of file. For each entry,
 * bytes 4..6 are decoded as a little-endian record offset and passed to
 * read_record(); bytes 0..3 are not used here.
 *
 * A read error terminates the process through oops(); an entry cut short by
 * end of file is reported on stderr and terminates with status 1. These are
 * real runtime checks rather than assert(), so they still apply when the
 * program is built with NDEBUG.
 */
void iterate_index(int db)
{
    char record[7];

    for (;;) {
        size_t have = 0;

        /* read() may return fewer bytes than requested; keep filling. */
        while (have < sizeof record) {
            ssize_t n = read(db, record + have, sizeof record - have);

            if (n < 0) {
                oops("read index error!");
            }
            if (n == 0) {
                break;
            }
            have += (size_t)n;
        }

        if (have == 0) {
            return; /* clean end of file: no more entries */
        }
        if (have != sizeof record) {
            fprintf(stderr, "truncated index entry: got %lu of %lu bytes\n",
                    (unsigned long)have, (unsigned long)sizeof record);
            exit(1);
        }

        read_record((off_t)int_of_3byte_LE(record + 4));
    }
}
int main(int argc, char** argv)
{
char buf[HEAD_SIZE];
unsigned int index_beg,index_end;
int db;
if( (db = open(IP_DBPATH,O_RDONLY)) == -1 )oops("opendb error!");
if(read(db,buf,HEAD_SIZE) != HEAD_SIZE)oops("read data header error!");
index_beg=int_of_4byte_LE(buf);
index_end=int_of_4byte_LE(buf+4);
assert((index_end - index_beg)%7 == 0);
//printf("index start from:%#08X\n"
// " end at:%#08X\n",
// index_beg,index_end);
record_mmap = mmap(NULL,index_beg,PROT_READ,MAP_SHARED,db,0);
if(record_mmap == (void *) -1)oops("mmap error");
lseek(db,index_beg - 8,SEEK_CUR);
// printf("indexes:\n");
iterate_index(db);
}
使用麼就執行
./getlist > chunzhenip.GB
./getlist | iconv -f GB -t UTF-8 > chunzhenip.UTF-8
格式是
ip段\t一級地名\t二級地名\n
然後就可以用awk/sed/perl這些文本分析工具來分析了。