Hi Alex.
I have been looking into this, but I have some trouble finding digests
matching your statistics. I get a much higher rate of single bit changes
(more in line with what I expected). Maybe the digests I have collected
are a-typical or your bit count is busted or something odd is going on
here which I can't explain..
Extreme example taken from bo2.us.ircache.net with several hours in
between the digests (31% of the bytes changed):
-- Basic information --
Size: 861667 bytes / 6893336 bits
Chaged bytes: 273016 (31%)
Changed bits: 320481 (4%)
-- Changed bits / byte --
0: 588651 68%/total
1: 229925 84%/diff 26%/total
2: 38965 14%/diff 4%/total
3: 3894 1%/diff 0%/total
4: 216 0%/diff 0%/total
5: 16 0%/diff 0%/total
6: 0 0%/diff 0%/total
7: 0 0%/diff 0%/total
8: 0 0%/diff 0%/total
Attached is a program to print fairly detailed statistics about digest
changes. Can you please run this on what you consider a couple of
typical digest updates and send me the result. The program assumes MIME
input (HTTP headers + body), if you have raw digests without HTTP
headers then remove the call to skipHttpHeader..
The goal you have set up is certainly within reach, but the algorithm
may have to differ depending on the change rate.. The main approach I am
leaning towards are huffman encoded changed bit offset with some
variations to handle higher bit change rates.
/Henrik
Alex Rousskov wrote:
>
> The best I can do is to give you the stats from our caches. Here is the
> percentage of changed _bytes_ between two consecutive digests from ten
> caches: (4.95%, 3.99, 2.83, 3.08, 5.49, 4.18, 3.41, 1.65, 7.24, 4.45%).
>
> I do not have bit stats, but here is a "typical" distribution of the number
> of changed bits in a _changed_ byte (1 means 1 bit has changed, 7 means 7
> bit have changed):
>
> #bits count percent
> 1 12605 ( 29.40%)
> 2 10201 ( 23.79%)
> 3 11856 ( 27.65%)
> 4 5903 ( 13.77%)
> 5 1916 ( 4.47%)
> 6 352 ( 0.82%)
> 7 43 ( 0.10%)
[ Part 2: "Attached Text" ]
/* cc -O -o cd_diff cd_diff.c */
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>
#include <assert.h>
#include <stdlib.h>
#include <sys/times.h>
typedef struct {
int size;
unsigned char *data;
} DataBuffer;
#define MIN(a,b) (a<b?a:b)
#define PERCENT(n,total) (((n)*100+50)/(total))
#define BTEST(b,buf) ((buf.data[(b)/8]&(1<<((b)&7)))!=0)
DataBuffer readFile(int fd)
{
DataBuffer buf;
int startpos = lseek(fd, 0, SEEK_CUR);
int endpos = lseek(fd, 0, SEEK_END);
int size = endpos - startpos;
buf.size = size;
buf.data = malloc(size);
assert(buf.data);
lseek(fd, startpos, SEEK_SET);
if (read(fd, buf.data, size) != size) {
perror("Unexpected read size");
exit(1);
}
return buf;
}
static int bits_in_byte[256];
void init_bitcount()
{
int i, j;
for (i=0; i<256; i++) {
bits_in_byte[i] = 0;
for( j=i ; j != 0 ; j = j>>1) {
if (j & 1) bits_in_byte[i] += 1;
}
}
}
long bitcount(DataBuffer buf)
{
int i;
long bits = 0;
for (i=0 ; i <buf.size; i++) {
bits += bits_in_byte[buf.data[i]];
}
return bits;
}
long bitblockcount(DataBuffer buf)
{
int i;
int lastbit = 0;
int bit;
long blocks = 0;
for (i=0 ; i <buf.size * 8; i++) {
bit = BTEST(i,buf);
if (bit && !lastbit)
blocks++;
lastbit = bit;
}
return blocks;
}
long bytecount(DataBuffer buf)
{
int i;
long bytes = 0;
for (i=0 ; i <buf.size; i++) {
if (buf.data[i]) bytes += 1;
}
return bytes;
}
void histBytes(DataBuffer buf) {
int i,j;
int bytes = bytecount(buf);
int bytecount[256];
int bitcount[9];
for(i=0; i<9;i++)
bitcount[i]=0;
for(i=0; i<256;i++)
bytecount[i]=0;
for(i=0; i<buf.size; i++) {
bytecount[buf.data[i]]+=1;
bitcount[bits_in_byte[buf.data[i]]]+=1;
}
printf("-- Changed bits/byte --\n");
printf("%d: %8d %2d%%/total\n", 0, bitcount[0], PERCENT(bitcount[0],buf.size) );
for (i=1; i<9; i++)
printf("%d: %8d %2d%%/diff %2d%%/total\n", i, bitcount[i], PERCENT(bitcount[i],bytes), PERCENT(bitcount[i],buf.size) );
printf("-- Byte differences (xor) --\n");
printf("0x%02x(0): %8d %2d%%/total\n", 0, bytecount[0], PERCENT(bytecount[0],buf.size) );
for(j=1;j<=8;j++)
for (i=1; i<256; i++)
if (bits_in_byte[i]==j)
printf("0x%02x(%d): %8d %2d%%/diff %2d%%/total\n", i, j, bytecount[i], PERCENT(bytecount[i],bytes), PERCENT(bytecount[i],buf.size) );
}
void histBits(DataBuffer buf) {
int i;
int len=0;
int lastbit=-1;
int bitlen[65536];
int bitdist[65536];
int bits = bitcount(buf);
int blocks = bitcount(buf);
for(i=0; i<65536;i++) {
bitlen[i]=0;
bitdist[i]=0;
}
for(i=0; i< 8*buf.size; i++) {
int bit = BTEST(i,buf);
if (bit && !lastbit)
bitdist[MIN(len,65536)]+=1;
else if (bit && lastbit)
bitdist[0]+=1;
else if (!bit && lastbit==1)
bitlen[MIN(len,65536)]+=1;
if ( bit != lastbit ) {
lastbit = bit;
len = 1;
} else {
len++;
}
}
printf("-- Bit block length --\n");
for (i=0; i<65536; i++)
if (bitlen[i])
printf("0x%04x: %6d %2d%%/blocks %2d%%/bits\n", i, bitlen[i], PERCENT(bitlen[i],blocks), PERCENT(bitlen[i]*i,bits));
printf("-- Bit distance --\n");
for (i=0; i<65536; i++)
if (bitdist[i])
printf("0x%04x: %6d %2d%%/blocks %2d%%/bits\n", i, bitdist[i], PERCENT(bitdist[i],blocks), PERCENT(bitdist[i],bits));
}
void diff(int fd1, int fd2) {
DataBuffer buf1, buf2;
DataBuffer xor;
int i,j;
init_bitcount();
buf1 = readFile(fd1);
buf2 = readFile(fd2);
assert(buf1.size == buf2.size);
/* Build a XOR of the two digests */
xor.size = buf1.size;
xor.data = malloc(xor.size);
assert(xor.data);
for(i=0; i<buf1.size ; i++) {
xor.data[i] = buf1.data[i] ^ buf2.data[i];
}
printf("-- Basic information --\n");
printf("Size: %d bytes / %d bits\n", xor.size, xor.size*8);
i = bytecount(xor);
printf("Chaged bytes: %d (%d%%)\n", i, PERCENT(i,xor.size));
i = bitcount(xor);
printf("Changed bits: %d (%d%%)\n", i, PERCENT(i,xor.size*8));
j = i;
i = bitblockcount(xor);
printf("Blocks of changed bits: %d (%d%% multibit)\n", i, PERCENT(j-i,j));
histBytes(xor);
histBits(xor);
}
static
int Open(const char *fname) {
int fd = open(fname, O_RDONLY);
if (fd < 0) {
perror(fname);
exit(errno);
}
return fd;
}
void skipHttpHeader(int fd)
{
char c, n;
n = 0;
while(n < 2 && read(fd, &c, 1) > 0) {
switch(c) {
case '\r':
break;
case '\n':
n += 1;
break;
default:
n = 0;
}
}
}
int main(int argc, char *argv[]) {
int fd1, fd2;
if (argc != 3) {
fprintf(stderr, "usage: %s <digest_old> <digest_new>\n", argv[0]);
return -1;
}
fd1 = Open(argv[1]);
skipHttpHeader(fd1);
fd2 = Open(argv[2]);
skipHttpHeader(fd2);
diff(fd1, fd2);
close(fd1);
close(fd2);
return 0;
}
Received on Tue Jul 29 2003 - 13:15:59 MDT
This archive was generated by hypermail pre-2.1.9 : Tue Dec 09 2003 - 16:12:16 MST