Adrian,
Here are some tests on our access.log.
[root@proxy1 logs]# cat access.log | awk '{ print $7 }' > /tmp/crap.txt
[root@proxy1 logs]# cat /tmp/crap.txt | sort | uniq > /tmp/crap-sorted.txt
[root@proxy1 logs]# wc /tmp/crap-sorted.txt
347722 347722 18528052 /tmp/crap-sorted.txt
[root@proxy1 logs]# cat /tmp/crap-sorted.txt | /tmp/md5 > /tmp/crap-sorted-md5.txt
[root@proxy1 logs]# time cat /tmp/crap-sorted.txt | /tmp/md5 > /tmp/crap-sorted-md5.txt
0.00user 0.15system 0:09.24elapsed 1%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (83major+10minor)pagefaults 0swaps
! Note: the md5 program (/tmp/md5) is md5sum rewritten to produce a hash for each input line.
[root@proxy1 logs]# wc /tmp/crap-sorted-md5.txt
347723 347722 11474827 /tmp/crap-sorted-md5.txt
[root@proxy1 logs]# cat /tmp/crap-sorted-md5.txt | sort | uniq | wc
347723 347722 11474827
[root@proxy1 logs]# cat /tmp/crap-sorted-md5.txt | cut -c -10 | sort | uniq | wc
347723 347722 3824943
[root@proxy1 logs]# cat /tmp/crap-sorted-md5.txt | cut -c -9 | sort | uniq | wc
347722 347721 3477211
[root@proxy1 logs]# cat /tmp/crap-sorted-md5.txt | cut -c -8 | sort | uniq | wc
347711 347710 3129391
[root@proxy1 logs]# cat /tmp/crap-sorted-md5.txt | cut -c -7 | sort | uniq | wc
347483 347482 2779857
[root@proxy1 logs]# cat /tmp/crap-sorted-md5.txt | cut -c -6 | sort | uniq | wc
344111 344110 2408771
>
> Does anyone have figures for collisions of URL names when md5'ed ?
> I'm curious to know what it is like in the real world ..
>
> Thanks,
>
> Adrian
>
> --
> Adrian Chadd
> <adrian@creative.net.au>
>
>
-- Stephen Baxter CCNA SE Network Access/Big Networks Australia CHECK OUT OZBYTES http://www.ozbytes.net.au Sound Bytes - 50 artists hosted and growing phone : +61 8 8221 5221 222 Grote Street fax : +61 8 8221 5220 Adelaide 5000, Australia
Received on Tue Jul 29 2003 - 13:15:59 MDT
This archive was generated by hypermail pre-2.1.9 : Tue Dec 09 2003 - 16:12:16 MST