use strict;
use warnings;

# Word-frequency report for TWiki topic files.
#
# Reads every "*.txt" file (dot-files excluded) found directly inside the
# topic directory, strips markup and punctuation from each line, splits
# WikiWords into their component words, and tallies how often each
# lower-cased word occurs.  The report printed to STDOUT consists of the
# total number of words counted, the number of distinct words, and one line
# per distinct word, most frequent first.
#
# Dies if the topic directory or any topic file cannot be opened.

# Directory that holds the topic files, relative to the current directory.
my $topic_dir = 'TWiki';

# Pieces of a split WikiWord that match this pattern anywhere are left out
# of the tally.  The pattern is unanchored, exactly as in the original test.
my $meta_piece_re = qr/(METAFILEATTACHMENTname)|(METATOPICINFOauthor)|(METATOPICPARENTname)|(METAFIELDname)|(METAFORMname)/;

my $words = 0;    # total words tallied; starts at 0 so an empty run prints "0"
my %wordhash;     # lower-cased word => number of occurrences

opendir(my $dir_handle, $topic_dir)
    or die "Can't open directory '$topic_dir': $!\n";

while (defined(my $file = readdir($dir_handle))) {
    next if $file =~ /^\./;
    next if $file !~ /\.txt$/;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and the handle cannot leak between files.
    open(my $topic_fh, '<', "$topic_dir/$file")
        or die "Can't open '$file': $!\n";

    while (my $line = <$topic_fh>) {
        # replace anything that looks like <...> with a space
        $line =~ s!<[^>]*>! !g;
        # replace - and . with spaces, so we catch hyphenated words
        $line =~ s![-.]! !g;
        # everything that is not a letter, digit or space becomes a space
        # (this also turns the trailing newline into a space)
        $line =~ s![^a-z0-9 ]! !ig;
        # remove individual character words (e.g. 'a', 'I')
        $line =~ s!\b.\b! !ig;
        # remove things that are just numbers
        $line =~ s!\b[0-9]+\b!!ig;

        foreach my $word (split ' ', $line) {
            if ($word =~ m!^[A-Z]+[a-z]+[A-Z]+[a-zA-Z0-9]*\b!) {
                # WikiWord: put a space at every lower-case -> upper-case
                # or lower-case -> digit transition, then count each piece.
                # $1/$2 are the correct replacement-side spelling of \1/\2.
                $word =~ s!([a-z])([A-Z0-9])!$1 $2!g;
                foreach my $twiki_word (split(/ /, $word)) {
                    next if $twiki_word =~ $meta_piece_re;
                    $words++;
                    $wordhash{ lc($twiki_word) }++;
                }
            }
            else {
                $words++;
                $wordhash{ lc($word) }++;
            }
        }
    }

    close($topic_fh);
}

closedir($dir_handle);

print "counted $words words\n";
my $uniq_words = keys %wordhash;
printf "%8d Unique words\n", $uniq_words;

# Most frequent first; words with the same count are ordered alphabetically
# so that the report is identical from run to run.
foreach my $entry (
    sort { $wordhash{$b} <=> $wordhash{$a} or $a cmp $b } keys %wordhash
) {
    printf "%8d %s\n", $wordhash{$entry}, $entry;
}
--
MattWalsh - 16 Apr 2003