#------------------------------------------------------- # Author J.F.Craine 1999 mailto:j.f.craine@computer.org # Set TAB to 2 columns if you want to read this! #------------------------------------------------------- $HASHBASE = 1001; # Used in computing hash function $DICTIONARYLINES=83250; # Number of lines in dictionary part of file $DICTIONARYFILE ='sep.txt'; # Name of dictionary file $MAILPROG = '/usr/sbin/sendmail'; # Address of server mail handler $LOGFILE = 'ManxLog.txt'; # name of log file # Enter the email address for who the mail is to and from... looks like it's me! $mail_from = 'j.f.craine@computer.org'; $recipient = 'j.f.craine@embedded-systems.ltd.uk'; # Enter the path to the mail program use Fcntl ':flock'; # import LOCK_* constants # Start by getting parameters. Normally these will be from the form on the web page, # but we can test the script under DOS by parameters in the command line. $PC = open(READ, "PCflag"); # Are we running on the PC or server? if ($PC) { # running on PC; get parameters from command line close(READ); require ("C:/Perl/lib/cgi-lib.pl"); foreach $argc(@ARGV) { # Split into key and value. ($key, $val) = split(/=/,$argc); # splits on the first =. # Associate key and value $in{$key} .= "\0" if (defined($in{$key})); # \0 is the multiple separator $in{$key} .= $val; } } else { # running on server; get parameters from environment require ("/usr/local/lib/perl5/cgi-lib.pl"); require CGI::Apache; &ReadParse; GetOrigin(); # Try to do a reverse DNS lookup to get sender's ID # Output "magic" header that tells things that we have a web page coming out: print "Content-Type: text/html\n\n"; } $word = $in{'word'}; $any = ($in{'any'} eq "1") ? "1" : "0"; $phrases = ($in{'phrases'} eq "1") ? "1" : "0"; $E2M = ($in{'E2M'} eq "1") ? "1" : "0"; $mutate = ($in{'mutate'} eq "1") ? "1" : "0"; if ($word =~ /\+/) { $mutate = "0"; } # Only mutate single words $word =~ s/\+/ /g; if ($E2M) { $mutate = "0"; $dictionary = "engldict.txt"; $lineFile = "compress.txt"; processLine($word,$any); } else { $dictionary = "manxdict.txt"; $lineFile = "faastey.txt"; } if ($mutate == 1) { getMutations($word,$any); } else { processLine($word,$any); } logRequest(); # Log ths access #--------------------------- # Process the text in this line, # changing CHs to QHs in a bid to handle ccedilla. # Display a list of the words found. sub processLine() { my $Line; ($Line,$any) = @_; showLinesFound($Line, qh2Ch($Line), $any); } sub getMutations() { my $word, @prefixes, $mutated, $line_List, $targetList, $baseWord, $any; %demutations = ('^a'=>'fa', '^b'=>'p', '^d'=>'t', '^e'=>'fe', '^g'=>'c|k', '^h'=>'ch|s|sh|t|th|', '^i'=>'fi', '^j'=>'ch', '^l'=>'sl', '^m'=>'b', '^n'=>'d|dh|g','^o'=>'fo', '^r'=>'fr', '^t'=>'s', '^u'=>'fu', '^v'=>'b|f|m', '^w'=>'bw|mw|b|m', '^y'=>'j', '^ch'=>'c|k|sh', '^cl'=>'sl', '^dh'=>'th', '^gh'=>'d|dh|g', '^gu'=>'qu', '^na'=>'a', '^ne'=>'e', '^ng'=>'d|gu', '^ni'=>'i', '^no'=>'o', '^nu'=>'u', '^ny'=>'j', '^ph'=>'p', '^tr'=>'str', '^wh'=>'qu', '^ngh'=>'d', '^qh' => 'sh'); ($baseWord,$any) = @_; $word = $baseWord; # Get basic word # $word =~ s/^qh/ch/; # Make sure that we pick up mutation at start $line_List = findLines($word); # Start with no lines in list $targetList = ""; # No candidate mutations yet foreach $mutated (sort keys(%demutations)) { # For each possible prefix get demutations @letters = split(/\|/, $demutations{$mutated}); # and split into a list $word = $baseWord; # Get basic word if ($word =~ $mutated) { # If word starts with this sequence foreach $demutator (@letters) { # For each possible prefix... $demutatedWord = $word; $demutatedWord =~ s/$mutated/$demutator/; # try to undo mutation $newList = qh2Ch($demutatedWord); $targetList = $demutatedWord . "&" . $targetList; # put candidate in list of possible mutations $line_List = orSets($line_List,$newList); } #foreach } } #foreach showLinesFound($targetList, $line_List, $any); # and then look up the result in the dictionary } # Handle cedilla - assume that there can be up to 3 on a line sub qh2Ch() { my $Line, $lineList; ($Line) = @_; $lineList = findLines($Line); if ($E2M) { return $lineList; } # Ugly escape if translating from English @dummy = split(/ch/i,$Line); if ($#dummy == 1) { $lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1])); } else { if ($#dummy == 2) { $lineList = orSets($lineList,findLines(@dummy[0] . "ch" . @dummy[1] . "qh" . @dummy[2])); $lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "ch" . @dummy[2])); $lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "qh" . @dummy[2])); } else { if ($#dummy == 3) { # Not sure whether we ever get three CHs in a line that often! $lineList = orSets($lineList,findLines(@dummy[0] . "ch" . @dummy[1] . "qh" . @dummy[2] . "ch" . @dummy[3])); $lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "ch" . @dummy[2] . "ch" . @dummy[3])); $lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "qh" . @dummy[2] . "ch" . @dummy[3])); $lineList = orSets($lineList,findLines(@dummy[0] . "ch" . @dummy[1] . "ch" . @dummy[2] . "qh" . @dummy[3])); $lineList = orSets($lineList,findLines(@dummy[0] . "ch" . @dummy[1] . "qh" . @dummy[2] . "qh" . @dummy[3])); $lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "ch" . @dummy[2] . "qh" . @dummy[3])); $lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "qh" . @dummy[2] . "qh" . @dummy[3])); } } } return $lineList; } /*--------------------------- # Find matches for the text in this line, # splitting the line into words # remove punctuation # handle prefixes such as "aa-" # put everything into lower case # If the computed index is within the limits specified, # put line number into incidence table. # Return the incidence table */ function findLines() { local $b,@words,$index,$Line,$linesread,@indices,@dictionarylines; ($Line) = @_; # Get a list containing the hash code of each word on the line and # sort it so that we can do a sequential search in the next phase: @dictionarylines = (); @indices = (); $Line =~ s/ h / /; # Get rid of separated 'h's. $Line =~ s/[\(\),;!\?-]/ /g; # Remove punctuation and replace with white space $Line =~ s/ / /g; # Replace multiple spaces with single $Line =~ tr/A-Z/a-z/; # Convert everything to lower case $Line =~ s/&cced;h/qh/g; # Handle c cedille @words = split(/[\s]/,$Line); # Split up result into component words foreach $b(@words) { # Append number of this line to record. if ($b ne "") { # Don't want null strings $index = getHash($b); # Look up index push(@indices, sprintf("%6d %s",$index, $b)); # add it to set of indices } } @indices = sort {$a <=> $b} (@indices); # now sort the list into ascending order ready for next phase. # For each word and hash code, read the line corresponding to the # hash code and find out the position of the word that we're looking # up on that line of the file: open(READ, $dictionary); # Open the file $linesread=0; # and count lines from 0 foreach $entry(@indices) { # For each word that we're looking up @items = split(/ +/,$entry); # get the word and its hash code $word = pop(@items); $index = pop(@items); while ($index >= $linesread) { # Go to the appropriate line $linesread++; # keeping track of line numbers $Line = ; } # Work out the index number for that word $offset = getIndexInRecord($word, $Line, $index); if ($offset >= 0) { # Found the word - add its index to set of indices. # Use leading zeroes to facilitate sorting later on. push(@dictionarylines, sprintf("%07d",$offset)); } } close(READ); # Now we have a list of index numbers for the words in the line to be analysed. # Sort index numbers into ascending order so that we can read the line numbers # file sequentially. @dictionarylines = sort(@dictionarylines); # For each index number there is a list of line numbers in which the # corresponding word is to be found in the dictionary. Now that they're in # ascending order we can search the file and pick the lines that we want. @dictionaryentries = (); # Null list open(READ, $lineFile); # Open list of lines $linesread=0; # Zero line count foreach $entry(@dictionarylines) { while ($entry >= $linesread) { # Scan for next line $linesread++; $Line = ; } # Found it - convert data on line from compressed form and add it to set of indices push(@dictionaryentries, decompress($Line)); } close(READ); # If there is more than one word in a line, we'll have a list for each word # showing the lines where it is to be found. We want to find an entry in which # ALL the words appear. sort(@dictionaryentries); if ($#dictionaryentries > 0) { # More than one word in line $lines = shift(@dictionaryentries); foreach $entry(@dictionaryentries) { $lines = andSets($lines,$entry); } } else { $lines = shift(@dictionaryentries); } return $lines } # Convert packed record to a list of numbers @numbers # Record is concatenation of 3-character groups # Each character x in group has ASCII code 48<=x<=111 sub decompress { my $in, $out, $num, $i; ($in) = @_; $out = ""; while (length($in)>2) { for ($num=0,$i=0; $i<3; $i++) { $num = 64 * $num + (ord $in) - 48; $in =~ s/^.//; } $out .= " " . $num; } return $out } #--------------------------- # Call with three parameters: # (1) List of line numbers to get # (2) List of possible target words if looking for mutations # (3) 1 => looking for exact match, 0 => any match will do sub showLinesFound() { my $lines, $targets, @targets, @halves, $target, $Manx; ($targets, $lines, $matchAny) = @_; @targets = split(/\&/,$targets); @linesRead = (); @sorted = (" "); # The string $lines is a list of the lines which contain entries matching # the word that we are looking up. Look up the line corresponding to each entry. # Read the file sequentially, and get each entry. @lines = split(/ +/,$lines); # Split the string up into a list sort(@lines); # Sort it so that we can do a sequential read if ($#lines) { # if there are any entries to look up open(READ, $DICTIONARYFILE); $Line = ; $linesread=1; foreach $index(@lines) { if ($index ne "" && $index != -1) { while ($index > $linesread) { $Line = ; $linesread++; } if ($linesread <= $DICTIONARYLINES || $phrases) { chomp($Line); if ($matchAny) { push(@linesRead, $Line); } else { $Manx = $Line; $Manx =~ s/.*#//; # Get the Manx half of entry $Manx =~ s/[-`]//; foreach $target(@targets) { $target =~ s/[-`]//; if ($Manx eq $target) { push(@linesRead, $Line); } } } } } } close(READ); foreach $Line(@linesRead) { $Line =~ s/c\|h/chq/g; $Line =~ s/C\|h/Chq/g; $Line =~ s/#/\t/; if (!$E2M) { @halves = split(/\t/,$Line); $Line = sprintf("%s\t%s", @halves[1], @halves[0]); } } @sorted = sort {uc($a) cmp uc($b)} @linesRead; @linesRead=(); foreach $Line(@sorted) { if ($Line =~ /^.*[- ].*\t/) { push(@linesRead, $Line); } else { unshift(@linesRead, $Line); } } print "\n\n

"; if ($E2M) { printf("\n
ENGLISHMANX"); } else { printf("\n\n\n
MANXENGLISH"); } foreach $Line(@linesRead) { $Line =~ s/chq/\çh/g; # Change chq to çh $Line =~ s/Chq/\Çh/g; # Change Chq to Çh $Line =~ s/`//g; # Remove ` characters # Format to make a line of HTML table @halves = split(/\t/,$Line); $latest = @halves[0]; if ($previous eq $latest) { printf(", %s", @halves[1]); } else { printf("
%s%s", @halves[0], @halves[1]); } $previous = $latest; } printf("
\n"); } else { print "Sorry, no matching entry was found!\n"; } print "\n"; } #--------------------------- # Call with two strings containing space delimited numbers # Returns a string containing numbers that appear in either list. sub orSets() { my $a,$b,$c; ($a,$b) = @_; $c = $a . " " . $b; return uniqSet($c); } #--------------------------- # Call with a string containing space delimited numbers # Returns string with same numbers sorted in ascending number # and duplicate entries deleted. sub uniqSet() { local $a,$b; my $c,$i,@a; ($a) = @_; @a = split(/ +/, $a); @a = sort {$a <=> $b} (@a); $c = @a[0]; $b = $c; for ($i=1; $i<=$#a; $i++) { if (@a[$i] != $c) { $b = $b . " " . @a[$i]; } $c = @a[$i]; } return $b; } #--------------------------- # Call with two strings containing space delimited numbers # Returns a string containing those numbers that appear in both lists. sub andSets() { my $c,$i,$j,@a,@b; local $a,$b; ($a,$b) = @_; @a = sort {$a <=> $b} (split(/ +/,$a)); @b = sort {$a <=> $b} (split(/ +/,$b)); for ($c = '', $i=0; $i<=$#a; $i++) { for ($j=0; $j<=$#b; $j++) { if (@a[$i]==@b[$j]) { $c = $c . " " . @a[$i]; } } } return $c; } #--------------------------- # getIndexInRecord($word, $Line, $index) # Work out number corresponding to this word by counting # along the current line until a match is found. # Return the number if a match is found, -1 if not. sub getIndexInRecord { my $word, $Line, @entries, $entry, $i; ($word, $Line, $hash) = @_; @entries = split(/-/,$Line); # Split up to get rid of leading number $Line = shift(@entries); $Line = shift(@entries); @entries = split(/ /,$Line); # Split up words in record # Scan entries on line looking for a match. # If found, derive return code from position of word $i=0; foreach $entry(@entries) { if ($word eq $entry) { return $i+$hash; } $i += $HASHBASE; } return -1; } #--------------------------- # $hashcode = gethash($word) # Return a hash code for this word (0 <= $hash <= 1000) # This hash function mustn't be changed as it is the same # one used to computer tables! sub getHash { my $hash; my ($b) = @_; while ($b ne "") { $hash = $hash*17 + ord($b); # 17 and 1001 co-prime $hash %= 1001; $b =~ s/^.//; } return $hash; } sub logRequest() { $now = localtime(); # Cut off time and year to give day and date. # In theory this means that we could alias # every six years or so, but honestly... $today = substr($now,0,10); # Lock the log file and read it to see if it contains old entries. # If there are entries from previous days, read the whole file. # We shouldn't need to write back any of today's data except our # query because this action should only be carried out once per day. flock(LOG,LOCK_EX); open(LOG, $LOGFILE); # $then = ; # $then = substr($then,0,10); $then = substr(,0,10); $doUpdate = $today ne $then; close(LOG); if ($doUpdate) { # OK - read current file contents to @lines open(LOG, $LOGFILE); @lines = ; close(LOG); open(LOG, ">$LOGFILE"); # and reopen it for writing } else { open(LOG, ">>$LOGFILE"); # otherwise open it for appending } # Now either write as the first record of a new day's log file or # append a record to an existing file print LOG "$now,$E2M,$any,$mutate,$phrases,$in{'word'},"; print LOG "$ENV{'REMOTE_ADDR'},"; print LOG "$ENV{'HTTP_USER_AGENT'},"; print LOG "$ORIGIN\n"; # Finished working with the log file so close and unlock it for anybody else to play with close(LOG); flock(LOG,LOCK_UN); if ($doUpdate) { # A new day, so set about constructing an email if ($PC) { # running on PC; send "mail" to dump file open (MAIL, ">>maildump.txt"); } else { # running on server - do a real mailing open (MAIL, "|$MAILPROG $recipient"); } print MAIL "Subject: Manx dictionary Log\n"; print MAIL "To: $recipient\n"; print MAIL "Reply-to: $mail_from\n"; print MAIL "From: Manx Dictionary\n\n"; chomp @lines; foreach $record(@lines) { print MAIL "$record\n"; } close (MAIL); } }