#-------------------------------------------------------
# Author J.F.Craine 1999 mailto:j.f.craine@computer.org
# Set TAB to 2 columns if you want to read this!
#-------------------------------------------------------

	$HASHBASE = 1001;						# Used in computing hash function
	$DICTIONARYLINES=83250;			# Number of lines in dictionary part of file
	$DICTIONARYFILE ='sep.txt'; # Name of dictionary file
	$MAILPROG = '/usr/sbin/sendmail';  # Address of server mail handler  
	$LOGFILE  = 'ManxLog.txt';	# name of log file
	# Enter the email address for who the mail is to and from... looks like it's me!
	$mail_from = 'j.f.craine@computer.org';
  $recipient = 'j.f.craine@embedded-systems.ltd.uk';
  # Enter the path to the mail program
	use Fcntl ':flock';					# import LOCK_* constants

# Start by getting parameters. Normally these will be from the form on the web page,
# but we can test the script under DOS by parameters in the command line.
  $PC = open(READ, "PCflag");	# Are we running on the PC or server?
  if ($PC) {									# running on PC; get parameters from command line
  	close(READ);
  	require ("C:/Perl/lib/cgi-lib.pl");
		foreach $argc(@ARGV) {				# 	Split into key and value.  
    	($key, $val) = split(/=/,$argc); # splits on the first =.

# Associate key and value
    	$in{$key} .= "\0" if (defined($in{$key})); # \0 is the multiple separator
    	$in{$key} .= $val;
	  }
  }	
  else {								# running on server; get parameters from environment
  	require ("/usr/local/lib/perl5/cgi-lib.pl");
		require CGI::Apache;
  	&ReadParse;
		GetOrigin();				# Try to do a reverse DNS lookup to get sender's ID
#	Output "magic" header that tells things that we have a web page coming out:
  	print "Content-Type: text/html\n\n";
	}
 	$word    =  $in{'word'};
	$any     = ($in{'any'}     eq "1") ? "1" : "0";
	$phrases = ($in{'phrases'} eq "1") ? "1" : "0";
	$E2M     = ($in{'E2M'}     eq "1") ? "1" : "0";
	$mutate  = ($in{'mutate'}  eq "1") ? "1" : "0";
	if ($word =~ /\+/) { $mutate = "0"; } # Only mutate single words
	$word =~ s/\+/ /g;

	if ($E2M) {
		$mutate = "0";
		$dictionary = "engldict.txt";
		$lineFile = "compress.txt";
		processLine($word,$any);
	} else {
		$dictionary = "manxdict.txt";
		$lineFile = "faastey.txt";
  }
 	if ($mutate == 1) { getMutations($word,$any); }
	else       				{ processLine($word,$any); }
  logRequest();			# Log ths access
  
#---------------------------
#	Process the text in this line,
#				changing CHs to QHs in a bid to handle ccedilla.
#	Display a list of the words found.

sub processLine()
{
	my $Line;
	($Line,$any) = @_;
	showLinesFound($Line, qh2Ch($Line), $any);
}

sub getMutations()
{
	my $word, @prefixes, $mutated, $line_List, $targetList, $baseWord, $any;
  %demutations = ('^a'=>'fa',  '^b'=>'p',     '^d'=>'t',          '^e'=>'fe',			'^g'=>'c|k',
  						 '^h'=>'ch|s|sh|t|th|',         '^i'=>'fi',         '^j'=>'ch',			'^l'=>'sl',
  						 '^m'=>'b',			 '^n'=>'d|dh|g','^o'=>'fo', 				'^r'=>'fr',			'^t'=>'s',
               '^u'=>'fu',     '^v'=>'b|f|m', '^w'=>'bw|mw|b|m',  '^y'=>'j',			'^ch'=>'c|k|sh',
               '^cl'=>'sl',  	 '^dh'=>'th',   '^gh'=>'d|dh|g', 		'^gu'=>'qu',		'^na'=>'a',
               '^ne'=>'e',     '^ng'=>'d|gu', '^ni'=>'i',   			'^no'=>'o',			'^nu'=>'u',
               '^ny'=>'j',		 '^ph'=>'p',    '^tr'=>'str', 			'^wh'=>'qu',		'^ngh'=>'d',
               '^qh' => 'sh');
	
	($baseWord,$any) = @_;
	$word = $baseWord;																# Get basic word															
#	$word =~ s/^qh/ch/;																# Make sure that we pick up mutation at start
	$line_List = findLines($word);										# Start with no lines in list
	$targetList = "";																	# No candidate mutations yet
	foreach $mutated (sort keys(%demutations)) {			# For each possible prefix get demutations
		@letters = split(/\|/, $demutations{$mutated});	# and split into a list
		$word = $baseWord;															# Get basic word															
		if ($word =~ $mutated) {												# If word starts with this sequence
			foreach $demutator (@letters) {								# For each possible prefix...
				$demutatedWord = $word;
				$demutatedWord =~ s/$mutated/$demutator/;		# try to undo mutation
				$newList = qh2Ch($demutatedWord);
				$targetList = $demutatedWord . "&" . $targetList;	# put candidate in list of possible mutations
				$line_List = orSets($line_List,$newList);
			}	#foreach
		}   
	}     #foreach
	showLinesFound($targetList, $line_List, $any);		# and then look up the result in the dictionary
}

# Handle cedilla - assume that there can be up to 3 on a line
sub qh2Ch()
{
	my $Line, $lineList;
	($Line) = @_;
	$lineList = findLines($Line);
	if ($E2M) { return $lineList; }  # Ugly escape if translating from English
	@dummy = split(/ch/i,$Line);
	if ($#dummy == 1) {
		$lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1]));
	}
	else {
		if ($#dummy == 2) {
			$lineList = orSets($lineList,findLines(@dummy[0] . "ch" . @dummy[1] . "qh" . @dummy[2]));
			$lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "ch" . @dummy[2]));
			$lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "qh" . @dummy[2]));
		}
		else {
			if ($#dummy == 3) { # Not sure whether we ever get three CHs in a line that often!
				$lineList = orSets($lineList,findLines(@dummy[0] . "ch" . @dummy[1] . "qh" . @dummy[2] . "ch" . @dummy[3]));
				$lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "ch" . @dummy[2] . "ch" . @dummy[3]));
				$lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "qh" . @dummy[2] . "ch" . @dummy[3]));
				$lineList = orSets($lineList,findLines(@dummy[0] . "ch" . @dummy[1] . "ch" . @dummy[2] . "qh" . @dummy[3]));
				$lineList = orSets($lineList,findLines(@dummy[0] . "ch" . @dummy[1] . "qh" . @dummy[2] . "qh" . @dummy[3]));
				$lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "ch" . @dummy[2] . "qh" . @dummy[3]));
				$lineList = orSets($lineList,findLines(@dummy[0] . "qh" . @dummy[1] . "qh" . @dummy[2] . "qh" . @dummy[3]));
			}
		}
	}
	return $lineList;
}

/*---------------------------
#	Find matches for the text in this line,
#				splitting the line into words
#				remove punctuation
#				handle prefixes such as "aa-"
#				put everything into lower case
#	If the computed index is within the limits specified,
#				put line number into incidence table.
# Return the incidence table */

function findLines()
{
	local $b,@words,$index,$Line,$linesread,@indices,@dictionarylines;
	($Line) = @_;
	
# Get a list containing the hash code of each word on the line and
# sort it so that we can do a sequential search in the next phase:
	@dictionarylines = ();
	@indices = ();
	$Line =~ s/ h / /;									# Get rid of separated 'h's.
	$Line =~ s/[\(\),;!\?-]/ /g;				# Remove punctuation and replace with white space
	$Line =~ s/  / /g;									# Replace multiple spaces with single
	$Line =~ tr/A-Z/a-z/;								# Convert everything to lower case
	$Line =~ s/&cced;h/qh/g;            # Handle c cedille
	@words = split(/[\s]/,$Line);				# Split up result into component words
	foreach $b(@words) {								# Append number of this line to record.
		if ($b ne "") {										# Don't want null strings
			$index = getHash($b);						# Look up index
			push(@indices, sprintf("%6d %s",$index, $b));	# add it to set of indices
	 	}
 	}
 	@indices = sort {$a <=> $b} (@indices);					# now sort the list into ascending order ready for next phase.

# For each word and hash code, read the line corresponding to the 
# hash code and find out the position of the word that we're looking
# up on that line of the file:
 	open(READ, $dictionary);						# Open the file
	$linesread=0;												# and count lines from 0
 	foreach $entry(@indices) {					# For each word that we're looking up
		@items = split(/ +/,$entry);			# get the word and its hash code
		$word = pop(@items);
		$index = pop(@items);
		while ($index >= $linesread) {		# Go to the appropriate line
			$linesread++;										# keeping track of line numbers
			$Line = <READ>;
		}
		# Work out the index number for that word
		$offset = getIndexInRecord($word, $Line, $index);
		if ($offset >= 0) {
			# Found the word - add its index to set of indices.
			# Use leading zeroes to facilitate sorting later on.
			push(@dictionarylines, sprintf("%07d",$offset));
		}
 	}
	close(READ);

# Now we have a list of index numbers for the words in the line to be analysed.
# Sort index numbers into ascending order so that we can read the line numbers
# file sequentially.
	@dictionarylines = sort(@dictionarylines);
	
# For each index number there is a list of line numbers in which the
# corresponding word is to be found in the dictionary. Now that they're in
# ascending order we can search the file and pick the lines that we want.
	@dictionaryentries = ();							# Null list
	open(READ, $lineFile);								# Open list of lines
	$linesread=0;													# Zero line count
 	foreach $entry(@dictionarylines) {
		while ($entry >= $linesread) {			# Scan for next line
			$linesread++;
			$Line = <READ>;
		}
# Found it - convert data on line from compressed form and add it to set of indices
		push(@dictionaryentries, decompress($Line));
 	}
	close(READ);

# If there is more than one word in a line, we'll have a list for each word
# showing the lines where it is to be found. We want to find an entry in which
# ALL the words appear.
 	sort(@dictionaryentries);
 	if ($#dictionaryentries > 0) {				# More than one word in line
 		$lines = shift(@dictionaryentries);
	 	foreach $entry(@dictionaryentries) {
 			$lines = andSets($lines,$entry);
 		}
 	}
 	else {
 		$lines = shift(@dictionaryentries);
	}
	return $lines
}

# Convert packed record to a list of numbers @numbers
# Record is concatenation of 3-character groups
# Each character x in group has ASCII code 48<=x<=111  
sub decompress
{
	my $in, $out, $num, $i;
	($in) = @_;
	$out = "";
	while (length($in)>2) {
		for ($num=0,$i=0; $i<3; $i++) {
			$num = 64 * $num + (ord $in) - 48;
			$in =~ s/^.//;
		}
		$out .= " " . $num;
	}
	return $out
}

#---------------------------
# Call with three parameters:
#  (1) List of line numbers to get
#  (2) List of possible target words if looking for mutations
#  (3) 1 => looking for exact match, 0 => any match will do

sub showLinesFound()
{
	my $lines, $targets, @targets, @halves, $target, $Manx;
	($targets, $lines, $matchAny) = @_;
	@targets = split(/\&/,$targets);
	@linesRead = ();
	@sorted = ("&nbsp;");
	
# The string $lines is a list of the lines which contain entries matching
# the word that we are looking up. Look up the line corresponding to each entry.
# Read the file sequentially, and get each entry.
	@lines = split(/ +/,$lines);					# Split the string up into a list 
 	sort(@lines);													# Sort it so that we can do a sequential read
	if ($#lines) {												# if there are any entries to look up
		open(READ, $DICTIONARYFILE);
		$Line = <READ>;
		$linesread=1;
 		foreach $index(@lines) {
			if ($index ne "" && $index != -1) {
				while ($index > $linesread) {
					$Line = <READ>;
					$linesread++;
				}
				if ($linesread <= $DICTIONARYLINES || $phrases) {
					chomp($Line);
					if ($matchAny) {
						push(@linesRead, $Line);
					}
					else {
						$Manx = $Line;
						$Manx =~ s/.*#//;							# Get the Manx half of entry
						$Manx =~ s/[-`]//;
						foreach $target(@targets) {
							$target =~ s/[-`]//;
							if ($Manx eq $target) {
								push(@linesRead, $Line);
							}
						}
					}
				}
			}
		}
 		close(READ);
 		foreach $Line(@linesRead) {
			$Line =~ s/c\|h/chq/g;
			$Line =~ s/C\|h/Chq/g;
			$Line =~ s/#/\t/;
			if (!$E2M) {
				@halves = split(/\t/,$Line);
				$Line = sprintf("%s\t%s", @halves[1], @halves[0]);
			}
 		}
		@sorted = sort {uc($a) cmp uc($b)} @linesRead;
 		@linesRead=();
 		foreach $Line(@sorted) {
			if ($Line =~ /^.*[- ].*\t/) { push(@linesRead, $Line); }
			else       { unshift(@linesRead, $Line); }
 		}
		print "<HTML>\n<HEAD></HEAD>\n<BODY BGCOLOR=\"#000000\"><P><CENTER>";
	 	if ($E2M) {
	 		printf("<TABLE ALIGN=\"CENTER\">\n<TR><TD><FONT COLOR=\"#ff0000\" SIZE=\"-1\" FACE=\"Arial,Helvetica,Verdana\"><B>ENGLISH</TD><TD><FONT COLOR=\"#ff0000\" SIZE=\"-1\" FACE=\"Arial,Helvetica,Verdana\"><B>MANX");
	 	} else {
	 		printf("<TABLE ALIGN=\"CENTER\">\n<TR><TD><FONT COLOR=\"#ff0000\" SIZE=\"-1\" FACE=\"Arial,Helvetica,Verdana\"><B>MANX</TD><TD><FONT COLOR=\"#ff0000\" SIZE=\"-1\" FACE=\"Arial,Helvetica,Verdana\"><B>ENGLISH");
 		}
 		foreach $Line(@linesRead) {
			$Line =~ s/chq/\&ccedil;h/g;		# Change chq to įh
			$Line =~ s/Chq/\&Ccedil;h/g;		# Change Chq to Įh
			$Line =~ s/`//g;								# Remove ` characters
			# Format to make a line of HTML table
			@halves = split(/\t/,$Line);
			$latest = @halves[0];
			if ($previous eq $latest) {
				printf(", %s", @halves[1]);
			}
			else {
				printf("</TD></TR>\n<TR><TD><FONT COLOR=\"#ffff00\" SIZE=\"-1\" FACE=\"Arial,Helvetica,Verdana\">%s</TD><TD><FONT COLOR=\"#ffff00\" SIZE=\"-1\" FACE=\"Arial,Helvetica,Verdana\">%s", @halves[0], @halves[1]);
			}
			$previous = $latest;
 		}
		printf("</TD></TR>\n</TABLE>\n");
	}
 	else {
			print "<FONT COLOR=\"#ff0000\" FACE=\"Arial,Helvetica,Verdana\"><B>Sorry, no matching entry was found!\n";
 	}
	print "</BODY></HTML>\n";
}

#---------------------------
# Call with two strings containing space delimited numbers
# Returns a string containing numbers that appear in either list.
sub orSets()
{
	my $a,$b,$c;
	($a,$b) = @_;
	$c = $a . " " . $b;
	return uniqSet($c);
}

#---------------------------
# Call with a string containing space delimited numbers
# Returns string with same numbers sorted in ascending number
# and duplicate entries deleted.

sub uniqSet()
{
	local $a,$b;
	my $c,$i,@a;
	($a) = @_;
	@a = split(/ +/, $a);
	@a = sort {$a <=> $b} (@a);
	$c = @a[0];
	$b = $c;
	
	for ($i=1; $i<=$#a; $i++) {
		if (@a[$i] != $c) {
			$b = $b . " " . @a[$i];
		}
		$c = @a[$i];
	}
	return $b;
}

#---------------------------
# Call with two strings containing space delimited numbers
# Returns a string containing those numbers that appear in both lists.

sub andSets()
{
	my $c,$i,$j,@a,@b;
	local $a,$b;
	($a,$b) = @_;
	@a = sort {$a <=> $b} (split(/ +/,$a));
	@b = sort {$a <=> $b} (split(/ +/,$b));
	for ($c = '', $i=0; $i<=$#a; $i++) {
		for ($j=0; $j<=$#b; $j++) {
			if (@a[$i]==@b[$j]) {
				$c = $c . " " . @a[$i];
			}
		}
	}
	return $c;
}

#---------------------------
# getIndexInRecord($word, $Line, $index)
# Work out number corresponding to this word by counting
# along the current line until a match is found.
# Return the number if a match is found, -1 if not.
sub getIndexInRecord
{
	my $word, $Line, @entries, $entry, $i;
	
	($word, $Line, $hash) = @_;
	@entries = split(/-/,$Line);	# Split up to get rid of leading number
	$Line = shift(@entries);
	$Line = shift(@entries);
	@entries = split(/ /,$Line);	# Split up words in record
# Scan entries on line looking for a match.
# If found, derive return code from position of word
	$i=0;
	foreach $entry(@entries) {
		if ($word eq $entry) { return $i+$hash; }
		$i += $HASHBASE;
	}
	return -1;
}

#---------------------------
# $hashcode = gethash($word)
# Return a hash code for this word (0 <= $hash <= 1000)
# This hash function mustn't be changed as it is the same
# one used to computer tables!
sub getHash
{
	my $hash;
	my ($b) = @_;
	while ($b ne "") {
		$hash = $hash*17 + ord($b); # 17 and 1001 co-prime
		$hash %= 1001;
		$b =~ s/^.//;
	}
	return $hash;
}

sub logRequest()
{
  $now = localtime();
# Cut off time and year to give day and date.
# In theory this means that we could alias
# every six years or so, but honestly...
	$today = substr($now,0,10);

# Lock the log file and read it to see if it contains old entries.
# If there are entries from previous days, read the whole file.
# We shouldn't need to write back any of today's data except our
# query because this action should only be carried out once per day.
	flock(LOG,LOCK_EX);
  open(LOG, $LOGFILE);
#  $then = <LOG>;
#  $then = substr($then,0,10);
  $then = substr(<LOG>,0,10);
  $doUpdate = $today ne $then;
  close(LOG);
  if ($doUpdate) {						# OK - read current file contents to @lines
  	open(LOG, $LOGFILE);
	  @lines = <LOG>;
		close(LOG);
		open(LOG, ">$LOGFILE");		# and reopen it for writing
	} else {
		open(LOG, ">>$LOGFILE");	# otherwise open it for appending
	}
# Now either write as the first record of a new day's log file or
# append a record to an existing file
	print LOG "$now,$E2M,$any,$mutate,$phrases,$in{'word'},";
	print LOG "$ENV{'REMOTE_ADDR'},";
	print LOG "$ENV{'HTTP_USER_AGENT'},";
  print LOG "$ORIGIN\n";
# Finished working with the log file so close and unlock it for anybody else to play with
  close(LOG);
  flock(LOG,LOCK_UN);

  if ($doUpdate) {						# A new day, so set about constructing an email
		if ($PC) {								# running on PC; send "mail" to dump file
			open (MAIL, ">>maildump.txt");
		} else {									# running on server - do a real mailing
			open (MAIL, "|$MAILPROG $recipient");
		}
		print MAIL "Subject: Manx dictionary Log\n";
		print MAIL "To: $recipient\n";
		print MAIL "Reply-to: $mail_from\n";
		print MAIL "From: Manx Dictionary\n\n";
	  chomp @lines;
	  foreach $record(@lines) {
    	print MAIL "$record\n";
    }
		close (MAIL);
	}
}