#!/usr/bin/perl
# ^^^^^^^^^^^^^ modify the path to point to perl version >=5

$frmfile="~/heise/inhalt.frm";
#         ^^^^^^^^^^^^^^^^^^ modify the path to point to your INHALT.FRM
# or use switch -f to set path
# or set environment variable HEISE

##################################################################
# reg - textonly search in Heise-Register
#
# Version        2.1.1
# Created:       11.11.1997
# Last modified: 21.01.1998
#
# Copyright (C) 1997 Andy Spiegl, Jan Starzynski
#
# Andy Spiegl (Andy.Spiegl@writeme.com)
#   Munich, Germany
# Jan Starzynski, Planet GmbH (jan@planet.de)
#   Schwerin, Germany
##################################################################
#     BUG REPORTS AND SUGGESTIONS ARE WELCOME!
##################################################################

##################################################################
# The initial idea and some minor enhancements came from Andy,
# but most of the work and all those nifty features were done
# by Jan.  Thanks a lot, Jan!
##################################################################

##################################################################
# VERSION HISTORY:
#
# 0.9.1:  first public release (Andy)
# 1.0.0:  extended and accelerated version: (Jan)
#          - concatenating lines of an entry and 1 patternmatch 
#            (instead of 9 for every line)
#          - precompiling patterns using eval
#          - multiple arguments possible, connected via "and"
#            allows: reg "viren|virus" "c97" to search for
#            all articles about virusses in the c't of the year 97
#          - nicer output
# 1.0.1:  minor changes ( help text, comments ) (Andy)
# 1.1.0:  writing default format file if none found (Andy)
#         print out number of matches, umlaut conversion (Andy)
# 1.2.0:  automatic umlaut type detection
#         speed improvement of conversion  (Jan)
# 1.9.0   phonetic search (Jan)
# 2.0.0   switch for regular expressions and case conversion,
#         configuration through format-file (Jan)
# 2.0.1   bugfixes
#         automatic detection of case-ignore
#         check of variables in config-file
#         empty entries in databank are not shown 
#         output more compact and nicer (Jan)
# 2.1.0   bugfix
#         speedup
#         update register with cti-file
#         check update-file for correct contents
#         remove duplicated entries
# 2.1.1   bugfix
##################################################################

#########################################################################
# This program is free software; you can redistribute it and/or modify  #
# it under the terms of the GNU General Public License Version 2 as     #
# published by the Free Software Foundation.                            #
#                                                                       #
# This program is distributed in the hope that it will be useful,       #
# but WITHOUT ANY WARRANTY; without even the implied warranty of        #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         #
# GNU General Public License for more details.                          #
#                                                                       #
# You should have received a copy of the GNU General Public License     #
# along with this program; if not, write to the Free Software           #
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.             #
#########################################################################


#########################################################################
# $Id: heise.pl,v 1.8 1998/01/20 13:11:30 jan Exp $
#
# $Log: heise.pl,v $
# Revision 1.8  1998/01/20 13:11:30  jan
# Beschreibung up to date
#
# Revision 1.7  1998/01/20 13:10:26  jan
# bugfixes
#
# Revision 1.6  1998/01/19 19:22:36  jan
# bugfix
#
# Revision 1.5  1998/01/19 19:05:53  jan
# checking des Eingabe-files eingebaut
#
# Revision 1.4  1998/01/15 10:08:09  jan
# *** empty log message ***
#
# Revision 1.3  1998/01/15 10:05:39  jan
# Bugfix.
# Beschleunigung um ca. 30%.
# Einfgen von updates.
# Entfernen von dupes.
#
# Revision 1.2  1997/12/18 14:03:30  jan
# Kosmetik
#
# Revision 1.1.1.1  1997/12/18 13:55:44  jan
# Perl-Sources
#
# Revision 1.2  1997/12/18 13:46:00  jan
# Kosmetik
#
# Revision 1.1  1997/12/18 12:01:42  jan
# Durchsuchen des Heise-Registers
#
#########################################################################

require 5; # Perl 5 needed

%config = ("-1" => "auto", "0" => "off", "1" => "on");
%config = (%config, reverse %config); # works in both directions

# configurable yes/no/auto-variables, will be used as symbolic references
@config = ('umlautConversion', 'phonetic', 'ignoreCase', 'regex', 'updateCheck', 'updateDupe');

# abbreviation -> journal-name
%journal = ('c' => 'c\'t', 'i' => 'iX', 'g' => 'Gateway');

$debug = 0;
$umlautConversion = -1;		# default is automatic
$phonetic = 0;                  # default is off
$ignoreCase = -1;               # default is automatic
$regex = -1;                    # default is automatic

# translation-tables: DOS to ISO
$doscase = "\\201\\204\\224\\216\\231\\232\\236\\341\\335\\015\\032";
$isocase = "                                "; # delete <CR> and ^Z

$isocase =~ tr/ \t//d; # spaces for readability only, delete them

# translation-tables: lower to UPPER to phonetic
$lowercase  = "a-z";
$uppercase  = "A-Z";
$germancase = "A-ZAAAAACEEEEIIIIDNOOOOUUUYP"; # it's my (Jan's) guess!

$HOME=$ENV{HOME};

$pagewidth = 79;

# change value to standard
# expects a reference as argument (or in $_)
sub stdval {
  my $ref = @_ ? shift : $_; # $_ as default
  my $val = $$ref; # value
  
  if ($val =~ /^-?\d+$/) { # numbers
    $$ref = -1, return if $val < 0;
    $$ref = 1, return if $val > 0;
    return;
  }

  $val = lc $val; # lowercase
  if(exists $config{$val}) { # config strings
    $$ref = $config{$val};
    return;
  }

  warn "\aThe value of '$ref' has to be: 'on', 'off', 'auto' or a number!\n";
  warn "The current value is: '$val', setting it to 'auto'!\n";
  $$ref = -1;
  return;
}

# print out configuration
sub printconfig {
  local *F = *STDOUT; # STDOUT as default
  
  *F = shift if @_; # passed FileHandle otherwise
  
  print F "# German field titles:\n";
  for($i = 0; $i < 9; $i++) {
    printf F "%2d %s\n", $i + 1, $fields[$i];
  }
  print F "\n";
  $updateCheck = 1 if not defined $updateCheck;
  $updateDupe = 1 if not defined $updateDupe;
  print F "# Configuration data: case is important!\n";
  foreach (@config) {
    $$_ = -1 unless defined $$_;
    stdval;
    printf F "%-20s %s\n", $_, $config{$$_};
  }
}

# print usage of program
sub usage {
  print "$0: Search through the c't contents register\n";
  print "usage: [-f FRM-file] [-u CTI-file [-check on|off]] [-dup on|off] [-iso on|off] [-ign on|off]\n";
  print "       [-phon on|off] [-re on|off] [-db level] searchstring ...\n";
  print "Try `$0 -h' for more information.\n";
  
  exit($_[0]) if @_;
}

# print extended help
sub help {
  usage;
  print "\n";
  print "You can use the following options:\n";
  print " -f FRM-file    FRM-file to search in\n";
  print " -u CTI-file    update FRM-file with CTI-file\n";
  print " -check on|off  check input file for errors\n";
  print " -dup on|off    remove doubled entries (can be memory intensive!)\n";
  print " -iso on|off    umlaut conversion to iso (default: auto)\n";
  print " -ign on|off    ignore case (default: auto)\n";
  print " -phon on|off   phonetic search (default: off)\n";
  print " -re on|off     regular expressions (default: auto)\n";
  print " -db level      print debugging output (level: 1 - 4)\n";
  print "\n";
  print "The following environment variables are used if defined:\n";
  print " HEISE  : path to the FRM-file\n";
  print " COLS   : linewidth (normally detected automatically)\n";
  print " TERMCAP: automatic detection of linewidth\n";
  print " HOME   : path to the home directory\n";
  print "\n";
  print "NOTE: If more than one searchstring is given, only those entries\n";
  print "      are listed that contain ALL of them.  To find EITHER\n";
  print "      use regular expressions like: \"string1|string2|string3\".\n";
  
  exit($_[0]) if @_;
}

# abbr. in register => readable format
sub journal_name {
  local $^W = 0;
  my ($journal, $month, $page) = map { tr/ \t\r\n//d; $_ } @_;

  $journal{lc substr($journal, 0, 1)}.' '.($month + 0)."/".(substr($journal, 1) + 0).', S. '.($page + 0);
}

# set format file from env
if (exists $ENV{HEISE}) {
  $frmfile = $ENV{HEISE};
}

# set pagewidth for output (subtract one, just to be sure)
if (exists $ENV{COLS}) {
  $pagewidth = $ENV{COLS} - 1;
}
elsif (exists $ENV{TERMCAP}) {
  if ($ENV{TERMCAP} =~ /:co\#(\d+):/) {
    $pagewidth = $1 - 1;
  }
}

usage 1 unless @ARGV;

# treat argument line
while (@ARGV) {
  $i = shift @ARGV;
  if (($i eq "-h") || ($i eq "--help")) {
    help 0;
  } elsif ($i eq "-f") {
    $frmfile = shift @ARGV;
  } elsif ($i eq "-iso") {
    $iso = shift @ARGV;
  } elsif ($i eq "-phon") {
    $phon = shift @ARGV;
  } elsif ($i eq "-ign") {
    $ign = shift @ARGV;
  } elsif ($i eq "-re") {
    $re = shift @ARGV;
  } elsif ($i eq "-db") {
    $debug = shift @ARGV;
  } elsif ($i eq "-debug") {
    $debug = shift @ARGV;
  } elsif ($i eq "-u") {
    $update = shift @ARGV;
  } elsif ($i eq "-dup") {
    $dup = shift @ARGV;
  } elsif ($i eq "-check") {
    $check = shift @ARGV;
  } elsif ($i =~ /^-/) {
    print "unknown option: $i\n";
    usage 1;
  } else {
    push (@searchterm, $i);
  }
}

$frmfile =~ s/^~/$HOME/;

# set default values
$fields[0]="Titel       :";
$fields[1]="Untertitel  :";
$fields[2]="Autor(en)   :";
$fields[3]="Redakteur   :";
$fields[4]="Seite       :";
$fields[5]="Ausgabe     :";
$fields[6]="Zeitschrift :";
$fields[7]="Querverweise:";
$fields[8]="Schlagwrter:";

# Load dataformatfile
$frmfilefmt = "$frmfile.fmt";
unless (open (FMT, "<$frmfilefmt") ) {
  print ("Can't open $frmfilefmt: $!.\n");
  print (" --> Writing default format file for next time.\n");
  print ("     Check it out if you want a different language.\n");
  
  unless (open (FMT_DEF, ">$frmfilefmt") ) {
    print ("Can't write to $frmfilefmt: $!.\n");
    print ("To avoid this error create a file $frmfilefmt with these lines:\n");
    printconfig;
  } else {
    printconfig *FMT_DEF;
    close FMT_DEF;
  }
} else {
  # read format file
  if ($debug > 0) { print "Reading formatfile: $frmfilefmt\n";}
  while (<FMT>) {
    s/\#.*//;			# erase comments
    if(/^\s*(\d+)\s+(.*)/) {		# extract key and value
      $fields[$1 - 1] = $2;
    } elsif (/^\s*(\w+)\s+(-?\w+)\s*$/i) {# extract configuration
      ($ref, $val) = ($1, $2);
      
      warn "\a*** Unknown variable '$ref' in config-file! ***\n"
	unless grep /^$ref$/, @config;

      $$ref = $val;
    }
  }
  close FMT;

  if(not defined $updateCheck or not defined $updateDupe) {
    print "new values for configuration-file: updateCheck and updateDupe\n",
          "saving for next time\n";
    if(open(FH, ">$frmfilefmt")) {
      printconfig *FH;
      close FH;
    } else {
      warn "cannot write to $fmtfilefrm: $!\n";
    }
  }
}

# set configuration from argument line
$umlautConversion = $iso if defined $iso;
$phonetic = $phon if defined $phon;
$ignoreCase = $ign if defined $ign;
$regex = $re if defined $re;
$updateCheck = $check if defined $check;
$updateDupe = $dup if defined $dup;

# set on/off/auto values to 0/1/-1
$dup = 0 unless defined $dup;
stdval \$dup;
foreach (@config) {
  stdval;
}

if ($debug > 2) {
  print "Fields:\n";
  for ($i = 0; $i < 9; $i++) {
    print " $i: $fields[$i]\n";
  }
}

# $ignoreCase is done implicitely by $phonetic.
$ignoreCase = 0 if $phonetic;

# build code for phonem
if ($phonetic) {
  $code = "";
  
  $code .= <<'PHONEM_START';
# convert string to GERMAN phonetic equivalent
    sub phonem
    {
	my $string = @_ ? $_[0] : $_;

# lower -> UPPER und  etc. to german equivalent
PHONEM_START
    $code .= "        \$string =~ tr/$lowercase$uppercase/$germancase$germancase/;\n";

    $code .= <<'PHONEM_END';
# replace several double characters
	$string =~ s/(?:S[CZ])|(?:CZ)|(?:T[ZS])|(?:DS)/C/g;
	$string =~ s/KS/X/g;
	$string =~ s/QU/CV/g;
	$string =~ s/P[FH]/V/g;
	$string =~ s/UE/Y/g;
	$string =~ s/AE/E/g;
	$string =~ s/OE//g;
	$string =~ s/E[IY]/AY/g;
	$string =~ s/OU/U/g;
    
# replace single characters
	$string =~ tr/ZKGQIJFWPT\t/CCCCEYYYSVVBD /;
    
# delete forbidden characters, RE remain possible!
	$string =~ tr/ABCDELMNORSUVWXY 0-9!$%&\/()=?\`\'^{[]}\\+~*\#\-_.:,;<>|\n//cd;
    
# delete multiple letters
	$string =~ tr/ABCDELMNORSUVWXY //s;

	$string;
    }
PHONEM_END

  if($debug >= 3) {
    print $code;
  }

  eval $code;
  if($@) {
    print "error compiling phonem: $@\n";
    print "$code\n";
    exit(1);
  }
}

# insert update
if (defined $update) {
  local $| = 1; # autoflush

  $dup = $updateDupe;
  $check = $updateCheck;

  if($check) {
    # check update-file for errors
    print "checking $update ...";
    open (UPD, $update) or die "cannot open $update: $!\n";
    $journals = join '', keys %journal;
    $journals = uc ($journals) . lc ($journals);
    $line = 0;
    while(<UPD>) {
      last if $line == 4 && ! /^\s*\d+\r?$/;
      last if $line == 5 && ! /^[\s01]\d\r?$/;
      last if $line == 6 && ! /^[$journals]\d\d(?:\d\d)?\r?$/o;
      $line = 0 if ++$line == 9;
    }
    die " something's wrong in $update line $., exiting\n" if $line != 0;
    close UPD;
    print " done\n";
  }

  # update register
  print "updating ...";
  open (UPD, $update) or die "cannot open $update: $!\n";
  open (FRM, ">>$frmfile") or die "cannot open $frmfile for update: $!\n";
  print FRM while(<UPD>);
  close FRM;
  close UPD;
  print " done\n";
  exit(0) unless $dup or @searchterm;
}

# remove dupes
if($dup) {
  local $| = 1;
  print "removing dupes ";
  rename $frmfile, "$frmfile~" or die "cannot backup $frmfile: $!\n";
  open (FRM, "$frmfile~") or die "cannot open $frmfile~: $!\n";
  open (OUT, ">$frmfile") or die "cannot write $frmfile: $!\n";
  my %dupe = ();
  my $count = 0;
  my $keep = 0;
  until(eof(FRM)) {
    $accu = 
      <FRM>. # 0
      <FRM>. # 1
      <FRM>. # 2
      <FRM>. # 3
      <FRM>. # 4
      <FRM>. # 5
      <FRM>. # 6
      <FRM>. # 7
      <FRM>; # 8

    print "." if ++$count % 1000 == 0;
    next if $dupe{$accu}++;

    ++$keep;
    print OUT $accu;
  }

  close OUT;
  close FRM;
  my $dupes = $count - $keep;
  print " done ($dupes/$count)\n";
  exit(0) unless @searchterm;
}

# make sure the database exists
if (! open (FRM, "<$frmfile") ) {
  print ("Can't open $frmfile: $!.\n");
  usage 2;
}

# automatic detection of case-ignore
if($ignoreCase < 0) {
  $ignoreCase = 1;
  foreach $searchterm (@searchterm) {
    # searchterm contains upper letter
    my $upper = "[$uppercase]";
    if($searchterm =~ /$upper/)	{
      $ignoreCase = 0;
      last;
    }
  }
}

#automatic detection of Umlaut-type
if( $umlautConversion < 0) {
  # DOS has <cr> at end of line
  $umlautConversion = <FRM> =~ /\r$/;
  seek FRM, 0, 0;		# back to beginning of file
}

# automatic detecton of regular expressions
if($regex < 0) {
  $regex = 0;
  foreach $searchterm (@searchterm) {
    if(quotemeta($searchterm) ne $searchterm) {
      $regex = 1;
      last;
    }
  }
}

# print search parameters
if (@searchterm) {
  print "=" x $pagewidth, "\nSearch options:\n";
  print "umlaut conversion  : $config{$umlautConversion}\n";
  print "ignore case        : $config{$ignoreCase}\n";
  print "phonetic search    : $config{$phonetic}\n";
  print "regular expressions: $config{$regex}\n";
  print "debug level        : $debug\n" if $debug;
  print "-" x $pagewidth, "\nSearching for: @searchterm\n", "=" x $pagewidth,"\n";
} else {
  print "You didn't tell me what to search for! Now what?\n";
  usage 3;
}

if ($debug > 2) {
  print "Searchterm: @searchterm\n";
  print "Frmfile: $frmfile\n";
}

# convert searchterms to phonem/uppercase
foreach (@searchterm) {
  # save control codes
  s/\\(.)/'\\__::__' . ord($1) . '__::__'/eg
    if $regex;

  # convert pattern to phonetic
  $_ = &phonem
    if($phonetic);
  
  # convert pattern to uppercase
  eval "tr/$lowercase/$uppercase/;"
    if ($ignoreCase);
  
  # restore control codes
  s/\\__::__(\d+)__::__/'\\' . chr($1)/eg
    if $regex;

  # quote \ and '
  s/([\\\'])/\\$1/g
    unless $regex;
}

# Load register file and perform search
open (FRM, "<$frmfile") or
  die ("Can't open $frmfile: $!.\nMaybe use -f switch.\n");

if ($debug > 0) {
  print "Reading register file: $frmfile\n";
}

# build dynamic code for search
$code = "";

# beginning of while-loop
$code .= <<'START_OF_WHILE';
$numMatches = 0;
until (eof(FRM))
{
  $accu = <FRM>. # 0
          <FRM>. # 1
          <FRM>. # 2
          <FRM>. # 3
          <FRM>. # 4
          <FRM>. # 5
          <FRM>. # 6
          <FRM>. # 7
          <FRM>; # 8

START_OF_WHILE

# umlaut-conversion on demand
$code .= "  \$accu =~ tr/$doscase/$isocase/d;\n"
    if($umlautConversion);

# integration of phonetic search
$varname = '$accu';
if($phonetic) {
  $code .= '  $trans = &phonem ($accu);'. "\n";
  $varname = '$trans';
}

# //i ignores umlaute, so we do it explicitly (and faster!!!)
if($ignoreCase) {
  $code .= "  \$trans = $varname;\n"
    if $varname ne '$trans';
  $code .= "  \$trans =~ tr/$lowercase/$uppercase/;\n";
  $varname = '$trans';
}

# start pattern matching
$code .= "  if(\n";

# put in patterns
if ($regex) {
  # using regular expressions
  while(@searchterm) {
    $searchterm = shift(@searchterm);
    $searchterm =~ s|/|\\/|g;	# take care of / in searchstrings
    $code .= "     $varname =~ /$searchterm/m";
    $code .= "\n     &&\n" if @searchterm;
  }
} else {
  # using ordinary strings
  while(@searchterm) {
    $searchterm = shift(@searchterm);
    $code .= "     index($varname, \'$searchterm\') >= $[";
    $code .= "\n     &&\n" if @searchterm;
  }
}

# end of while-loop: splitting of entries, print out etc.
$code .= <<'END_OF_WHILE';

    )
  {
    $numMatches++;
    # split into array for output
    @entry = split (/\n/, $accu);
    $entry[6] = journal_name($entry[6], $entry[5], $entry[4]);
    foreach $i (0, 1, 2, 3, 6, 7, 8, 9)
    {
      chomp($entry[$i]);
      write if $entry[$i];
    }
    print "=" x $pagewidth,"\n";
  }
}
END_OF_WHILE

# find longest entry in @fields
$len = 0;
foreach (@fields) {
  $l = length;
  $len = $l if $l > $len;
}

# construct format of a line
$code .=
    "\nformat STDOUT = \n" .
    "@" . "<" x $len . "^" . "<" x ($pagewidth - $len - 2) . "\n" .
    '$fields[$i], $entry[$i]' . "\n" .
    "~~". " " x ($len - 1). "^" . "<" x ($pagewidth - $len - 2) . "\n" .
    '$entry[$i]' . "\n.\n";

if($debug >= 3) {
  print $code, "\n";
}

# execute the code
eval $code;

# evaluate possible errors
if($@) {
  print "$code\n";
  print "Error in generated code:\n$@\n";
  exit(1);
}

close(FRM);

printf "Done. Found %d match%s.\n", $numMatches, $numMatches != 1 ? "es" : "";
print "=" x $pagewidth,"\n";

