Home Contact
Validate the HTML of this page

create-dict.pl

source code

#!/m1/shared/bin/perl -w

#  Perl script: create-dict.pl
#
#  2009, Michael Doran, doran@uta.edu
#
#  University of Texas at Arlington Libraries
#  Box 19497, Arlington, TX 76019, USA
#  
#  This script creates a dictionary file of terms
#  extracted from a Voyager database for use with
#  an OPAC spell check program. 
#
#  Terms are UTF-8 encoded.

use strict;
use Encode;
use Unicode::Normalize;

use DBI;

$ENV{ORACLE_SID} = "VGER";
$ENV{ORACLE_HOME} = "/oracle/app/oracle/product/10.2.0/db_1";

my $username = "********";
my $password = "********";
my $db_name  = "xxxdb";

#  The level related to the number ("greater than") of
#  OPAC bib occurances of heading.  The higher the number,
#  the smaller (and more focused) the dictionary file.
#  Must be a non-negative integer (i.e. 0,1,2,3...)
#  Recommended level is 2

my $level    = "2";

########################################################################
#
#    * * * * * * * *    Stop editing here!    * * * * * * * *
#
########################################################################
#
#  Most Voyager sites should not have to edit code beyond this point.
#
########################################################################

if ($level =~ /\D/) { 
    $level = "0";
}

my $out_file = "dictionary.$level";
my $out_junk = "rejected.$level";

print "  Level: $level (higher the level, the smaller the dictionary file)" . "\n";
print "  This will take a few minutes..." . "\n";

my ($dbh,$sth);

open (OUTFILE, ">$out_file")
        || die "Cannot create/open output file: $!";
binmode(OUTFILE, ":utf8");

open (JUNK, ">$out_junk")
        || die "Cannot create/open output file: $!";
binmode(JUNK, ":utf8");

GetList();

############################################################
#  GetList
############################################################

sub GetList {
    ConnectVygrDB();
    my $sth = $dbh->prepare(ConstructSQL())
        || die $dbh->errstr;

    $sth->execute
        || die $dbh->errstr;

    my $count = 0;
    my $word_count = 0;
    my ($heading);
    my @uniq = ();
    my %seen = ();
    while ( $heading = $sth->fetchrow_array ) {
        # Let Perl know this stuff is UTF-8 encoded
        $heading = decode('UTF-8', $heading);
        # Normalize: Unicode Normalization Form D (canonical decomposition)
        # i.e. decompose any precomposed accented characters
        $heading = NFD($heading);
        # replace dash with space 
        $heading =~ s/\-/ /g;
        # replace any non-word characters with spaces 
        $heading =~ s/[^\p{L}^\p{M}^\s]/ /g;
        # remove any digits 
        # $heading =~ s/\d//g;
        # remove any words with digits 
        $heading =~ s/\b(\D*?\d+?\D*?)\b//g;
        # remove words with two or less characters 
        $heading =~ s/\b[\p{L}\p{M}]{1,2}\b//g;
        # remove all-uppercase words 
        #$heading =~ s/\b([\p{Lu}\p{M}]+)\b//g;
        # remove leading and trailing whitespace
        $heading =~ s/^\s+//;
        $heading =~ s/\s+$//;
        # collapse multiple whitespace
        $heading =~ s/\s{2,}/ /g;
        my @tokens = ();
        my $iteration = 0;
        my $tokenized_heading = $heading;
        while ($tokenized_heading) {
            if ($tokenized_heading =~ s/^(\S*)\s// ) {
                push(@tokens, $1);
                $word_count++;
            } elsif ($tokenized_heading =~ s/^(\S*)$// ) {
                push(@tokens, $1);
                $word_count++;
            }
            foreach my $term (@tokens) {
                push(@uniq, $term) unless $seen{$term}++;
            }
            # Limit loop in case there is some strange input
            if ($iteration >= 30) {
                last;
            }
            $iteration++;
        }
        #if ($heading) {
        #    print OUTFILE $heading . "\n";
        #}
        $count++;
    }
    if (@uniq) {
        foreach my $dict_word (@uniq) {
            if ($dict_word =~ /\b\p{Lu}+\b/) {
                print JUNK $dict_word . "\n";
            } elsif ($dict_word =~ /.+\p{Lu}{2,}.*/) {
                print JUNK $dict_word . "\n";
            } else {
                print OUTFILE $dict_word . "\n";
            }
        }
    } else {
        print "Hmmm... no unique terms?" . "\n";
    }
    print "  Dictionary terms file: " . $out_file  . "\n";
    print "  Rejected terms file:   " . $out_junk . "\n";
    DisconnectVygrDB();
}

############################################################
#  ConstructSQL
############################################################

sub ConstructSQL {
    return ("
    select distinct
	heading.display_heading
    from
        $db_name.heading
    where
        heading.index_type in ('N','S') and
        heading.opacbibs > $level
    ");
}


############################################################
#  ConnectVygrDB
############################################################
#
#  Connects to the Voyager database
#  (in read-only mode, natch!)

sub ConnectVygrDB {
    $dbh = DBI->connect('dbi:Oracle:', $username, $password)
        || die "Could not connect: $DBI::errstr";
}

############################################################
#  DisconnectVygrDB
############################################################
#
#  Exits gracefully from the Voyager database

sub DisconnectVygrDB {
    if ($sth) {
        $sth->finish;
    }
    $dbh->disconnect;
}

close (OUTFILE);
close (JUNK);
exit(0);