create-dict.pl
source code
#!/usr/local/bin/perl -w
# Perl script: create-dict.pl
#
# 2009, Michael Doran, doran@uta.edu
#
# University of Texas at Arlington Libraries
# Box 19497, Arlington, TX 76019, USA
#
# This script creates a dictionary file of terms
# extracted from a Voyager database for use with
# an OPAC spell check program.
#
# Terms are UTF-8 encoded.
use strict;
use Encode;
use Unicode::Normalize;
use DBI;
$ENV{ORACLE_SID} = "VGER";
$ENV{ORACLE_HOME} = "/oracle/app/oracle/product/10.2.0/db_1";
my $username = "********";
my $password = "********";
my $db_name = "xxxdb";
# The level related to the number ("greater than") of
# OPAC bib occurances of heading. The higher the number,
# the smaller (and more focused) the dictionary file.
# Must be a non-negative integer (i.e. 0,1,2,3...)
# Recommended level is 2
my $level = "2";
########################################################################
#
# * * * * * * * * Stop editing here! * * * * * * * *
#
########################################################################
#
# Most Voyager sites should not have to edit code beyond this point.
#
########################################################################
if ($level =~ /\D/) {
$level = "0";
}
my $out_file = "dictionary.$level";
my $out_junk = "rejected.$level";
print " Level: $level (higher the level, the smaller the dictionary file)" . "\n";
print " This will take a few minutes..." . "\n";
my ($dbh,$sth);
open (OUTFILE, ">$out_file")
|| die "Cannot create/open output file: $!";
binmode(OUTFILE, ":utf8");
open (JUNK, ">$out_junk")
|| die "Cannot create/open output file: $!";
binmode(JUNK, ":utf8");
GetList();
############################################################
# GetList
############################################################
sub GetList {
ConnectVygrDB();
my $sth = $dbh->prepare(ConstructSQL())
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
my $count = 0;
my $word_count = 0;
my ($heading);
my @uniq = ();
my %seen = ();
while ( $heading = $sth->fetchrow_array ) {
# Let Perl know this stuff is UTF-8 encoded
$heading = decode('UTF-8', $heading);
# Normalize: Unicode Normalization Form D (canonical decomposition)
# i.e. decompose any precomposed accented characters
$heading = NFD($heading);
# replace dash with space
$heading =~ s/\-/ /g;
# replace any non-word characters with spaces
$heading =~ s/[^\p{L}^\p{M}^\s]/ /g;
# remove any digits
# $heading =~ s/\d//g;
# remove any words with digits
$heading =~ s/\b(\D*?\d+?\D*?)\b//g;
# remove words with two or less characters
$heading =~ s/\b[\p{L}\p{M}]{1,2}\b//g;
# remove all-uppercase words
#$heading =~ s/\b([\p{Lu}\p{M}]+)\b//g;
# remove leading and trailing whitespace
$heading =~ s/^\s+//;
$heading =~ s/\s+$//;
# collapse multiple whitespace
$heading =~ s/\s{2,}/ /g;
my @tokens = ();
my $iteration = 0;
my $tokenized_heading = $heading;
while ($tokenized_heading) {
if ($tokenized_heading =~ s/^(\S*)\s// ) {
push(@tokens, $1);
$word_count++;
} elsif ($tokenized_heading =~ s/^(\S*)$// ) {
push(@tokens, $1);
$word_count++;
}
foreach my $term (@tokens) {
push(@uniq, $term) unless $seen{$term}++;
}
# Limit loop in case there is some strange input
if ($iteration >= 30) {
last;
}
$iteration++;
}
#if ($heading) {
# print OUTFILE $heading . "\n";
#}
$count++;
}
if (@uniq) {
foreach my $dict_word (@uniq) {
if ($dict_word =~ /\b\p{Lu}+\b/) {
print JUNK $dict_word . "\n";
} elsif ($dict_word =~ /.+\p{Lu}{2,}.*/) {
print JUNK $dict_word . "\n";
} else {
print OUTFILE $dict_word . "\n";
}
}
} else {
print "Hmmm... no unique terms?" . "\n";
}
print " Dictionary terms file: " . $out_file . "\n";
print " Rejected terms file: " . $out_junk . "\n";
DisconnectVygrDB();
}
############################################################
# ConstructSQL
############################################################
sub ConstructSQL {
return ("
select distinct
heading.display_heading
from
$db_name.heading
where
heading.index_type in ('N','S') and
heading.opacbibs > $level
");
}
############################################################
# ConnectVygrDB
############################################################
#
# Connects to the Voyager database
# (in read-only mode, natch!)
sub ConnectVygrDB {
$dbh = DBI->connect('dbi:Oracle:', $username, $password)
|| die "Could not connect: $DBI::errstr";
}
############################################################
# DisconnectVygrDB
############################################################
#
# Exits gracefully from the Voyager database
sub DisconnectVygrDB {
if ($sth) {
$sth->finish;
}
$dbh->disconnect;
}
close (OUTFILE);
close (JUNK);
exit(0);