#!/usr/local/bin/perl -w
########################################################################
#
# cnp-urdu.pl : "Create 'n' populate URDU"
#
# An URDU database program
#
# Version: 3.0
#
# 2002-2003, Michael Doran, doran@uta.edu
#
# University of Texas at Arlington Libraries
# Box 19497, Arlington, TX 76019, USA
#
# This program creates a MySQL database called 'urdu' and
# populates it with data derived from character set files
# available from the world wide web.
#
########################################################################
use strict;
use DBI;
my $database = "urdu";
&IntroSpiel;
############################################################
# IntroSpiel
############################################################
sub IntroSpiel {
print <<EOSTUFF
This program creates a MySQL database called '$database'
and populates the tables with data from external files.
EOSTUFF
}
############################################################
# Gather MySQL user information
############################################################
print " Enter your MySQL username: ";
my $username = <STDIN>;
chomp $username;
print " Enter your MySQL password: ";
my $password = <STDIN>;
chomp $password;
############################################################
# Gather information on data files
############################################################
my $answer;
my %marc_8_tables = (
"marc-8-latin-basic.txt" => "marc_8_latin_basic",
"marc-8-latin-ext.txt" => "marc_8_latin_ext",
"marc-8-arabic-basic.txt" => "marc_8_arabic_basic",
"marc-8-arabic-ext.txt" => "marc_8_arabic_ext",
"marc-8-cyrillic-basic.txt" => "marc_8_cyrillic_basic",
"marc-8-cyrillic-ext.txt" => "marc_8_cyrillic_ext",
"marc-8-greek-basic.txt" => "marc_8_greek_basic",
"marc-8-hebrew-basic.txt" => "marc_8_hebrew_basic",
"marc-8-controls.txt" => "marc_8_controls"
);
my $marc_file = "charconv.sgm";
print " MARC data file [$marc_file]: ";
$answer = <STDIN>;
chomp $answer;
if ($answer) {
$marc_file = $answer;
}
my $marc_control_file = "marc-control.txt";
print " MARC control functions data file [$marc_control_file]: ";
$answer = <STDIN>;
chomp $answer;
if ($answer) {
$marc_control_file = $answer;
}
my $iso8859_1_file = "aix-ISO8859_1-4.3.6.xml";
print " ISO8859_1 data file [$iso8859_1_file]: ";
$answer = <STDIN>;
chomp $answer;
if ($answer) {
$iso8859_1_file = $answer;
}
my $sgml_file = "SGML.TXT";
print " SGML data file [$sgml_file]: ";
$answer = <STDIN>;
chomp $answer;
if ($answer) {
$sgml_file = $answer;
}
my $marc2unicode_file = "marc2unicode.txt";
print " MARC-8 to Unicode mapping data file [$marc2unicode_file]: ";
$answer = <STDIN>;
chomp $answer;
if ($answer) {
$marc2unicode_file = $answer;
}
my $unicode_file = "UnicodeData.txt";
print " Unicode data file [$unicode_file]: ";
$answer = <STDIN>;
chomp $answer;
if ($answer) {
$unicode_file = $answer;
}
#my %data_files = ('MARC' => $marc_file,
# 'SGML' => $sgml_file,
# 'Unicode' => $unicode_file);
#
#foreach my $key (keys (%data_files)) {
# print "\n $key data file name [$data_files{$key}]: ";
# $answer = <STDIN>;
# chomp $answer;
# if ($answer) {
# $unicode_file = $answer;
# }
#}
my ($dbh,$sth);
&StartItUp;
sub StartItUp {
&ConnectMySQL;
&DBCheck;
&CreateDB;
&PopMARC8;
&PopMARC;
&PopISO8859_1;
&PopSGML;
&PopMarc2Unicode;
&PopUnicode;
&DisconnectMySQL;
}
############################################################
# ConnectMySQL
############################################################
#
# Connects to the MySQL database
sub ConnectMySQL {
$dbh = DBI->connect('dbi:mysql:', $username, $password)
|| die "Could not connect: $DBI::errstr\n";
}
############################################################
# DisconnectMySQL
############################################################
#
# Exits gracefully from the MySQL database
sub DisconnectMySQL {
$sth->finish;
$dbh->disconnect;
print " Done!\n";
}
############################################################
# DBCheck
############################################################
#
# Checks for existence of the database.
sub DBCheck {
my $sql_string = "show databases";
$sth = $dbh->prepare($sql_string)
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
while(my ($show_results) = $sth->fetchrow_array() ) {
if ($show_results =~ /$database/) {
my $detected_db = $show_results;
print "\n WARNING: A $show_results database already exists!\n";
print " Drop the existing $detected_db database? [y|n] ";
my $answer = <STDIN>;
if ($answer =~ /y/i) {
$dbh->do("
DROP DATABASE $detected_db
");
print " ...existing $detected_db database dropped\n";
} elsif ($answer =~ /n/i) {
print " Aborting program\n";
&DisconnectMySQL;
exit(1);
} else {
&DisconnectMySQL;
print " Couldn't parse your answer.\n";
print " Exiting from $0\n";
exit(1);
}
}
}
}
############################################################
# CreateDB
############################################################
#
# Creates the database and tables.
sub CreateDB {
$dbh->do("
CREATE DATABASE $database
");
print " ...database created: $database\n";
$dbh->do("
USE $database
");
$dbh->do("
CREATE TABLE marc
(id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
marc_code VARCHAR(14) NOT NULL,
ucs_code CHAR(4) NOT NULL,
char_name VARCHAR(128)
)
");
print " ...table created: $database.marc\n";
foreach my $key (keys (%marc_8_tables)) {
$dbh->do("
CREATE TABLE $marc_8_tables{$key}
(id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
code CHAR(2) NOT NULL,
ucs_code CHAR(4) NOT NULL,
char_name VARCHAR(128)
)
");
print " ...table created: $database.$marc_8_tables{$key}\n";
}
$dbh->do("
CREATE TABLE iso8859_1
(latin_code CHAR(2) PRIMARY KEY NOT NULL,
ucs_code CHAR(4) NOT NULL
)
");
print " ...table created: $database.iso8859_1\n";
$dbh->do("
CREATE TABLE sgml
(id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
char_name VARCHAR(9) NOT NULL,
entity_set VARCHAR(11) NOT NULL,
ucs_code CHAR(4)
)
");
print " ...table created: $database.sgml\n";
$dbh->do("
CREATE TABLE marc2unicode
(id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
code VARCHAR(6) NOT NULL,
ucs_code CHAR(4) NOT NULL,
utf8_code VARCHAR(6),
name VARCHAR(64),
note VARCHAR(64),
alt_ucs_code VARCHAR(4),
alt_utf8_code VARCHAR(6)
)
");
print " ...table created: $database.marc2unicode\n";
$dbh->do("
CREATE TABLE unicode
(ucs_code VARCHAR(6) PRIMARY KEY NOT NULL,
char_name VARCHAR(128),
gen_cat CHAR(2),
combo_class VARCHAR(3),
bidirec_cat VARCHAR(3),
decomp_map VARCHAR(128),
dec_digit_val VARCHAR(1),
digit_val VARCHAR(1),
num_val VARCHAR(5),
mirrored CHAR(1),
one_zero_name VARCHAR(64),
ucs_comment VARCHAR(64),
ucase_map VARCHAR(5),
lcase_map VARCHAR(5),
tcase_map VARCHAR(5)
)
");
print " ...table created: $database.unicode\n";
}
############################################################
# PopMARC8
############################################################
#
# Populates the MARC_8 tables with data.
sub PopMARC8 {
foreach my $data_file (keys (%marc_8_tables)) {
if ($data_file =~ /skip/i) {
print " ...SKIPPED populating $database.$marc_8_tables{$data_file}\n";
return(1);
}
my $table = "$database.$marc_8_tables{$data_file}";
open (INFILE, "<$data_file")
|| die " WARNING: Cannot open $data_file: $!\n Check filename.";
while (my $line = <INFILE>) {
chomp $line;
if (! ($line =~ /^#/)) {
chomp ($line);
my ($code, $ucs_code, $char_name) = split(/\t/, $line);
my $sql_string = "insert into $table values
(NULL,'$code','$ucs_code','$char_name')";
$sth = $dbh->prepare($sql_string)
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
}
}
close (INFILE);
print " ...table populated: $table
(with data from $data_file)\n";
}
}
############################################################
# PopMARC
############################################################
#
# Populates the marc table with data.
sub PopMARC {
if ($marc_file =~ /skip/i) {
print " ...SKIPPED populating $database.marc\n";
return(1);
}
my $table = "$database.marc";
open (INFILE, "<$marc_control_file")
|| die "Cannot open input file: $!";
while (my $line = <INFILE>) {
chomp $line;
if ($line && ! ($line =~ /^\#/)) {
my ($ascii_code,$char_name) = split (/\t/, $line);
my $ucs_code = "00$ascii_code";
if ($ascii_code && $ucs_code && $char_name) {
my $sql_string = "insert into $table values
(NULL,'$ascii_code','$ucs_code', '$char_name')";
$sth = $dbh->prepare($sql_string)
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
}
}
}
close (INFILE);
open (INFILE, "<$iso8859_1_file")
|| die "Cannot open input file: $!";
while (my $line = <INFILE>) {
chomp $line;
if ($line =~ /<a u=/) {
my ($ascii_code,$ucs_code);
if ($line =~ / b=\"(.*?)\"/) {
$ascii_code = $1;
}
if ($line =~ /<a u=\"(.*?)\" /) {
$ucs_code = $1;
}
if (($ascii_code ge "20" && $ascii_code le "7E") && $ucs_code) {
my $sql_string = "insert into $table values
(NULL,'$ascii_code','$ucs_code', NULL)";
$sth = $dbh->prepare($sql_string)
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
}
}
}
close (INFILE);
open (INFILE, "<$marc_file")
|| die "Cannot open input file: $!";
while (my $line = <INFILE>) {
chomp $line;
if ($line =~ /^<character/) {
my ($marc_code,$ucs_code);
if ($line =~ /hex=\"(.*?)\"/) {
$marc_code = $1;
#$marc_code =~ s/ //g;
} else {
$marc_code = "";
}
if ($line =~ /<unientity>(.*)<\/unientity>/) {
$ucs_code = $1;
} else {
$ucs_code = "";
}
if ($marc_code && $ucs_code) {
my $sql_string = "insert into $table values
(NULL,'$marc_code','$ucs_code','')";
$sth = $dbh->prepare($sql_string)
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
}
}
}
close (INFILE);
print " ...table populated: $table
(with data from $iso8859_1_file, $marc_control_file, $marc_file)\n";
}
############################################################
# PopISO8859_1
############################################################
#
# Populates the iso8859_1 table with data.
sub PopISO8859_1 {
if ($iso8859_1_file =~ /skip/i) {
print " ...SKIPPED populating $database.iso8859_1\n";
return(1);
}
my $table = "$database.iso8859_1";
open (INFILE, "<$iso8859_1_file")
|| die "Cannot open input file: $!";
while (my $line = <INFILE>) {
chomp $line;
if ($line =~ /<a u=/) {
my ($latin_code,$ucs_code);
if ($line =~ / b=\"(.*?)\"/) {
$latin_code = $1;
}
if ($line =~ /<a u=\"(.*?)\" /) {
$ucs_code = $1;
}
if ($latin_code && $ucs_code) {
my $sql_string = "insert into $table values ('$latin_code','$ucs_code')";
$sth = $dbh->prepare($sql_string)
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
}
}
}
close (INFILE);
print " ...table populated: $table
(with data from $iso8859_1_file)\n";
}
############################################################
# PopSGML
############################################################
#
# Populates the sgml table with data.
sub PopSGML {
if ($sgml_file =~ /skip/i) {
print " ...SKIPPED populating $database.sgml\n";
return(1);
}
my $table = "$database.sgml";
open (INFILE, "<$sgml_file")
|| die " WARNING: Cannot open $sgml_file: $!\n Check filename.";
while (my $line = <INFILE>) {
chomp $line;
if (! ($line =~ /^#/)) {
chomp ($line);
my ($char_name, $entity_set, $ucs_code) = split(/\t/, $line);
if ($ucs_code =~ /\?/) {
$ucs_code = "";
} else {
$ucs_code =~ s/0x//;
}
my $sql_string = "insert into $table values
(NULL,'$char_name','$entity_set','$ucs_code')";
$sth = $dbh->prepare($sql_string)
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
}
}
close (INFILE);
print " ...table populated: $table
(with data from $sgml_file)\n";
}
############################################################
# PopMarc2Unicode
############################################################
#
# Populates the sgml table with data.
sub PopMarc2Unicode {
my $table = "$database.marc2unicode";
if ($marc2unicode_file =~ /skip/i) {
print " ...SKIPPED populating $table\n";
return(1);
}
open (INFILE, "<$marc2unicode_file")
|| die " WARNING: Cannot open $marc2unicode_file: $!\n Check filename.";
while (my $line = <INFILE>) {
chomp $line;
if (! ($line =~ /^#/)) {
chomp ($line);
my ($code, $ucs_code, $utf8_code, $name, $note, $alt_ucs_code,
$alt_utf8_code) = split(/\t/, $line);
my $sql_string = "insert into $table values
(NULL,'$code','$ucs_code','$utf8_code','$name','$note',
'$alt_ucs_code','$alt_utf8_code')";
$sth = $dbh->prepare($sql_string)
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
}
}
close (INFILE);
print " ...table populated: $table
(with data from $marc2unicode_file)\n";
}
############################################################
# PopUnicode
############################################################
#
# Populates the unicode table with data.
sub PopUnicode {
if ($unicode_file =~ /skip/i) {
print " ...SKIPPED populating $database.unicode\n";
return(1);
}
my $table = "$database.unicode";
open (INFILE, "<$unicode_file")
|| die " WARNING: Cannot open $unicode_file: $!\n Check filename.";
while (my $line = <INFILE>) {
chomp $line;
if (! ($line =~ /^#/)) {
chomp ($line);
my ($ucs_code, $char_name, $gen_cat, $combo_class,
$bidirect_cat, $decomp_map, $dec_digit_val,
$digit_val, $num_val, $mirrored, $one_zero_name,
$ucs_comment, $ucase_map, $lcase_map, $tcase_map)
= split(/;/, $line);
my $sql_string = "insert into $table values
('$ucs_code','$char_name','$gen_cat','$combo_class',
'$bidirect_cat','$decomp_map','$dec_digit_val',
'$digit_val','$num_val','$mirrored','$one_zero_name',
'$ucs_comment','$ucase_map','$lcase_map','$tcase_map')";
$sth = $dbh->prepare($sql_string)
|| die $dbh->errstr;
$sth->execute
|| die $dbh->errstr;
}
}
close (INFILE);
print " ...table populated: $table
(with data from $unicode_file)\n";
}
|