Michael Doran Home Page
Contact | Site Map | Search  
  Home > Archives > Coded Character Sets > URDU > About > Database Create Script
  Menu

URDU

Unicode Relational Database Utility
- beta -
  Character Map Charts

Database Create Script

#!/usr/local/bin/perl -w

########################################################################
#
#  cnp-urdu.pl : "Create 'n' populate URDU"
#
#  An URDU database program
#
#  Version: 3.0
#
#  2002-2003, Michael Doran, doran@uta.edu
#
#  University of Texas at Arlington Libraries
#  Box 19497, Arlington, TX 76019, USA
#
#  This program creates a MySQL database called 'urdu' and
#  populates it with data derived from character set files
#  available from the world wide web.
#
########################################################################

use strict;

use DBI;

my $database = "urdu";

&IntroSpiel;

############################################################
#  IntroSpiel
############################################################

sub IntroSpiel {
    print <<EOSTUFF

  This program creates a MySQL database called '$database'
  and populates the tables with data from external files.

EOSTUFF
}


############################################################
#  Gather MySQL user information
############################################################

print "  Enter your MySQL username: ";
  my $username = <STDIN>;
  chomp $username;
print "  Enter your MySQL password: ";
  my $password = <STDIN>;
  chomp $password;

############################################################
#  Gather information on data files
############################################################

my $answer;

my %marc_8_tables = (
	"marc-8-latin-basic.txt"    => "marc_8_latin_basic",
	"marc-8-latin-ext.txt"      => "marc_8_latin_ext",
	"marc-8-arabic-basic.txt"   => "marc_8_arabic_basic",
	"marc-8-arabic-ext.txt"     => "marc_8_arabic_ext",
	"marc-8-cyrillic-basic.txt" => "marc_8_cyrillic_basic",
	"marc-8-cyrillic-ext.txt"   => "marc_8_cyrillic_ext",
	"marc-8-greek-basic.txt"    => "marc_8_greek_basic",
	"marc-8-hebrew-basic.txt"   => "marc_8_hebrew_basic",
	"marc-8-controls.txt"       => "marc_8_controls"
);

my $marc_file     = "charconv.sgm";
  print "  MARC data file [$marc_file]: ";
  $answer = <STDIN>;
  chomp $answer;
  if ($answer) {
      $marc_file = $answer;
  }

my $marc_control_file     = "marc-control.txt";
  print "  MARC control functions data file [$marc_control_file]: ";
  $answer = <STDIN>;
  chomp $answer;
  if ($answer) {
      $marc_control_file = $answer;
  }

my $iso8859_1_file = "aix-ISO8859_1-4.3.6.xml";
  print "  ISO8859_1 data file [$iso8859_1_file]: ";
  $answer = <STDIN>;
  chomp $answer;
  if ($answer) {
      $iso8859_1_file = $answer;
  }

my $sgml_file     = "SGML.TXT";
  print "  SGML data file [$sgml_file]: ";
  $answer = <STDIN>;
  chomp $answer;
  if ($answer) {
      $sgml_file = $answer;
  }

my $marc2unicode_file  = "marc2unicode.txt";
  print "  MARC-8 to Unicode mapping data file [$marc2unicode_file]: ";
  $answer = <STDIN>;
  chomp $answer;
  if ($answer) {
      $marc2unicode_file = $answer;
  }


my $unicode_file  = "UnicodeData.txt";
  print "  Unicode data file [$unicode_file]: ";
  $answer = <STDIN>;
  chomp $answer;
  if ($answer) {
      $unicode_file = $answer;
  }

#my %data_files = ('MARC' => $marc_file,
#		  'SGML'         => $sgml_file,
#		  'Unicode'      => $unicode_file);
#
#foreach my $key (keys (%data_files)) {
#    print "\n  $key data file name [$data_files{$key}]: ";
#    $answer = <STDIN>;
#    chomp $answer;
#    if ($answer) {
#	$unicode_file = $answer;
#    }
#}

my ($dbh,$sth);

&StartItUp;

sub StartItUp {
    &ConnectMySQL;
    &DBCheck;
    &CreateDB;
    &PopMARC8;
    &PopMARC;
    &PopISO8859_1;
    &PopSGML;
    &PopMarc2Unicode;
    &PopUnicode;
    &DisconnectMySQL;
}

############################################################
#  ConnectMySQL
############################################################
#
#  Connects to the MySQL database

sub ConnectMySQL {
    $dbh = DBI->connect('dbi:mysql:', $username, $password)
	|| die "Could not connect: $DBI::errstr\n";
}


############################################################
#  DisconnectMySQL
############################################################
#
#  Exits gracefully from the MySQL database

sub DisconnectMySQL {
    $sth->finish;
    $dbh->disconnect;
    print "  Done!\n";
}


############################################################
#  DBCheck
############################################################
#
#  Checks for existence of the database.

sub DBCheck {
    my $sql_string = "show databases";

    $sth = $dbh->prepare($sql_string)
	|| die $dbh->errstr;

    $sth->execute
	|| die $dbh->errstr;

    while(my ($show_results) = $sth->fetchrow_array() ) {
	if ($show_results =~ /$database/) {
	    my $detected_db = $show_results;
	    print "\n  WARNING: A $show_results database already exists!\n";
	    print "  Drop the existing $detected_db database? [y|n] ";
	    my $answer = <STDIN>;
	    if ($answer =~ /y/i) {
		$dbh->do("
		    DROP DATABASE $detected_db
		");
		print "  ...existing $detected_db database dropped\n";
            } elsif ($answer =~ /n/i) {
		print "  Aborting program\n";
		&DisconnectMySQL;
		exit(1);
	    } else {
		&DisconnectMySQL;
		print "  Couldn't parse your answer.\n";
		print "  Exiting from $0\n";
		exit(1);
	    }
        }
    }
}


############################################################
#  CreateDB
############################################################
#
#  Creates the database and tables.

sub CreateDB {
    $dbh->do("
	CREATE DATABASE $database
	");
    print "  ...database created: $database\n";
    $dbh->do("
	USE $database
	");
    $dbh->do("
	CREATE TABLE marc	
		(id		INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
		 marc_code	VARCHAR(14) NOT NULL,
		 ucs_code	CHAR(4) NOT NULL,
		 char_name	VARCHAR(128)
		)
	");
    print "  ...table created: $database.marc\n";

    foreach my $key (keys (%marc_8_tables)) {
        $dbh->do("
	    CREATE TABLE $marc_8_tables{$key}	
		(id		INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
		 code		CHAR(2) NOT NULL,
		 ucs_code	CHAR(4) NOT NULL,
		 char_name	VARCHAR(128)
		)
	    ");
        print "  ...table created: $database.$marc_8_tables{$key}\n";
    }

    $dbh->do("
	CREATE TABLE iso8859_1	
		(latin_code	CHAR(2) PRIMARY KEY NOT NULL,
		 ucs_code	CHAR(4) NOT NULL
		)
	");
    print "  ...table created: $database.iso8859_1\n";
    $dbh->do("
	CREATE TABLE sgml	
		(id		INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
		 char_name	VARCHAR(9) NOT NULL,
		 entity_set	VARCHAR(11) NOT NULL,
		 ucs_code	CHAR(4)
		)
	");
    print "  ...table created: $database.sgml\n";
    $dbh->do("
	CREATE TABLE marc2unicode	
		(id		INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
		 code		VARCHAR(6) NOT NULL,
		 ucs_code	CHAR(4) NOT NULL,
		 utf8_code	VARCHAR(6),
		 name		VARCHAR(64),
		 note		VARCHAR(64),
		 alt_ucs_code	VARCHAR(4),
		 alt_utf8_code	VARCHAR(6)
		)
	");
    print "  ...table created: $database.marc2unicode\n";
    $dbh->do("
	CREATE TABLE unicode	
		(ucs_code	VARCHAR(6) PRIMARY KEY NOT NULL,
		 char_name	VARCHAR(128),
		 gen_cat	CHAR(2),
		 combo_class	VARCHAR(3),
		 bidirec_cat	VARCHAR(3),
		 decomp_map	VARCHAR(128),
		 dec_digit_val	VARCHAR(1),
		 digit_val	VARCHAR(1),
		 num_val	VARCHAR(5),
		 mirrored	CHAR(1),
		 one_zero_name	VARCHAR(64),
		 ucs_comment	VARCHAR(64),
		 ucase_map	VARCHAR(5),
		 lcase_map	VARCHAR(5),
		 tcase_map	VARCHAR(5)
		)
	");
    print "  ...table created: $database.unicode\n";
}


############################################################
#  PopMARC8
############################################################
#
#  Populates the MARC_8 tables with data.

sub PopMARC8 {
  foreach my $data_file (keys (%marc_8_tables)) {

    if ($data_file =~ /skip/i) {
	print "  ...SKIPPED populating $database.$marc_8_tables{$data_file}\n";
	return(1);
    }
    my $table = "$database.$marc_8_tables{$data_file}";
    open (INFILE, "<$data_file")
	|| die "  WARNING: Cannot open $data_file: $!\n  Check filename.";
    while (my $line = <INFILE>) {
	chomp $line;
	if (! ($line =~ /^#/)) {
	chomp ($line);
            my ($code, $ucs_code, $char_name) = split(/\t/, $line);
	    my $sql_string = "insert into $table values
		(NULL,'$code','$ucs_code','$char_name')";

	    $sth = $dbh->prepare($sql_string)
		|| die $dbh->errstr;

	    $sth->execute
		|| die $dbh->errstr;
        }
    }
    close (INFILE);
    print "  ...table populated: $table
	(with data from $data_file)\n";
  }
}


############################################################
#  PopMARC
############################################################
#
#  Populates the marc table with data.

sub PopMARC {
    if ($marc_file =~ /skip/i) {
	print "  ...SKIPPED populating $database.marc\n";
	return(1);
    }
    my $table = "$database.marc";
    open (INFILE, "<$marc_control_file")
	|| die "Cannot open input file: $!";
    while (my $line = <INFILE>) {
	chomp $line;
        if ($line && ! ($line =~ /^\#/)) {
            my ($ascii_code,$char_name) = split (/\t/, $line);
            my $ucs_code = "00$ascii_code";
            if ($ascii_code && $ucs_code && $char_name) {
	        my $sql_string = "insert into $table values 
		(NULL,'$ascii_code','$ucs_code', '$char_name')";

	        $sth = $dbh->prepare($sql_string)
		    || die $dbh->errstr;

	        $sth->execute
		    || die $dbh->errstr;
            }
        }
    }
    close (INFILE);
    open (INFILE, "<$iso8859_1_file")
	|| die "Cannot open input file: $!";
    while (my $line = <INFILE>) {
	chomp $line;
        if ($line =~ /<a u=/) {
            my ($ascii_code,$ucs_code);
            if ($line =~ / b=\"(.*?)\"/) {
                $ascii_code = $1;
            }
            if ($line =~ /<a u=\"(.*?)\" /) {
                $ucs_code = $1;
            }
            if (($ascii_code ge "20" && $ascii_code le "7E") && $ucs_code) {
	        my $sql_string = "insert into $table values 
		(NULL,'$ascii_code','$ucs_code', NULL)";

	        $sth = $dbh->prepare($sql_string)
		    || die $dbh->errstr;

	        $sth->execute
		    || die $dbh->errstr;
            }
        }
    }
    close (INFILE);
    open (INFILE, "<$marc_file")
	|| die "Cannot open input file: $!";
    while (my $line = <INFILE>) {
	chomp $line;
        if ($line =~ /^<character/) {
            my ($marc_code,$ucs_code);
            if ($line =~ /hex=\"(.*?)\"/) {
                $marc_code = $1;
                #$marc_code =~ s/ //g;
            } else {
                $marc_code = "";
            }
            if ($line =~ /<unientity>(.*)<\/unientity>/) {
                $ucs_code = $1;
            } else {
                $ucs_code = "";
            }
            if ($marc_code && $ucs_code) {
	        my $sql_string = "insert into $table values 
		(NULL,'$marc_code','$ucs_code','')";

	        $sth = $dbh->prepare($sql_string)
		    || die $dbh->errstr;

	        $sth->execute
		    || die $dbh->errstr;
            }
        }
    }

    close (INFILE);
    print "  ...table populated: $table
	(with data from $iso8859_1_file, $marc_control_file, $marc_file)\n";
}


############################################################
#  PopISO8859_1
############################################################
#
#  Populates the iso8859_1 table with data.

sub PopISO8859_1 {
    if ($iso8859_1_file =~ /skip/i) {
	print "  ...SKIPPED populating $database.iso8859_1\n";
	return(1);
    }
    my $table = "$database.iso8859_1";
    open (INFILE, "<$iso8859_1_file")
	|| die "Cannot open input file: $!";

    while (my $line = <INFILE>) {
	chomp $line;
        if ($line =~ /<a u=/) {
            my ($latin_code,$ucs_code);
            if ($line =~ / b=\"(.*?)\"/) {
                $latin_code = $1;
            }
            if ($line =~ /<a u=\"(.*?)\" /) {
                $ucs_code = $1;
            }
            if ($latin_code && $ucs_code) {
	        my $sql_string = "insert into $table values ('$latin_code','$ucs_code')";

	        $sth = $dbh->prepare($sql_string)
		    || die $dbh->errstr;

	        $sth->execute
		    || die $dbh->errstr;
            }
        }
    }
    close (INFILE);
    print "  ...table populated: $table
	(with data from $iso8859_1_file)\n";
}


############################################################
#  PopSGML
############################################################
#
#  Populates the sgml table with data.

sub PopSGML {
    if ($sgml_file =~ /skip/i) {
	print "  ...SKIPPED populating $database.sgml\n";
	return(1);
    }
    my $table = "$database.sgml";
    open (INFILE, "<$sgml_file")
	|| die "  WARNING: Cannot open $sgml_file: $!\n  Check filename.";
    while (my $line = <INFILE>) {
	chomp $line;
	if (! ($line =~ /^#/)) {
	chomp ($line);
            my ($char_name, $entity_set, $ucs_code) = split(/\t/, $line);
	    if ($ucs_code =~ /\?/) {
		$ucs_code = "";
	    } else {
	        $ucs_code =~ s/0x//;
	    }
	    my $sql_string = "insert into $table values 
		(NULL,'$char_name','$entity_set','$ucs_code')";

	    $sth = $dbh->prepare($sql_string)
		|| die $dbh->errstr;

	    $sth->execute
		|| die $dbh->errstr;
        }
    }
    close (INFILE);
    print "  ...table populated: $table
	(with data from $sgml_file)\n";
}


############################################################
#  PopMarc2Unicode
############################################################
#
#  Populates the sgml table with data.

sub PopMarc2Unicode {
    my $table = "$database.marc2unicode";
    if ($marc2unicode_file =~ /skip/i) {
	print "  ...SKIPPED populating $table\n";
	return(1);
    }
    open (INFILE, "<$marc2unicode_file")
	|| die "  WARNING: Cannot open $marc2unicode_file: $!\n  Check filename.";
    while (my $line = <INFILE>) {
	chomp $line;
	if (! ($line =~ /^#/)) {
	chomp ($line);
            my ($code, $ucs_code, $utf8_code, $name, $note, $alt_ucs_code, 
		$alt_utf8_code) = split(/\t/, $line);
	    my $sql_string = "insert into $table values 
		(NULL,'$code','$ucs_code','$utf8_code','$name','$note',
		'$alt_ucs_code','$alt_utf8_code')";

	    $sth = $dbh->prepare($sql_string)
		|| die $dbh->errstr;

	    $sth->execute
		|| die $dbh->errstr;
        }
    }
    close (INFILE);
    print "  ...table populated: $table
	(with data from $marc2unicode_file)\n";
}


############################################################
#  PopUnicode
############################################################
#
#  Populates the unicode table with data.

sub PopUnicode {
    if ($unicode_file =~ /skip/i) {
	print "  ...SKIPPED populating $database.unicode\n";
	return(1);
    }
    my $table = "$database.unicode";
    open (INFILE, "<$unicode_file")
	|| die "  WARNING: Cannot open $unicode_file: $!\n  Check filename.";
    while (my $line = <INFILE>) {
	chomp $line;
	if (! ($line =~ /^#/)) {
	chomp ($line);
            my ($ucs_code, $char_name, $gen_cat, $combo_class,
	        $bidirect_cat, $decomp_map, $dec_digit_val,
		$digit_val, $num_val, $mirrored, $one_zero_name,
		$ucs_comment, $ucase_map, $lcase_map, $tcase_map) 
		= split(/;/, $line);
	    my $sql_string = "insert into $table values 
               ('$ucs_code','$char_name','$gen_cat','$combo_class',
	        '$bidirect_cat','$decomp_map','$dec_digit_val',
		'$digit_val','$num_val','$mirrored','$one_zero_name',
		'$ucs_comment','$ucase_map','$lcase_map','$tcase_map')";

	    $sth = $dbh->prepare($sql_string)
		|| die $dbh->errstr;

	    $sth->execute
		|| die $dbh->errstr;
        }
    }
    close (INFILE);
    print "  ...table populated: $table
	(with data from $unicode_file)\n";
}