########################################################################
#
# MARCtoLatin.pl
#
# Version: 1.1
#
# Created by Michael Doran, doran@uta.edu
#
# University of Texas at Arlington Libraries
# Box 19497, Arlington, TX 76019, USA
#
# This function was originally written for newbooks.pl, a component
# of the New Books List, an unofficial add-on for the Voyager
# integrated library management system. For more information on
# the New Books List, see http://rocky.uta.edu/doran/autolist/.
#
########################################################################
#
# Changes:
#
# 1.1 Per Michael Koehn suggestion, added check for presence of
# high-order bytes before attempting to do any translation
# and simply return line if none found.
#
########################################################################
########################################################################
#
# Using the function
#
# To use this function, select the desired character set below and
# then add this line to your program: require "MARCtoLatin.pl";
#
########################################################################
########################################################################
#
# This function takes a string extracted from a MARC record
# and converts it from its native character set (e.g. MARC-8
# for USMARC records) to Latin-1 (ISO-8859-1).
#
# Note: Be aware that not all MARC8 characters can be converted,
# since there are more combinations available in MARC8 than have
# Latin 1 equivalents. Non-spacing characters that can not be
# converted, are stripped out. Therefore this is not recommended
# for programs/projects that will eventually return the data to
# a library database. The original intention was a one-way
# conversion to make MARC data palatable for display on the web.
#
# Most USA/Great Britain/Australia/New Zealand sites will
# want to select the marc8 option.
#
# Finnish sites will have a choice of marc8-fin or finmarc
# depending on the type of bib records used.
#
# The ASCII option removes all the non-spacing MARC-8
# characters: you won't get any nonsense characters, nor
# will you end up with any diacritics or special characters.
# There may be valid reasons to choose this option related
# to sorting and/or keyword searching.
#
# If your records have been converted to Unicode you probably
# don't want any conversion done.
#
# If your choose to end
# catalog uses this option up with
# ------------- ----------- -------
# MARC 21 records -> "marc8" -> Latin-1
# MARC 21 w/ Finnish -> "marc8-fin" -> Latin-1
# FINMARC records -> "finmarc" -> Latin-1
# (any record type) -> "strip" -> ASCII
# ** Unicode ** -> "passthru" -> No change
#
# Choices are marc8, marc8-fin, finmarc, or strip.
my $character_set = "marc8";
# Within your program, you would pass the function a
# string that you would like to be converted.
# Something like this...
# my $ConvertedAuthor = &CharConv($author);
sub CharConv {
my ($line) = @_;
if ($character_set eq "passthru") {
return($line);
}
# Check for high-order characters ( > hex 7F). If none found,
# no conversion needed. (Added by MK, 2005)
if ($line !~ /[\x80-\xff]/) {
return($line);
}
# Convert input string to hexadecimal
$line =~ s/(.)/sprintf ("%%%X", ord($1))/eg;
if ($character_set eq "finmarc") {
# This subroutine takes data encoded with the FINMARC
# character set (ISO-6937/2) and (as best as can be done)
# maps it to the ISO 8859-1 (Latin-1) character set.
my %iso6937_to_latin1_doubles = (
'%E6' => '%49%4A', '%EF' => '%27%6E', '%F6' => '%69%6A',
);
my %iso6937_to_latin1 = (
'%A4' => '24', '%C8%20' => 'A8', '%D3' => 'A9',
'%E3' => 'AA', '%D2' => 'AE', '%C2%20' => 'B4',
'%CB%20' => 'B8', '%D1' => 'B9', '%EB' => 'BA',
'%C1%41' => 'C0', '%C2%41' => 'C1', '%C3%41' => 'C2',
'%C4%41' => 'C3', '%C8%41' => 'C4', '%CA%41' => 'C5',
'%E1' => 'C6', '%CB%43' => 'C7', '%C1%45' => 'C8',
'%C2%45' => 'C9', '%C3%45' => 'CA', '%C8%45' => 'CB',
'%C1%49' => 'CC', '%C2%49' => 'CD', '%C3%49' => 'CE',
'%C8%49' => 'CF', '%E2' => 'D0', '%C4%4E' => 'D1',
'%C1%4F' => 'D2', '%C2%4F' => 'D3', '%C3%4F' => 'D4',
'%C4%4F' => 'D5', '%C8%4F' => 'D6', '%B4' => 'D7',
'%E9' => 'D8', '%C1%55' => 'D9', '%C2%55' => 'DA',
'%C3%55' => 'DB', '%C8%55' => 'DC', '%C2%59' => 'DD',
'%EC' => 'DE', '%FB' => 'DF', '%C1%61' => 'E0',
'%C2%61' => 'E1', '%C3%61' => 'E2', '%C4%61' => 'E3',
'%C8%61' => 'E4', '%CA%61' => 'E5', '%F1' => 'E6',
'%CB%63' => 'E7', '%C1%65' => 'E8', '%C2%65' => 'E9',
'%C3%65' => 'EA', '%C8%65' => 'EB', '%C1%69' => 'EC',
'%C2%69' => 'ED', '%C3%69' => 'EE', '%C8%69' => 'EF',
'%F3' => 'F0', '%C4%6E' => 'F1', '%C1%6F' => 'F2',
'%C2%6F' => 'F3', '%C3%6F' => 'F4', '%C4%6F' => 'F5',
'%C8%6F' => 'F6', '%B8' => 'F7', '%F9' => 'F8',
'%C1%75' => 'F9', '%C2%75' => 'FA', '%C3%75' => 'FB',
'%C8%75' => 'FC', '%C2%79' => 'FD', '%FC' => 'FE',
'%C8%79' => 'FF', '%A9' => '27', '%AA' => '22',
'%B9' => '27', '%BA' => '22', '%E4' => '48',
'%E7' => '4C', '%E8' => '4C', '%ED' => '54',
'%EE' => '4E', '%F2' => '64', '%F4' => '68',
'%F5' => '69', '%F7' => '6C', '%F8' => '6C',
'%FD' => '74', '%EE' => '6E'
);
my @orphan_chars_iso6937 = (
'A6', 'AC', 'AD', 'AF', 'C1', 'C2', 'C3', 'C4', 'C5',
'C6', 'C7', 'C8', 'CA', 'CB', 'CC', 'CD', 'CE', 'CF',
'D4', 'D5', 'DC', 'DD', 'DE', 'DF', 'E0'
);
foreach my $finmarc_char (keys (%iso6937_to_latin1_doubles)) {
$line =~ s/$finmarc_char/$iso6937_to_latin1_doubles{$finmarc_char}/eg;
}
foreach my $finmarc_char2 (keys (%iso6937_to_latin1)) {
$line =~ s/$finmarc_char2/pack("C", hex($iso6937_to_latin1{$finmarc_char2}))/eg;
}
foreach my $orphan (@orphan_chars_iso6937) {
$line =~ s/$orphan//g;
}
} elsif ($character_set eq "marc8" ||
$character_set eq "marc8-fin") {
# This subroutine takes data encoded with the MARC-8
# character set and (as best as can be done) maps it
# to the ISO 8859-1 (Latin-1) character set.
# First, combined MARC-8 characters that don't
# correspond to any Latin-1 characters are removed.
# Then combined MARC characters that correspond to
# extended Latin-1 characters are replaced with the
# appropriate values. This process is then repeated
# for single values.
# MARC8-fin records get a few additional C1 control
# characters translated into graphic characters.
my @orphan_chars_combined = (
'%1B%67%61%1B%73', '%1B%67%62%1B%73', '%1B%67%63%1B%73',
'%1B%62%30%1B%73', '%1B%62%31%1B%73', '%1B%62%32%1B%73',
'%1B%62%33%1B%73', '%1B%62%34%1B%73', '%1B%62%35%1B%73',
'%1B%62%36%1B%73', '%1B%62%37%1B%73', '%1B%62%38%1B%73',
'%1B%62%39%1B%73', '%1B%62%28%1B%73', '%1B%62%2B%1B%73',
'%1B%62%29%1B%73', '%1B%70%30%1B%73', '%1B%70%31%1B%73',
'%1B%70%32%1B%73', '%1B%70%33%1B%73', '%1B%70%34%1B%73',
'%1B%70%35%1B%73', '%1B%70%36%1B%73', '%1B%70%37%1B%73',
'%1B%70%38%1B%73', '%1B%70%39%1B%73', '%1B%70%28%1B%73',
'%1B%70%2D%1B%73', '%1B%70%2B%1B%73', '%1B%70%29%1B%73',
'%E0%E6', '%E1%E3', '%E1%E5', '%E1%E6', '%E1%E8',
'%E2%A5', '%E2%B5', '%E2%E4', '%E2%E5', '%E2%E6',
'%E2%E8', '%E2%EA', '%E2%F0', '%E3%E0', '%E3%E1',
'%E3%E2', '%E3%F2', '%E4%E3', '%E4%E6', '%E5%E4',
'%E5%E7', '%E5%E8', '%E5%A5', '%E5%B5', '%E5%F1',
'%E5%F2', '%E6%F0', '%E6%F2', '%E7%E2', '%E7%F2',
'%E8%E4', '%E8%E5', '%E9%E8'
);
my @orphan_chars_single = (
'%A1','%A3','%A6','%A7','%A9','%AC','%AD','%AE',
'%AF','%B0','%B1','%B3','%B6','%B7','%B8','%BB',
'%BC','%BD','%BE','%BF','%C1','%C2','%C4','%E0',
'%E5','%E6','%E7','%E9','%EB','%EC','%ED','%EE',
'%EF','%F1','%F2','%F3','%F4','%F5','%F6','%F7',
'%F8','%F9','%FA','%FB','%FC','%FD','%FE'
);
my %marc8_to_latin1_combined = (
'%1B%70%32%1B%73' => 'B2', '%1B%70%33%1B%73' => 'B3',
'%1B%70%31%1B%73' => 'B9', '%E1%41' => 'C0',
'%E2%41' => 'C1', '%E3%41' => 'C2', '%E4%41' => 'C3',
'%E8%41' => 'C4', '%EA%41' => 'C5', '%E2%43' => '43',
'%E3%43' => '43', '%F0%43' => 'C7', '%E1%45' => 'C8',
'%E2%45' => 'C9', '%E3%45' => 'CA', '%E4%45' => '45',
'%E8%45' => 'CB', '%F0%45' => '45', '%E2%47' => '47',
'%E3%47' => '47', '%F0%47' => '47', '%E3%48' => '48',
'%E8%48' => '48', '%F0%48' => '48', '%E1%49' => 'CC',
'%E2%49' => 'CD', '%E3%49' => 'CE', '%E4%49' => '49',
'%E8%49' => 'CF', '%E3%4A' => '4A', '%E2%4B' => '4B',
'%E3%4B' => '4B', '%F0%4B' => '4B', '%F2%4B' => '4B',
'%E2%4C' => '4C', '%E3%4C' => '4C', '%F0%4C' => '4C',
'%E2%4D' => '4D', '%E1%4E' => '4E', '%E2%4E' => '4E',
'%E4%4E' => 'D1', '%F0%4E' => '4E', '%E1%4F' => 'D2',
'%E2%4F' => 'D3', '%E3%4F' => 'D4', '%E4%4F' => 'D5',
'%E8%4F' => 'D6', '%E2%50' => '50', '%E2%52' => '52',
'%E2%53' => '53', '%E3%53' => '53', '%F0%53' => '53',
'%F0%54' => '54', '%E1%55' => 'D9', '%E2%55' => 'DA',
'%E3%55' => 'DB', '%E4%55' => '55', '%E8%55' => 'DC',
'%EA%55' => '55', '%E4%56' => '56', '%E1%57' => '57',
'%E2%57' => '57', '%E3%57' => '57', '%E8%57' => '57',
'%E8%58' => '58', '%E1%59' => '59', '%E2%59' => 'DD',
'%E3%59' => '59', '%E4%59' => '59', '%E8%59' => '59',
'%E2%5A' => '5A', '%E3%5A' => '5A', '%E1%61' => 'E0',
'%E2%61' => 'E1', '%E3%61' => 'E2', '%E4%61' => 'E3',
'%E8%61' => 'E4', '%EA%61' => 'E5', '%E2%63' => '63',
'%E3%63' => '63', '%F0%63' => 'E7', '%E1%65' => 'E8',
'%E2%65' => 'E9', '%E3%65' => 'EA', '%E4%65' => '65',
'%E8%65' => 'EB', '%F0%65' => '65', '%E2%67' => '67',
'%E3%67' => '67', '%F0%67' => '67', '%E3%68' => '68',
'%E8%68' => '68', '%F0%68' => '68', '%E1%69' => 'EC',
'%E2%69' => 'ED', '%E3%69' => 'EE', '%E4%69' => '69',
'%E8%69' => 'EF', '%E3%6A' => '6A', '%E2%6B' => '6B',
'%E3%6B' => '6B', '%F0%6B' => '6B', '%F2%6B' => '6B',
'%E2%6C' => '6C', '%E3%6C' => '6C', '%F0%6C' => '6C',
'%E2%6D' => '6D', '%E1%6E' => '6E', '%E2%6E' => '6E',
'%E4%6E' => 'F1', '%F0%6E' => '6E', '%E1%6F' => 'F2',
'%E2%6F' => 'F3', '%E3%6F' => 'F4', '%E4%6F' => 'F5',
'%E8%6F' => 'F6', '%E2%70' => '70', '%E2%72' => '72',
'%E2%73' => '73', '%E3%73' => '73', '%F0%73' => '73',
'%E8%74' => '74', '%F0%74' => '74', '%E1%75' => 'F9',
'%E2%75' => 'FA', '%E3%75' => 'FB', '%E4%75' => '75',
'%E8%75' => 'FC', '%EA%75' => '75', '%E4%76' => '76',
'%E1%77' => '77', '%E2%77' => '77', '%E3%77' => '77',
'%E8%77' => '77', '%EA%77' => '77', '%E8%78' => '78',
'%E1%79' => '79', '%E2%79' => 'FD', '%E3%79' => '79',
'%E4%79' => '79', '%E8%79' => '79', '%EA%79' => '79',
'%E8%79' => 'FF', '%E2%7A' => '7A', '%E3%7A' => '7A',
'%E2%A2' => '4F', '%E1%AC' => '4F', '%E2%AC' => '4F',
'%E4%AC' => '4F', '%E1%AD' => '55', '%E2%AD' => '55',
'%E4%AD' => '55', '%E2%B2' => '6F', '%E1%BC' => '6F',
'%E2%BC' => '6F', '%E4%BC' => '6F', '%E1%BD' => '75',
'%E2%BD' => '75', '%E4%BD' => '75'
);
my %marc8_to_latin1_single = (
'%A2' => 'D8', '%A4' => 'DE', '%A5' => 'C6',
'%A8' => 'B7', '%AA' => 'AE', '%AB' => 'B1',
'%B2' => 'F8', '%B4' => 'FE', '%B5' => 'E6',
'%B9' => 'A3', '%BA' => 'F0', '%C0' => 'B0',
'%C3' => 'A9', '%C5' => 'BF', '%C6' => 'A1'
);
my %marc8_fin_to_latin1_single = (
'%83' => 'C5', '%84' => 'C4', '%85' => 'D6',
'%86' => 'E5', '%87' => 'E4', '%88' => 'F6'
);
foreach my $char1 (@orphan_chars_combined) {
$line =~ s/$char1//g;
}
foreach my $marc_char1 (keys (%marc8_to_latin1_combined)) {
$line =~ s/$marc_char1/pack("C", hex($marc8_to_latin1_combined{$marc_char1}))/eg;
}
foreach my $char2 (@orphan_chars_single) {
$line =~ s/$char2//g;
}
foreach my $marc_char2 (keys (%marc8_to_latin1_single)) {
$line =~ s/$marc_char2/pack("C", hex($marc8_to_latin1_single{$marc_char2}))/eg;
}
if ($character_set eq "marc8-fin") {
foreach my $marc_char3 (keys (%marc8_fin_to_latin1_single)) {
$line =~ s/$marc_char3/pack("C", hex($marc8_fin_to_latin1_single{$marc_char3}))/eg;
}
}
} elsif ($character_set eq "strip") {
$line =~ s/%[A-F8-9][A-F0-9]//g;
}
$line =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/eg;
return($line);
}
return 1;
|