######################################################################## # # MARCtoLatin.pl # # Version: 1.1 # # Created by Michael Doran, doran@uta.edu # # University of Texas at Arlington Libraries # Box 19497, Arlington, TX 76019, USA # # This function was originally written for newbooks.pl, a component # of the New Books List, an unofficial add-on for the Voyager # integrated library management system. For more information on # the New Books List, see http://rocky.uta.edu/doran/autolist/. # ######################################################################## # # Copyright 2002, The University of Texas at Arlington ("UTA"). # All rights reserved. # # By using this software the USER indicates that he or she # has read, understood and and will comply with the following: # # UTA hereby grants USER permission to use, copy, modify, and # distribute this software and its documentation for any # purpose and without fee, provided that: # # 1. the above copyright notice appears in all copies of the # software and its documentation, or portions thereof, and # # 2. a full copy of this notice is included with the software # and its documentation, or portions thereof, and # # 3. neither the software nor its documentation, nor portions # thereof, is sold for profit. Any commercial sale or license # of this software, copies of the software, its associated # documentation and/or modifications of either is strictly # prohibited without the prior consent of UTA. # # Title to copyright to this software and its associated # documentation shall at all times remain with UTA. No right # is granted to use in advertising, publicity or otherwise any # trademark, service mark, or the name of UTA. 
# # This software and any associated documentation are provided # "as is," and UTA MAKES NO REPRESENTATIONS OR WARRANTIES, # EXPRESSED OR IMPLIED, INCLUDING THOSE OF MERCHANTABILITY OR # FITNESS FOR A PARTICULAR PURPOSE, OR THAT USE OF THE SOFTWARE, # MODIFICATIONS, OR ASSOCIATED DOCUMENTATION WILL NOT INFRINGE # ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER INTELLECTUAL # PROPERTY RIGHTS OF A THIRD PARTY. UTA, The University of Texas # System, its Regents, officers, and employees shall not be # liable under any circumstances for any direct, indirect, special, # incidental, or consequential damages with respect to any claim # by USER or any third party on account of or arising from the # use, or inability to use, this software or its associated # documentation, even if UTA has been advised of the possibility # of those damages. # # Submit commercialization requests to: The University of Texas # at Arlington, Office of Grant and Contract Services, 701 South # Nedderman Drive, Box 19145, Arlington, Texas 76019-0145, # ATTN: Director of Technology Transfer. # ######################################################################## # # Changes: # # 1.1 Per Michael Koehn suggestion, added check for presence of # high-order bytes before attempting to do any translation # and simply return line if none found. # ######################################################################## ######################################################################## # # Using the function # # To use this function, select the desired character set below and # then add this line to your program: require "MARCtoLatin.pl"; # ######################################################################## ######################################################################## # # This function takes a string extracted from a MARC record # and converts it from its native character set (e.g. MARC-8 # for USMARC records) to Latin-1 (ISO-8859-1). Latin-1 is # the base character set for Unicode. 
#
#  Note: Be aware that not all MARC-8 characters can be converted,
#  since there are more combinations available in MARC-8 than have
#  Latin-1 equivalents.  Non-spacing characters that cannot be
#  converted are stripped out.  Therefore this is not recommended
#  for programs/projects that will eventually return the data to
#  a library database.  The original intention was a one-way
#  conversion to make MARC data palatable for display on the web.
#
#  Most USA/Great Britain/Australia/New Zealand sites will
#  want to select the marc8 option.
#
#  Finnish sites will have a choice of marc8-fin or finmarc
#  depending on the type of bib records used.
#
#  The ASCII option ("strip") removes all the high-order MARC-8
#  bytes: you won't get any nonsense characters, nor will you end
#  up with any diacritics or special characters.  There may be
#  valid reasons to choose this option related to sorting and/or
#  keyword searching.
#
#  If your records have been converted to Unicode you probably
#  don't want any conversion done.
#
#    If your                choose             to end
#    catalog uses           this option        up with
#    -------------          -----------        -------
#    MARC 21 records     -> "marc8"         -> Latin-1
#    MARC 21 w/ Finnish  -> "marc8-fin"     -> Latin-1
#    FINMARC records     -> "finmarc"       -> Latin-1
#    (any record type)   -> "strip"         -> ASCII
#    ** Unicode **       -> "passthru"      -> No change
#
#  Choices are marc8, marc8-fin, finmarc, strip, or passthru.

use strict;
use warnings;

my $character_set = "marc8";

########################################################################
#
#  Translation tables
#
#  Every table is keyed on (or lists) "%XX" tokens: CharConv first
#  rewrites each input character as "%" plus its two-digit upper-case
#  hexadecimal code, so a multi-byte sequence is simply several
#  tokens in a row.  Map values are the hexadecimal code of the
#  Latin-1 replacement, except in the "doubles" table where the value
#  is itself a run of "%XX" tokens (one input character becoming two
#  output characters).
#
########################################################################

# FINMARC (ISO 6937/2): single characters that expand to two characters.
my %iso6937_to_latin1_doubles = (
    '%E6' => '%49%4A',
    '%EF' => '%27%6E',
    '%F6' => '%69%6A',
);

# FINMARC (ISO 6937/2): sequences that map to one Latin-1 character.
my %iso6937_to_latin1 = (
    '%A4'    => '24',
    '%C8%20' => 'A8',
    '%D3'    => 'A9',
    '%E3'    => 'AA',
    '%D2'    => 'AE',
    '%C2%20' => 'B4',
    '%CB%20' => 'B8',
    '%D1'    => 'B9',
    '%EB'    => 'BA',
    '%C1%41' => 'C0', '%C2%41' => 'C1', '%C3%41' => 'C2',
    '%C4%41' => 'C3', '%C8%41' => 'C4', '%CA%41' => 'C5',
    '%E1'    => 'C6',
    '%CB%43' => 'C7',
    '%C1%45' => 'C8', '%C2%45' => 'C9', '%C3%45' => 'CA',
    '%C8%45' => 'CB',
    '%C1%49' => 'CC', '%C2%49' => 'CD', '%C3%49' => 'CE',
    '%C8%49' => 'CF',
    '%E2'    => 'D0',
    '%C4%4E' => 'D1',
    '%C1%4F' => 'D2', '%C2%4F' => 'D3', '%C3%4F' => 'D4',
    '%C4%4F' => 'D5', '%C8%4F' => 'D6',
    '%B4'    => 'D7',
    '%E9'    => 'D8',
    '%C1%55' => 'D9', '%C2%55' => 'DA', '%C3%55' => 'DB',
    '%C8%55' => 'DC',
    '%C2%59' => 'DD',
    '%EC'    => 'DE',
    '%FB'    => 'DF',
    '%C1%61' => 'E0', '%C2%61' => 'E1', '%C3%61' => 'E2',
    '%C4%61' => 'E3', '%C8%61' => 'E4', '%CA%61' => 'E5',
    '%F1'    => 'E6',
    '%CB%63' => 'E7',
    '%C1%65' => 'E8', '%C2%65' => 'E9', '%C3%65' => 'EA',
    '%C8%65' => 'EB',
    '%C1%69' => 'EC', '%C2%69' => 'ED', '%C3%69' => 'EE',
    '%C8%69' => 'EF',
    '%F3'    => 'F0',
    '%C4%6E' => 'F1',
    '%C1%6F' => 'F2', '%C2%6F' => 'F3', '%C3%6F' => 'F4',
    '%C4%6F' => 'F5', '%C8%6F' => 'F6',
    '%B8'    => 'F7',
    '%F9'    => 'F8',
    '%C1%75' => 'F9', '%C2%75' => 'FA', '%C3%75' => 'FB',
    '%C8%75' => 'FC',
    '%C2%79' => 'FD',
    '%FC'    => 'FE',
    '%C8%79' => 'FF',
    '%A9'    => '27', '%AA' => '22', '%B9' => '27', '%BA' => '22',
    '%E4'    => '48', '%E7' => '4C', '%E8' => '4C', '%ED' => '54',
    '%EE'    => '4E',
    '%F2'    => '64', '%F4' => '68', '%F5' => '69', '%F7' => '6C',
    '%F8'    => '6C', '%FD' => '74',
    # Was a second '%EE' key, which silently overrode the '4E' entry
    # above and left 0xFE unmapped.
    '%FE'    => '6E',
);

# FINMARC (ISO 6937/2): tokens with no Latin-1 equivalent; removed
# after the maps above have been applied.  Each entry carries the "%"
# prefix so the whole token is removed rather than just its digits.
my @orphan_chars_iso6937 = (
    '%A6', '%AC', '%AD', '%AF',
    '%C1', '%C2', '%C3', '%C4', '%C5', '%C6', '%C7', '%C8',
    '%CA', '%CB', '%CC', '%CD', '%CE', '%CF',
    '%D4', '%D5', '%DC', '%DD', '%DE', '%DF',
    '%E0',
);

# MARC-8: multi-byte sequences with no Latin-1 equivalent.  Removed
# before anything else.  The ESC p 1/2/3 ESC s sequences are NOT
# listed here because %marc8_to_latin1_combined converts them.
my @orphan_chars_combined = (
    '%1B%67%61%1B%73', '%1B%67%62%1B%73', '%1B%67%63%1B%73',
    '%1B%62%30%1B%73', '%1B%62%31%1B%73', '%1B%62%32%1B%73',
    '%1B%62%33%1B%73', '%1B%62%34%1B%73', '%1B%62%35%1B%73',
    '%1B%62%36%1B%73', '%1B%62%37%1B%73', '%1B%62%38%1B%73',
    '%1B%62%39%1B%73', '%1B%62%28%1B%73', '%1B%62%2B%1B%73',
    '%1B%62%29%1B%73',
    '%1B%70%30%1B%73', '%1B%70%34%1B%73', '%1B%70%35%1B%73',
    '%1B%70%36%1B%73', '%1B%70%37%1B%73', '%1B%70%38%1B%73',
    '%1B%70%39%1B%73', '%1B%70%28%1B%73', '%1B%70%2D%1B%73',
    '%1B%70%2B%1B%73', '%1B%70%29%1B%73',
    '%E0%E6', '%E1%E3', '%E1%E5', '%E1%E6', '%E1%E8', '%E2%A5',
    '%E2%B5', '%E2%E4', '%E2%E5', '%E2%E6', '%E2%E8', '%E2%EA',
    '%E2%F0', '%E3%E0', '%E3%E1', '%E3%E2', '%E3%F2', '%E4%E3',
    '%E4%E6', '%E5%E4', '%E5%E7', '%E5%E8', '%E5%A5', '%E5%B5',
    '%E5%F1', '%E5%F2', '%E6%F0', '%E6%F2', '%E7%E2', '%E7%F2',
    '%E8%E4', '%E8%E5', '%E9%E8',
);

# MARC-8: single tokens with no Latin-1 equivalent.  Removed after the
# combined map has been applied.
# NOTE(review): the combining marks %E1-%E4, %E8, %EA and %F0 are not
# listed, so one that is left over after the combined pass is emitted
# as a raw byte -- confirm whether they should be stripped as well.
my @orphan_chars_single = (
    '%A1', '%A3', '%A6', '%A7', '%A9', '%AC', '%AD', '%AE',
    '%AF', '%B0', '%B1', '%B3', '%B6', '%B7', '%B8', '%BB',
    '%BC', '%BD', '%BE', '%BF', '%C1', '%C2', '%C4', '%E0',
    '%E5', '%E6', '%E7', '%E9', '%EB', '%EC', '%ED', '%EE',
    '%EF', '%F1', '%F2', '%F3', '%F4', '%F5', '%F6', '%F7',
    '%F8', '%F9', '%FA', '%FB', '%FC', '%FD', '%FE',
);

# MARC-8: multi-byte sequences that map to one Latin-1 character.
my %marc8_to_latin1_combined = (
    '%1B%70%32%1B%73' => 'B2',
    '%1B%70%33%1B%73' => 'B3',
    '%1B%70%31%1B%73' => 'B9',
    '%E1%41' => 'C0', '%E2%41' => 'C1', '%E3%41' => 'C2',
    '%E4%41' => 'C3', '%E8%41' => 'C4', '%EA%41' => 'C5',
    '%E2%43' => '43', '%E3%43' => '43', '%F0%43' => 'C7',
    '%E1%45' => 'C8', '%E2%45' => 'C9', '%E3%45' => 'CA',
    '%E4%45' => '45', '%E8%45' => 'CB', '%F0%45' => '45',
    '%E2%47' => '47', '%E3%47' => '47', '%F0%47' => '47',
    '%E3%48' => '48', '%E8%48' => '48', '%F0%48' => '48',
    '%E1%49' => 'CC', '%E2%49' => 'CD', '%E3%49' => 'CE',
    '%E4%49' => '49', '%E8%49' => 'CF',
    '%E3%4A' => '4A',
    '%E2%4B' => '4B', '%E3%4B' => '4B', '%F0%4B' => '4B',
    '%F2%4B' => '4B',
    '%E2%4C' => '4C', '%E3%4C' => '4C', '%F0%4C' => '4C',
    '%E2%4D' => '4D',
    '%E1%4E' => '4E', '%E2%4E' => '4E', '%E4%4E' => 'D1',
    '%F0%4E' => '4E',
    '%E1%4F' => 'D2', '%E2%4F' => 'D3', '%E3%4F' => 'D4',
    '%E4%4F' => 'D5', '%E8%4F' => 'D6',
    '%E2%50' => '50',
    '%E2%52' => '52',
    '%E2%53' => '53', '%E3%53' => '53', '%F0%53' => '53',
    '%F0%54' => '54',
    '%E1%55' => 'D9', '%E2%55' => 'DA', '%E3%55' => 'DB',
    '%E4%55' => '55', '%E8%55' => 'DC', '%EA%55' => '55',
    '%E4%56' => '56',
    '%E1%57' => '57', '%E2%57' => '57', '%E3%57' => '57',
    '%E8%57' => '57',
    '%E8%58' => '58',
    '%E1%59' => '59', '%E2%59' => 'DD', '%E3%59' => '59',
    '%E4%59' => '59', '%E8%59' => '59',
    '%E2%5A' => '5A', '%E3%5A' => '5A',
    '%E1%61' => 'E0', '%E2%61' => 'E1', '%E3%61' => 'E2',
    '%E4%61' => 'E3', '%E8%61' => 'E4', '%EA%61' => 'E5',
    '%E2%63' => '63', '%E3%63' => '63', '%F0%63' => 'E7',
    '%E1%65' => 'E8', '%E2%65' => 'E9', '%E3%65' => 'EA',
    '%E4%65' => '65', '%E8%65' => 'EB', '%F0%65' => '65',
    '%E2%67' => '67', '%E3%67' => '67', '%F0%67' => '67',
    '%E3%68' => '68', '%E8%68' => '68', '%F0%68' => '68',
    '%E1%69' => 'EC', '%E2%69' => 'ED', '%E3%69' => 'EE',
    '%E4%69' => '69', '%E8%69' => 'EF',
    '%E3%6A' => '6A',
    '%E2%6B' => '6B', '%E3%6B' => '6B', '%F0%6B' => '6B',
    '%F2%6B' => '6B',
    '%E2%6C' => '6C', '%E3%6C' => '6C', '%F0%6C' => '6C',
    '%E2%6D' => '6D',
    '%E1%6E' => '6E', '%E2%6E' => '6E', '%E4%6E' => 'F1',
    '%F0%6E' => '6E',
    '%E1%6F' => 'F2', '%E2%6F' => 'F3', '%E3%6F' => 'F4',
    '%E4%6F' => 'F5', '%E8%6F' => 'F6',
    '%E2%70' => '70',
    '%E2%72' => '72',
    '%E2%73' => '73', '%E3%73' => '73', '%F0%73' => '73',
    '%E8%74' => '74', '%F0%74' => '74',
    '%E1%75' => 'F9', '%E2%75' => 'FA', '%E3%75' => 'FB',
    '%E4%75' => '75', '%E8%75' => 'FC', '%EA%75' => '75',
    '%E4%76' => '76',
    '%E1%77' => '77', '%E2%77' => '77', '%E3%77' => '77',
    '%E8%77' => '77', '%EA%77' => '77',
    '%E8%78' => '78',
    # '%E8%79' used to appear twice ('79' and 'FF'); only the later
    # 'FF' entry ever took effect, so the dead one has been dropped.
    '%E1%79' => '79', '%E2%79' => 'FD', '%E3%79' => '79',
    '%E4%79' => '79', '%EA%79' => '79', '%E8%79' => 'FF',
    '%E2%7A' => '7A', '%E3%7A' => '7A',
    '%E2%A2' => '4F',
    '%E1%AC' => '4F', '%E2%AC' => '4F', '%E4%AC' => '4F',
    '%E1%AD' => '55', '%E2%AD' => '55', '%E4%AD' => '55',
    '%E2%B2' => '6F',
    '%E1%BC' => '6F', '%E2%BC' => '6F', '%E4%BC' => '6F',
    '%E1%BD' => '75', '%E2%BD' => '75', '%E4%BD' => '75',
);

# MARC-8: single tokens that map to one Latin-1 character.
my %marc8_to_latin1_single = (
    '%A2' => 'D8', '%A4' => 'DE', '%A5' => 'C6', '%A8' => 'B7',
    '%AA' => 'AE', '%AB' => 'B1', '%B2' => 'F8', '%B4' => 'FE',
    '%B5' => 'E6', '%B9' => 'A3', '%BA' => 'F0', '%C0' => 'B0',
    '%C3' => 'A9', '%C5' => 'BF', '%C6' => 'A1',
);

# MARC-8 with Finnish additions: extra tokens translated only when
# the character set is "marc8-fin".
my %marc8_fin_to_latin1_single = (
    '%83' => 'C5', '%84' => 'C4', '%85' => 'D6',
    '%86' => 'E5', '%87' => 'E4', '%88' => 'F6',
);

# Remove every occurrence of each listed "%XX" sequence from $encoded,
# processing the sequences in the order given.  Returns the new string.
sub _strip_sequences {
    my ($encoded, @sequences) = @_;

    for my $sequence (@sequences) {
        $encoded =~ s/\Q$sequence\E//g;
    }
    return $encoded;
}

# Replace every occurrence of each key of %$latin1_for in $encoded
# with the single character whose hexadecimal code is the value.
# The replacement is inserted as a raw character (not as a "%XX"
# token) so that later strip/map passes cannot touch it again.
# Returns the new string.
sub _map_sequences {
    my ($encoded, $latin1_for) = @_;

    for my $sequence (keys %{$latin1_for}) {
        my $char = pack("C", hex($latin1_for->{$sequence}));
        $encoded =~ s/\Q$sequence\E/$char/g;
    }
    return $encoded;
}

# CharConv($line [, $charset])
#
# Converts a string extracted from a MARC record to Latin-1
# (ISO-8859-1), or to plain ASCII for the "strip" option.
#
# Parameters:
#   $line    - the string to convert.
#   $charset - optional; one of "marc8", "marc8-fin", "finmarc",
#              "strip" or "passthru".  When omitted, the file-level
#              $character_set setting is used, so existing one-argument
#              callers behave as before.
#
# Returns the converted string.  The input is returned unchanged when
# it is undef, when the character set is "passthru", or when it
# contains no character in the range hex 80-FF.  For an unrecognised
# character set no translation table is applied.
#
# Within your program, you would pass the function a string that you
# would like to be converted.  Something like this...
#     my $ConvertedAuthor = CharConv($author);
sub CharConv {
    my ($line, $charset) = @_;
    $charset = $character_set unless defined $charset;

    return $line unless defined $line;
    return $line if $charset eq "passthru";

    # Check for high-order characters ( > hex 7F).  If none found,
    # no conversion needed.  (Added by MK, 2005)
    return $line if $line !~ /[\x80-\xff]/;

    # Convert input string to hexadecimal.  The field is padded to two
    # digits: codes below hex 10 (e.g. TAB) would otherwise become a
    # one-digit token that the final decoding step cannot turn back
    # into a character.
    $line =~ s/(.)/sprintf("%%%02X", ord($1))/seg;

    if ($charset eq "finmarc") {

        # Data encoded with the FINMARC character set (ISO-6937/2)
        # is mapped, as best as can be done, to Latin-1.  Characters
        # that expand to two characters go first, then the one-to-one
        # mappings, and finally whatever could not be mapped is removed.
        for my $sequence (keys %iso6937_to_latin1_doubles) {
            $line =~ s/\Q$sequence\E/$iso6937_to_latin1_doubles{$sequence}/g;
        }
        $line = _map_sequences($line, \%iso6937_to_latin1);
        $line = _strip_sequences($line, @orphan_chars_iso6937);
    }
    elsif ($charset eq "marc8" || $charset eq "marc8-fin") {

        # Data encoded with the MARC-8 character set is mapped, as
        # best as can be done, to Latin-1.  First, combined MARC-8
        # characters that don't correspond to any Latin-1 character
        # are removed.  Then combined characters that do correspond
        # are replaced with the appropriate values.  This process is
        # then repeated for single values.
        $line = _strip_sequences($line, @orphan_chars_combined);
        $line = _map_sequences($line, \%marc8_to_latin1_combined);
        $line = _strip_sequences($line, @orphan_chars_single);
        $line = _map_sequences($line, \%marc8_to_latin1_single);

        # MARC8-fin records get a few additional C1 control
        # characters translated into graphic characters.
        if ($charset eq "marc8-fin") {
            $line = _map_sequences($line, \%marc8_fin_to_latin1_single);
        }
    }
    elsif ($charset eq "strip") {

        # Drop every token whose first hex digit is 8-F, i.e. every
        # high-order character.
        $line =~ s/%[A-F8-9][A-F0-9]//g;
    }

    # Turn the remaining "%XX" tokens back into characters.
    $line =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/eg;

    return $line;
}

# A file loaded with require must end with a true value.
1;