Michael Doran Home Page
Contact | Site Map | Search  
This page is deprecated: please read archives disclaimer.

MARC to Latin

a charset conversion routine in Perl

Introduction | Source Code

Download: MARCtoLatin.pl

########################################################################
#
#  MARCtoLatin.pl
#
#  Version: 1.1
#
#  Created by Michael Doran, doran@uta.edu
#
#  University of Texas at Arlington Libraries
#  Box 19497, Arlington, TX 76019, USA
#
#  This function was originally written for newbooks.pl, a component
#  of the New Books List, an unofficial add-on for the Voyager 
#  integrated library management system.  For more information on
#  the New Books List, see http://rocky.uta.edu/doran/autolist/.
#
########################################################################
#
#  Changes:
#
#  1.1 Per Michael Koehn suggestion, added check for presence of
#      high-order bytes before attempting to do any translation
#      and simply return line if none found. 
#
########################################################################

########################################################################
#
#  Using the function
#
#  To use this function, select the desired character set below and
#  then add this line to your program: require "MARCtoLatin.pl";
#
########################################################################

########################################################################
#
#  This function takes a string extracted from a MARC record
#  and converts it from its native character set (e.g. MARC-8
#  for USMARC records) to Latin-1 (ISO-8859-1).
#
#  Note: Be aware that not all MARC8 characters can be converted,
#  since there are more combinations available in MARC8 than have
#  Latin 1 equivalents.  Non-spacing characters that can not be
#  converted, are stripped out.  Therefore this is not recommended
#  for programs/projects that will eventually return the data to
#  a library database.  The original intention was a one-way
#  conversion to make MARC data palatable for display on the web.
#
#  Most USA/Great Britain/Australia/New Zealand sites will 
#  want to select the marc8 option. 
#
#  Finnish sites will have a choice of marc8-fin or finmarc
#  depending on the type of bib records used.
#
#  The ASCII option removes all the non-spacing MARC-8
#  characters: you won't get any nonsense characters, nor
#  will you end up with any diacritics or special characters.
#  There may be valid reasons to choose this option related
#  to sorting and/or keyword searching. 
#
#  If your records have been converted to Unicode you probably
#  don't want any conversion done.
#
#  If your		   choose           to end
#  catalog uses  	 this option	    up with
#  -------------    	 -----------	    -------
#  MARC 21 records    ->  "marc8"	->  Latin-1
#  MARC 21 w/ Finnish ->  "marc8-fin"   ->  Latin-1
#  FINMARC records    ->  "finmarc"	->  Latin-1
#  (any record type)  ->  "strip"	->   ASCII
#  ** Unicode **      ->  "passthru"	-> No change
#
#  Choices are marc8, marc8-fin, finmarc, or strip.

my $character_set = "marc8";

#  Within your program, you would pass the function a
#  string that you would like to be converted. 
#  Something like this...
#  	my $ConvertedAuthor = &CharConv($author);

sub CharConv {

  my ($line) = @_;

  if ($character_set eq "passthru") {
      return($line);
  }

  # Check for high-order characters ( > hex 7F).  If none found,
  # no conversion needed.  (Added by MK, 2005)
  if ($line !~ /[\x80-\xff]/) {
      return($line);
  }

  # Convert input string to hexadecimal
  $line =~ s/(.)/sprintf ("%%%X", ord($1))/eg;

  if ($character_set eq "finmarc") {
    # This subroutine takes data encoded with the FINMARC
    # character set (ISO-6937/2) and (as best as can be done)
    # maps it to the ISO 8859-1 (Latin-1) character set.

    my %iso6937_to_latin1_doubles = (
	'%E6' => '%49%4A', '%EF' => '%27%6E', '%F6' => '%69%6A',
    );

    my %iso6937_to_latin1 = (
	'%A4'    => '24', '%C8%20' => 'A8', '%D3'    => 'A9',
	'%E3'    => 'AA', '%D2'    => 'AE', '%C2%20' => 'B4',
	'%CB%20' => 'B8', '%D1'    => 'B9', '%EB'    => 'BA',
	'%C1%41' => 'C0', '%C2%41' => 'C1', '%C3%41' => 'C2',
	'%C4%41' => 'C3', '%C8%41' => 'C4', '%CA%41' => 'C5',
	'%E1'    => 'C6', '%CB%43' => 'C7', '%C1%45' => 'C8',
	'%C2%45' => 'C9', '%C3%45' => 'CA', '%C8%45' => 'CB',
	'%C1%49' => 'CC', '%C2%49' => 'CD', '%C3%49' => 'CE',
	'%C8%49' => 'CF', '%E2'    => 'D0', '%C4%4E' => 'D1',
	'%C1%4F' => 'D2', '%C2%4F' => 'D3', '%C3%4F' => 'D4',
	'%C4%4F' => 'D5', '%C8%4F' => 'D6', '%B4'    => 'D7',
	'%E9'    => 'D8', '%C1%55' => 'D9', '%C2%55' => 'DA',
	'%C3%55' => 'DB', '%C8%55' => 'DC', '%C2%59' => 'DD',
	'%EC'    => 'DE', '%FB'    => 'DF', '%C1%61' => 'E0',
	'%C2%61' => 'E1', '%C3%61' => 'E2', '%C4%61' => 'E3',
	'%C8%61' => 'E4', '%CA%61' => 'E5', '%F1'    => 'E6',
	'%CB%63' => 'E7', '%C1%65' => 'E8', '%C2%65' => 'E9',
	'%C3%65' => 'EA', '%C8%65' => 'EB', '%C1%69' => 'EC',
	'%C2%69' => 'ED', '%C3%69' => 'EE', '%C8%69' => 'EF',
	'%F3'    => 'F0', '%C4%6E' => 'F1', '%C1%6F' => 'F2',
	'%C2%6F' => 'F3', '%C3%6F' => 'F4', '%C4%6F' => 'F5',
	'%C8%6F' => 'F6', '%B8'    => 'F7', '%F9'    => 'F8',
	'%C1%75' => 'F9', '%C2%75' => 'FA', '%C3%75' => 'FB',
	'%C8%75' => 'FC', '%C2%79' => 'FD', '%FC'    => 'FE',
	'%C8%79' => 'FF', '%A9'    => '27', '%AA'    => '22', 
	'%B9'    => '27', '%BA'    => '22', '%E4'    => '48', 
	'%E7'    => '4C', '%E8'    => '4C', '%ED'    => '54', 
	'%EE'    => '4E', '%F2'    => '64', '%F4'    => '68', 
	'%F5'    => '69', '%F7'    => '6C', '%F8'    => '6C', 
	'%FD'    => '74', '%EE'    => '6E'
    );

    my @orphan_chars_iso6937 = (
	'A6', 'AC', 'AD', 'AF', 'C1', 'C2', 'C3', 'C4', 'C5',
	'C6', 'C7', 'C8', 'CA', 'CB', 'CC', 'CD', 'CE', 'CF',
	'D4', 'D5', 'DC', 'DD', 'DE', 'DF', 'E0'
	);

    foreach my $finmarc_char (keys (%iso6937_to_latin1_doubles)) {
	$line =~ s/$finmarc_char/$iso6937_to_latin1_doubles{$finmarc_char}/eg;
    }
    foreach my $finmarc_char2 (keys (%iso6937_to_latin1)) {
	$line =~ s/$finmarc_char2/pack("C", hex($iso6937_to_latin1{$finmarc_char2}))/eg;
    }
    foreach my $orphan (@orphan_chars_iso6937) {
	$line =~ s/$orphan//g;
    }

  } elsif ($character_set eq "marc8" ||
	   $character_set eq "marc8-fin") {

    # This subroutine takes data encoded with the MARC-8
    # character set and (as best as can be done) maps it
    # to the ISO 8859-1 (Latin-1) character set.

    # First, combined MARC-8 characters that don't 
    # correspond to any Latin-1 characters are removed. 
    # Then combined MARC characters that correspond to 
    # extended Latin-1 characters are replaced with the 
    # appropriate values. This process is then repeated
    # for single values.

    # MARC8-fin records get a few additional C1 control
    # characters translated into graphic characters.

    my @orphan_chars_combined = (
	'%1B%67%61%1B%73', '%1B%67%62%1B%73', '%1B%67%63%1B%73', 
	'%1B%62%30%1B%73', '%1B%62%31%1B%73', '%1B%62%32%1B%73',
	'%1B%62%33%1B%73', '%1B%62%34%1B%73', '%1B%62%35%1B%73', 
	'%1B%62%36%1B%73', '%1B%62%37%1B%73', '%1B%62%38%1B%73',
	'%1B%62%39%1B%73', '%1B%62%28%1B%73', '%1B%62%2B%1B%73', 
	'%1B%62%29%1B%73', '%1B%70%30%1B%73', '%1B%70%31%1B%73',
	'%1B%70%32%1B%73', '%1B%70%33%1B%73', '%1B%70%34%1B%73', 
	'%1B%70%35%1B%73', '%1B%70%36%1B%73', '%1B%70%37%1B%73',
	'%1B%70%38%1B%73', '%1B%70%39%1B%73', '%1B%70%28%1B%73', 
	'%1B%70%2D%1B%73', '%1B%70%2B%1B%73', '%1B%70%29%1B%73',
	'%E0%E6', '%E1%E3', '%E1%E5', '%E1%E6', '%E1%E8', 
	'%E2%A5', '%E2%B5', '%E2%E4', '%E2%E5', '%E2%E6', 
	'%E2%E8', '%E2%EA', '%E2%F0', '%E3%E0', '%E3%E1', 
	'%E3%E2', '%E3%F2', '%E4%E3', '%E4%E6', '%E5%E4',
	'%E5%E7', '%E5%E8', '%E5%A5', '%E5%B5', '%E5%F1', 
	'%E5%F2', '%E6%F0', '%E6%F2', '%E7%E2', '%E7%F2', 
	'%E8%E4', '%E8%E5', '%E9%E8'
	);

    my @orphan_chars_single = (
	'%A1','%A3','%A6','%A7','%A9','%AC','%AD','%AE',
	'%AF','%B0','%B1','%B3','%B6','%B7','%B8','%BB',
	'%BC','%BD','%BE','%BF','%C1','%C2','%C4','%E0',
	'%E5','%E6','%E7','%E9','%EB','%EC','%ED','%EE',
	'%EF','%F1','%F2','%F3','%F4','%F5','%F6','%F7',
	'%F8','%F9','%FA','%FB','%FC','%FD','%FE'
	);

    my %marc8_to_latin1_combined = (
	'%1B%70%32%1B%73' => 'B2', '%1B%70%33%1B%73' => 'B3',
	'%1B%70%31%1B%73' => 'B9', '%E1%41' => 'C0',
	'%E2%41' => 'C1', '%E3%41' => 'C2', '%E4%41' => 'C3', 
	'%E8%41' => 'C4', '%EA%41' => 'C5', '%E2%43' => '43',
	'%E3%43' => '43', '%F0%43' => 'C7', '%E1%45' => 'C8', 
	'%E2%45' => 'C9', '%E3%45' => 'CA', '%E4%45' => '45',
	'%E8%45' => 'CB', '%F0%45' => '45', '%E2%47' => '47', 
	'%E3%47' => '47', '%F0%47' => '47', '%E3%48' => '48',
	'%E8%48' => '48', '%F0%48' => '48', '%E1%49' => 'CC', 
	'%E2%49' => 'CD', '%E3%49' => 'CE', '%E4%49' => '49',
	'%E8%49' => 'CF', '%E3%4A' => '4A', '%E2%4B' => '4B', 
	'%E3%4B' => '4B', '%F0%4B' => '4B', '%F2%4B' => '4B',
        '%E2%4C' => '4C', '%E3%4C' => '4C', '%F0%4C' => '4C', 
	'%E2%4D' => '4D', '%E1%4E' => '4E', '%E2%4E' => '4E',
	'%E4%4E' => 'D1', '%F0%4E' => '4E', '%E1%4F' => 'D2', 
	'%E2%4F' => 'D3', '%E3%4F' => 'D4', '%E4%4F' => 'D5',
	'%E8%4F' => 'D6', '%E2%50' => '50', '%E2%52' => '52', 
	'%E2%53' => '53', '%E3%53' => '53', '%F0%53' => '53',
	'%F0%54' => '54', '%E1%55' => 'D9', '%E2%55' => 'DA', 
	'%E3%55' => 'DB', '%E4%55' => '55', '%E8%55' => 'DC',
	'%EA%55' => '55', '%E4%56' => '56', '%E1%57' => '57', 
	'%E2%57' => '57', '%E3%57' => '57', '%E8%57' => '57',
	'%E8%58' => '58', '%E1%59' => '59', '%E2%59' => 'DD', 
	'%E3%59' => '59', '%E4%59' => '59', '%E8%59' => '59',
	'%E2%5A' => '5A', '%E3%5A' => '5A', '%E1%61' => 'E0', 
	'%E2%61' => 'E1', '%E3%61' => 'E2', '%E4%61' => 'E3',
	'%E8%61' => 'E4', '%EA%61' => 'E5', '%E2%63' => '63', 
	'%E3%63' => '63', '%F0%63' => 'E7', '%E1%65' => 'E8',
	'%E2%65' => 'E9', '%E3%65' => 'EA', '%E4%65' => '65', 
	'%E8%65' => 'EB', '%F0%65' => '65', '%E2%67' => '67',
	'%E3%67' => '67', '%F0%67' => '67', '%E3%68' => '68', 
	'%E8%68' => '68', '%F0%68' => '68', '%E1%69' => 'EC',
	'%E2%69' => 'ED', '%E3%69' => 'EE', '%E4%69' => '69', 
	'%E8%69' => 'EF', '%E3%6A' => '6A', '%E2%6B' => '6B',
        '%E3%6B' => '6B', '%F0%6B' => '6B', '%F2%6B' => '6B', 
	'%E2%6C' => '6C', '%E3%6C' => '6C', '%F0%6C' => '6C',
        '%E2%6D' => '6D', '%E1%6E' => '6E', '%E2%6E' => '6E', 
	'%E4%6E' => 'F1', '%F0%6E' => '6E', '%E1%6F' => 'F2',
	'%E2%6F' => 'F3', '%E3%6F' => 'F4', '%E4%6F' => 'F5', 
	'%E8%6F' => 'F6', '%E2%70' => '70', '%E2%72' => '72',
	'%E2%73' => '73', '%E3%73' => '73', '%F0%73' => '73', 
	'%E8%74' => '74', '%F0%74' => '74', '%E1%75' => 'F9',
	'%E2%75' => 'FA', '%E3%75' => 'FB', '%E4%75' => '75', 
	'%E8%75' => 'FC', '%EA%75' => '75', '%E4%76' => '76',
	'%E1%77' => '77', '%E2%77' => '77', '%E3%77' => '77', 
	'%E8%77' => '77', '%EA%77' => '77', '%E8%78' => '78',
	'%E1%79' => '79', '%E2%79' => 'FD', '%E3%79' => '79', 
	'%E4%79' => '79', '%E8%79' => '79', '%EA%79' => '79',
	'%E8%79' => 'FF', '%E2%7A' => '7A', '%E3%7A' => '7A', 
	'%E2%A2' => '4F', '%E1%AC' => '4F', '%E2%AC' => '4F',
	'%E4%AC' => '4F', '%E1%AD' => '55', '%E2%AD' => '55', 
	'%E4%AD' => '55', '%E2%B2' => '6F', '%E1%BC' => '6F',
	'%E2%BC' => '6F', '%E4%BC' => '6F', '%E1%BD' => '75', 
	'%E2%BD' => '75', '%E4%BD' => '75'
	);

    my %marc8_to_latin1_single = (
	'%A2'	=> 'D8', '%A4'	=> 'DE', '%A5'	=> 'C6', 
	'%A8'	=> 'B7', '%AA'	=> 'AE', '%AB'	=> 'B1',
	'%B2'	=> 'F8', '%B4'	=> 'FE', '%B5'	=> 'E6', 
	'%B9'	=> 'A3', '%BA'	=> 'F0', '%C0'	=> 'B0',
	'%C3'	=> 'A9', '%C5'	=> 'BF', '%C6'	=> 'A1'
	);

    my %marc8_fin_to_latin1_single = (
	'%83'	=> 'C5', '%84'	=> 'C4', '%85'	=> 'D6', 
	'%86'	=> 'E5', '%87'	=> 'E4', '%88'	=> 'F6'
	);

    foreach my $char1 (@orphan_chars_combined) {
	$line =~ s/$char1//g;
    }
    foreach my $marc_char1 (keys (%marc8_to_latin1_combined)) {
	$line =~ s/$marc_char1/pack("C", hex($marc8_to_latin1_combined{$marc_char1}))/eg;
    }
    foreach my $char2 (@orphan_chars_single) {
	$line =~ s/$char2//g;
    }
    foreach my $marc_char2 (keys (%marc8_to_latin1_single)) {
	$line =~ s/$marc_char2/pack("C", hex($marc8_to_latin1_single{$marc_char2}))/eg;
    }
    if ($character_set eq "marc8-fin") {
	foreach my $marc_char3 (keys (%marc8_fin_to_latin1_single)) {
	    $line =~ s/$marc_char3/pack("C", hex($marc8_fin_to_latin1_single{$marc_char3}))/eg;
	}
    }

  } elsif ($character_set eq "strip") {
      $line =~ s/%[A-F8-9][A-F0-9]//g;
  }

  $line =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/eg;
  return($line);

}

return 1;