# File:  [LON-CAPA] / loncom / localize / transliterate.pm
# Revision 1.2: download - view: text, annotated - select for diffs
# Sat Mar 2 23:08:51 2019 UTC (5 years, 9 months ago) by raeburn
# Branches: MAIN
# CVS tags: version_2_12_X, version_2_11_X, version_2_11_5_msu, version_2_11_5, version_2_11_4_uiuc, version_2_11_4_msu, version_2_11_4, version_2_11_3_uiuc, version_2_11_3_msu, version_2_11_3, HEAD
# - Bug 6792
#   Normalize unicode representations before transliteration.

# The LearningOnline Network with CAPA
# Transliteration to ascii
#
# $Id: transliterate.pm,v 1.2 2019/03/02 23:08:51 raeburn Exp $
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
######################################################################
######################################################################

=pod

=head1 NAME

LONCAPA::transliterate - transliterate non-ascii characters
in filenames.

=head1 SYNOPSIS

When fed a filename it will replace instances of non-ascii
characters with transliterations.

=head1 OVERVIEW

Used to replace non-ascii character(s) with a transliteration
of the character(s) to ascii character(s).

If there are preferred replacements for a particular language
then those should be included in a separate subroutine which
is called before the transliteration of last resort (which is
done with Text::Unidecode).

=head1 SUBROUTINES

=cut

package LONCAPA::transliterate;

use strict;
use utf8;
use Text::Unidecode qw(unidecode);
use Encode qw(decode_utf8 encode_utf8);
use Unicode::Normalize qw(normalize);

=pod

=over

=item * fname_to_ascii()

Inputs: $fname (required), $language (optional)

Output: $fname

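Decodes the filename from UTF-8, normalizes it to Unicode
Normalization Form C, and then replaces each non-ASCII character
with an ASCII transliteration (using Text::Unidecode). A filename
which contains only ASCII characters is returned unchanged.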

If the language code is de, transliteration via 
german_to_ascii() is used first to handle umlauts and eszett,
before using Text::Unidecode.

If other routines are added to support preferred transliteration
of non-ascii characters for specific languages, they should be
added as new subroutines to this file, and then called if the
language code has an appropriate value.

=back

=cut

sub fname_to_ascii {
    # Transliterate a filename to plain ASCII.
    #
    # Inputs:  $fname    - filename, normally supplied as UTF-8 encoded
    #                      bytes (it is passed through decode_utf8()).
    #          $language - optional language code; 'de' applies the
    #                      replacements in german_to_ascii() before the
    #                      generic Text::Unidecode transliteration.
    # Output:  the transliterated filename. A name which is undefined or
    #          contains only ASCII characters is returned unchanged.
    my ($fname,$language) = @_;
    return $fname unless (defined($fname));
    if ($fname =~ /[^\x{00}-\x{7f}]/) {
        # A code point above 0xFF can not be a UTF-8 octet, so a string
        # containing one has already been decoded to characters, and
        # decode_utf8() would croak on it. Decode only byte-range strings.
        unless ($fname =~ /[^\x{00}-\x{ff}]/) {
            $fname = decode_utf8($fname);
        }
        # NFC is canonical decomposition followed by canonical
        # composition, so a separate NFD pass beforehand is not needed.
        $fname = normalize('C',$fname);
        # $language is optional; check it is defined so that no
        # "uninitialized value" warning is emitted when warnings are on.
        if ((defined($language)) && ($language eq 'de')) {
            $fname = &german_to_ascii($fname);
        }
        # Transliteration of last resort for whatever is still non-ASCII.
        $fname = unidecode($fname);
        $fname = encode_utf8($fname);
    }
    return $fname;
}

=pod 

=over

=item * german_to_ascii()

Input: $fname (required)

Output: $fname

Replaces each letter carrying an umlaut with its base letter
followed by the letter e, in the same case as the original letter
(e.g., lower case a-umlaut becomes ae, upper case A-umlaut becomes AE).

Replaces eszett with double s.

=back

=cut

sub german_to_ascii {
    # Apply the preferred German spellings: each umlauted vowel becomes
    # its base vowel followed by e (in the same case as the original),
    # and eszett becomes ss. All other characters are left untouched.
    #
    # Input:  $fname - filename as a character string
    # Output: $fname with the replacements applied
    my $fname = shift;
    my $german_special = qr/([ÄäÖöÜüß])/;
    my %ascii_for = (
        'ä' => 'ae', 'Ä' => 'AE',
        'ö' => 'oe', 'Ö' => 'OE',
        'ü' => 'ue', 'Ü' => 'UE',
        'ß' => 'ss',
    );
    # Every character the pattern can capture has an entry in %ascii_for.
    $fname =~ s{$german_special}{$ascii_for{$1}}g;
    return $fname;
}

1;

# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>