File:
[LON-CAPA] /
loncom /
localize /
transliterate.pm
Revision
1.2:
download - view:
text,
annotated -
select for diffs
Sat Mar 2 23:08:51 2019 UTC (5 years, 8 months ago) by
raeburn
Branches:
MAIN
CVS tags:
version_2_12_X,
version_2_11_X,
version_2_11_5_msu,
version_2_11_5,
version_2_11_4_uiuc,
version_2_11_4_msu,
version_2_11_4,
version_2_11_3_uiuc,
version_2_11_3_msu,
version_2_11_3,
HEAD
- Bug 6792
Normalize unicode representations before transliteration.
# The LearningOnline Network with CAPA
# Transliteration to ascii
#
# $Id: transliterate.pm,v 1.2 2019/03/02 23:08:51 raeburn Exp $
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
######################################################################
######################################################################
=pod

=head1 NAME

LONCAPA::transliterate - transliterate non-ASCII characters
in filenames.

=head1 SYNOPSIS

When fed a filename, it will replace instances of non-ASCII
characters with transliterations.

=head1 OVERVIEW

Used to replace non-ASCII character(s) with a transliteration
of the character(s) to ASCII character(s).

If there are preferred replacements for a particular language,
then those should be included in a separate subroutine which
is called before the transliteration of last resort (which is
done with Text::Unidecode).

=head1 SUBROUTINES

=cut

package LONCAPA::transliterate;
use strict;
use utf8;
use Text::Unidecode qw(unidecode);
use Encode qw(decode_utf8 encode_utf8);
use Unicode::Normalize qw(normalize);

=pod

=over

=item * fname_to_ascii()

Inputs: $fname (required), $language (optional)

Output: $fname

Replaces non-ASCII characters with a transliteration
of the character to an ASCII character (using Text::Unidecode).

If the language code is de, transliteration via
german_to_ascii() is used first to handle umlauts and eszett,
before using Text::Unidecode.

If other routines are added to support preferred transliteration
of non-ASCII characters for specific languages, they should be
added as new subroutines to this file, and then called if the
language code has an appropriate value.

=back

=cut

# Transliterate a filename to plain ASCII.
#
# Inputs:  $fname    - filename (required). It is passed through
#                      decode_utf8(), so it is expected to hold UTF-8
#                      encoded octets rather than decoded characters.
#          $language - language code (optional). Only 'de' is
#                      special-cased at present.
# Output:  the transliterated name; $fname is returned untouched when it
#          is undefined or contains no character above 0x7f.
sub fname_to_ascii {
    my ($fname,$language) = @_;

    # Nothing to do for undefined or ASCII-only names.
    return $fname unless (defined($fname) && $fname =~ /[^\x{00}-\x{7f}]/);

    $fname = decode_utf8($fname);

    # Decompose, then recompose, so that an accented letter is represented
    # by a single precomposed character (where one exists) before the
    # language-specific replacement tables are applied.
    $fname = normalize('D',$fname);
    $fname = normalize('C',$fname);

    # $language is optional: test definedness first so that omitting it
    # does not raise an "uninitialized value" warning.
    if (defined($language) && $language eq 'de') {
        $fname = german_to_ascii($fname);
    }

    # Transliteration of last resort for anything still non-ASCII.
    $fname = unidecode($fname);

    return encode_utf8($fname);
}

=pod

=over

=item * german_to_ascii()

Input: $fname (required)

Output: $fname

Replaces letters with umlauts with the equivalent letter
without an umlaut plus letter e. Case is preserved.
Replaces eszett with double s.

=back

=cut

# Replace German umlauts and eszett with their conventional ASCII
# spellings (umlaut => base letter + e, case preserved; eszett => ss).
# All other characters are left as they are.
#
# Input:  filename as a string of characters
# Output: the filename with the replacements applied
sub german_to_ascii {
    my $name = shift;

    # ASCII stand-in for each German character handled here.
    my %ascii_for = (
        'ä' => 'ae', 'ö' => 'oe', 'ü' => 'ue',
        'Ä' => 'AE', 'Ö' => 'OE', 'Ü' => 'UE',
        'ß' => 'ss',
    );

    $name =~ s/([ÄäÖöÜüß])/$ascii_for{$1}/g;

    return $name;
}
1;
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>