Annotation of loncom/localize/transliterate.pm, revision 1.2
1.1 raeburn 1: # The LearningOnline Network with CAPA
2: # Transliteration to ascii
3: #
1.2 ! raeburn 4: # $Id: transliterate.pm,v 1.1 2019/02/26 14:42:22 raeburn Exp $
1.1 raeburn 5: #
6: # Copyright Michigan State University Board of Trustees
7: #
8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
9: #
10: # LON-CAPA is free software; you can redistribute it and/or modify
11: # it under the terms of the GNU General Public License as published by
12: # the Free Software Foundation; either version 2 of the License, or
13: # (at your option) any later version.
14: #
15: # LON-CAPA is distributed in the hope that it will be useful,
16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18: # GNU General Public License for more details.
19: #
20: # You should have received a copy of the GNU General Public License
21: # along with LON-CAPA; if not, write to the Free Software
22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23: #
24: # /home/httpd/html/adm/gpl.txt
25: #
26: # http://www.lon-capa.org/
27: #
28: ######################################################################
29: ######################################################################
30:
31: =pod
32:
33: =head1 NAME
34:
35: LONCAPA::transliterate - transliterate non-ascii characters
36: in filenames.
37:
38: =head1 SYNOPSIS
39:
40: When fed a filename it will replace instances of non-ascii
41: characters with transliterations.
42:
43: =head1 OVERVIEW
44:
45: Used to replace non-ascii character(s) with a transliteration
46: of the character(s) to ascii character(s).
47:
48: If there are preferred replacements for a particular language
49: then those should be included in a separate subroutine which
50: is called before the transliteration of last resort (which is
51: done with Text::Unidecode).
52:
53: =head1 SUBROUTINES
54:
55: =cut
56:
57: package LONCAPA::transliterate;
58:
59: use strict;
60: use utf8;
61: use Text::Unidecode qw(unidecode);
62: use Encode qw(decode_utf8 encode_utf8);
1.2 ! raeburn 63: use Unicode::Normalize qw(normalize);
1.1 raeburn 64:
65: =pod
66:
67: =over
68:
69: =item * fname_to_ascii()
70:
71: Inputs: $fname (required), $language (optional)
72:
73: Output: $fname
74:
75: Replaces non-ascii characters with a transliteration
76: of the character to an ascii character (using Text::Unidecode)
77:
78: If the language code is de, transliteration via
79: german_to_ascii() is used first to handle umlauts and eszett,
80: before using Text::Unidecode.
81:
82: If other routines are added to support preferred transliteration
83: of non-ascii characters for specific languages, they should be
84: added as new subroutines to this file, and then called if the
85: language code has an appropriate value.
86:
87: =back
88:
89: =cut
90:
# Transliterate a filename to plain ASCII.
#
# Inputs:  $fname    - filename (required); decoded with decode_utf8(),
#                      so it is expected to arrive as UTF-8 octets.
#          $language - language code (optional); 'de' applies the
#                      replacements in german_to_ascii() before the
#                      generic Text::Unidecode transliteration.
#
# Output:  the transliterated filename. A name containing only ASCII
#          characters (or an undefined name) is returned unchanged.
sub fname_to_ascii {
    my ($fname,$language) = @_;

    # Nothing to do unless at least one non-ASCII character is present.
    return $fname unless (defined($fname) && $fname =~ /[^\x{00}-\x{7f}]/);

    # A string holding code points above 0xFF cannot be UTF-8 octets; it is
    # already a character string, and decode_utf8() cannot decode it.
    # Only decode when every element fits in a byte.
    unless ($fname =~ /[^\x{00}-\x{ff}]/) {
        $fname = decode_utf8($fname);
    }

    # NFC decomposes canonically and then recomposes, so a single pass
    # yields the precomposed letters german_to_ascii() matches against
    # (a separate NFD pass beforehand would not change the result).
    $fname = normalize('C',$fname);

    # $language is optional, so guard against comparing undef.
    if (defined($language) && $language eq 'de') {
        $fname = &german_to_ascii($fname);
    }

    # Transliteration of last resort for whatever is still non-ASCII.
    $fname = unidecode($fname);

    return encode_utf8($fname);
}
105:
106: =pod
107:
108: =over
109:
110: =item * german_to_ascii()
111:
112: Input: $fname (required)
113:
114: Output: $fname
115:
116: Replaces each letter carrying an umlaut with its base letter followed by e
117: (a-umlaut becomes ae, A-umlaut becomes AE). Case is preserved.
118:
119: Replaces eszett with double s.
120:
121: =back
122:
123: =cut
124:
# Apply the preferred German transliterations to a filename.
#
# Input:   $fname - filename as a decoded character string; the letters
#                   are matched in their precomposed form.
#
# Output:  $fname with each umlauted vowel replaced by the base vowel
#          plus e (same case as the original letter), and eszett replaced
#          by a double s. All other characters are left untouched.
sub german_to_ascii {
    my ($fname) = @_;
    my %ascii_for = (
        'Ä' => 'AE',
        'Ö' => 'OE',
        'Ü' => 'UE',
        'ä' => 'ae',
        'ö' => 'oe',
        'ü' => 'ue',
        'ß' => 'ss',
        # Capital eszett (U+1E9E) follows the same upper-case convention
        # as the capital umlauts above.
        'ẞ' => 'SS',
    );
    $fname =~ s/([ÄäÖöÜüßẞ])/$ascii_for{$1}/g;
    return $fname;
}
139:
140: 1;
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>