1: #!/usr/bin/perl
2:
3: use strict;
4: use utf8;
5: use warnings;
6:
7: use File::Basename;
8: use Try::Tiny;
9:
10: use lib dirname(__FILE__);
11:
12: use pre_xml;
13: use html_to_xml;
14: use post_xml;
15:
16:
# Everything printed on STDOUT is written as UTF-8.
binmode(STDOUT, ':encoding(UTF-8)');

# Exactly one argument is expected: the file or directory to convert.
if (scalar(@ARGV) != 1) {
    print STDERR "Usage: perl clean_xml.pl file|directory\n";
    exit(1);
}

# Find the command-line argument encoding (the codeset reported by langinfo)
# and decode the arguments from it into Perl character strings.
use I18N::Langinfo qw(langinfo CODESET);
my $codeset = langinfo(CODESET);
use Encode qw(decode);
@ARGV = map { decode $codeset, $_ } @ARGV;

my $pathname = "$ARGV[0]";
if (-d "$pathname") {
    # Drop trailing slashes so that the paths built by convert_dir do not
    # contain '//'. A path made only of slashes (the root directory) is left
    # alone, because stripping it would produce an empty path.
    $pathname =~ s{/+$}{} if $pathname =~ m{[^/]};

    my $start = time();
    my ($converted, $failures) = convert_dir($pathname);
    my $end = time();

    # Report the elapsed wall-clock time as whole minutes plus remaining seconds.
    my $elapsed = $end - $start;
    my $minutes = int($elapsed / 60);
    my $seconds = $elapsed - ($minutes*60);
    print "\n".scalar(@$converted)." files were converted in $minutes minutes $seconds seconds\n";

    # List the files for which convert_file died.
    if (scalar(@$failures) > 0) {
        print "\n".scalar(@$failures)." files need a manual fix:\n";
        foreach my $failure (@$failures) {
            print "  $failure\n";
        }
    }
} elsif (-f $pathname) {
    # A single file: an error from convert_file is not caught here.
    convert_file($pathname);
} else {
    # Neither a directory nor a regular file (for instance a path that does
    # not exist): report it instead of silently exiting with success.
    print STDERR "$pathname is not a file or a directory\n";
    exit(1);
}
49:
# Converts a directory recursively, selecting only non-version
# .problem/.exam/.survey/.html/.htm/.library files.
#
# Parameter:
#   $dirpath - path of the directory to scan; entry names are appended to it
#              with a '/' separator.
# Returns a list of two array references: the pathnames of the files that were
# converted, and the pathnames of the files that could not be converted.
# Dies if the directory, or any subdirectory reached by the recursion, cannot
# be opened.
#
# NOTE(review): readdir returns raw bytes while $dirpath may be a decoded
# character string (the script decodes @ARGV before calling this); names with
# non-ASCII characters may not survive the concatenation - confirm.
sub convert_dir {
    my ($dirpath) = @_;

    # Read the whole listing before converting anything: convert_file is
    # documented as creating a .xml file in the same directory, and whether
    # readdir reports entries added during a scan is unspecified. Closing the
    # handle here also avoids keeping one open per directory level while
    # recursing. Sorting makes the processing order deterministic.
    opendir(my $dh, $dirpath) or die "Cannot open directory $dirpath: $!";
    my @entries = sort grep { !m/^\./ } readdir($dh); # ignore entries starting with a period
    closedir($dh);

    my @converted = ();
    my @failures = ();
    foreach my $entry (@entries) {
        my $pathname = $dirpath.'/'.$entry;
        if (-d $pathname) {
            my ($new_converted, $new_failures) = convert_dir($pathname);
            push(@converted, @$new_converted);
            push(@failures, @$new_failures);
        } elsif (-f $pathname) {
            # keep only files ending in .problem, .exam, .survey, .html, .htm or .library
            next unless $pathname =~ /\.(?:problem|exam|survey|html?|library)$/i;
            # skip version files (.number.extension); case-insensitive like the test above
            next if $pathname =~ /\.[0-9]+\.[a-z]+$/i;
            try {
                convert_file($pathname);
                push(@converted, $pathname);
            } catch {
                print "$_\n"; # continue processing even if a file cannot be converted
                push(@failures, $pathname);
            };
        }
    }
    return((\@converted, \@failures));
}
83:
# Converts a single file by running it through three steps in order:
# pre_xml::pre_xml, html_to_xml::html_to_xml and post_xml::post_xml.
#
# Parameter:
#   $pathname - path of the file to convert.
# The output path is the input path with '.xml' appended; it is handed to
# post_xml::post_xml (presumably the step that writes the file - confirm).
# Dies with a message naming the failing step and the file if a step dies.
sub convert_file {
    my ($pathname) = @_;

    # the result goes next to the source file
    my $xml_path = "$pathname.xml";

    print "converting $pathname...\n";

    # Each step gets its own try/catch so the error names the step that failed.
    my $pre_ref = try { pre_xml::pre_xml($pathname) }
        catch { die "pre_xml error for $pathname: $_" };

    my $xml_ref = try { html_to_xml::html_to_xml($pre_ref) }
        catch { die "html_to_xml error for $pathname: $_" };

    try { post_xml::post_xml($xml_ref, $xml_path) }
        catch { die "post_xml error for $pathname: $_" };
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>