Index: lib/TWiki.pm
===================================================================
--- lib/TWiki.pm (revision 14213)
+++ lib/TWiki.pm (working copy)
@@ -219,7 +219,7 @@
ICONURLPATH => \&ICONURLPATH,
IF => \&IF,
INCLUDE => \&INCLUDE,
- INTURLENCODE => \&INTURLENCODE,
+ INTURLENCODE => \&INTURLENCODE_deprecated,
LANGUAGES => \&LANGUAGES,
MAKETEXT => \&MAKETEXT,
META => \&META,
@@ -341,12 +341,13 @@
# Load POSIX for I18N support.
require POSIX;
- import POSIX qw( locale_h LC_CTYPE );
+ import POSIX qw( locale_h LC_CTYPE LC_COLLATE );
# SMELL: mod_perl compatibility note: If TWiki is running under Apache,
# won't this play with the Apache process's locale settings too?
# What effects would this have?
setlocale(&LC_CTYPE, $TWiki::cfg{Site}{Locale});
+ setlocale(&LC_COLLATE, $TWiki::cfg{Site}{Locale});
}
$functionTags{CHARSET} = sub { $TWiki::cfg{Site}{CharSet} ||
@@ -420,8 +421,10 @@
# characters allowed
$regex{emailAddrRegex} = qr/([A-Za-z0-9\.\+\-\_]+\@[A-Za-z0-9\.\-]+)/;
- # Filename regex, for attachments
- $regex{filenameRegex} = qr/[$regex{mixedAlphaNum}\.]+/o;
+ # Filename regex to filter out invalid characters in attachments - allow
+ # I18N characters, spaces, etc
+ $regex{filenameInvalidCharRegex} = qr/[^$regex{mixedAlphaNum}\. _-]/o;
+ # $regex{filenameInvalidCharRegex} = "[^$regex{mixedAlphaNum}\. _-]";
# Multi-character alpha-based regexes
$regex{mixedAlphaNumRegex} = qr/[$regex{mixedAlphaNum}]*/o;
@@ -532,17 +535,18 @@
if ( $TWiki::cfg{Site}{CharSet} =~ /^utf-?8$/i ) {
# warn if using Perl older than 5.8
if( $] < 5.008 ) {
- $this->writeWarning( 'UTF-8 not supported on Perl '.$].
+ $this->writeWarning( 'UTF-8 not remotely supported on Perl '.$].
' - use Perl 5.8 or higher..' );
}
- # SMELL: is this true yet?
+ # We still don't have Codev.UnicodeSupport
$this->writeWarning( 'UTF-8 not yet supported as site charset -'.
'TWiki is likely to have problems' );
return $text;
}
- # Convert into ISO-8859-1 if it is the site charset
+ # Convert into ISO-8859-1 if it is the site charset. This conversion
+ # is *not valid for ISO-8859-15*.
if ( $TWiki::cfg{Site}{CharSet} =~ /^iso-?8859-?1$/i ) {
# ISO-8859-1 maps onto first 256 codepoints of Unicode
# (conversion from 'perldoc perluniintro')
@@ -629,9 +633,9 @@
}
unless( $this->inContext('command_line')) {
- # can't use simple length() in case we have UNICODE
- # see perldoc -f length
- my $len = do { use bytes; length( $text ); };
+ # FIXME: Defer next line until we have Codev.UnicodeSupport - too 5.8 dependent
+ # my $len = do { use bytes; length( $text ); };
+ my $len = length( $text );
$this->writePageHeader( undef, $pageType, $contentType, $len );
}
print $text;
@@ -955,6 +959,50 @@
=pod
+=pod
+
+---++ StaticMethod cleanFilename( $filename ) -> $cleanedFilename
+
+Clean up attachment filename, including only alphanumeric filename
+characters (including I18N characters). Located in TWiki.pm because
+it's hard to get I18N regexes to work in Sandbox.pm.
+
+=cut
+
+sub cleanFilename {
+ my( $filename ) = @_;
+
+ my $this = $TWiki::THIS;
+
+ if ( $TWiki::cfg{UseLocale} ) {
+ # Set environment variables for grep
+ $ENV{LC_CTYPE} = $TWiki::cfg{Site}{Locale};
+
+ require POSIX;
+ import POSIX qw( locale_h LC_CTYPE LC_COLLATE );
+
+ setlocale(&LC_CTYPE, $TWiki::cfg{Site}{Locale});
+ setlocale(&LC_COLLATE, $TWiki::cfg{Site}{Locale});
+ }
+ if ( exists $INC{'locale.pm'} ) {
+ $TWiki::THIS->writeDebug( "locale.pm loaded'");
+ } else {
+ $TWiki::THIS->writeDebug( "locale.pm not loaded'");
+ }
+
+ $this->writeDebug( "Regex applied is '$TWiki::regex{filenameInvalidCharRegex}' ");
+
+ # TODO: Make this configurable to no validation for Chinese/Japanese
+ $filename =~ s/$regex{filenameInvalidCharRegex}//g;
+ return $filename;
+}
+
+
+=pod
+
+
+=pod
+
---++ StaticMethod isValidWebName( $name, $system ) -> $boolean
STATIC Check for a valid web name. If $system is true, then
@@ -1158,8 +1206,14 @@
$this->normalizeWebTopicName( $web, $topic );
my $path = '/'.$web.'/'.$topic;
- $path .= '/'.$attachment if $attachment;
- $url .= urlEncode( $path );
+ if( $attachment ) {
+ $path .= '/'.$attachment;
+ # Attachments are served directly by web server, need to handle
+ # URL encoding specially
+ $url .= urlEncodeAttachment ( $path );
+ } else {
+ $url .= urlEncode( $path );
+ }
}
return $url;
@@ -1264,6 +1318,9 @@
=cut
+# RD: HACK for debugging
+our $THIS;
+
sub new {
my( $class, $login, $query, $initialContext ) = @_;
@@ -1271,6 +1328,8 @@
$query ||= new CGI( {} );
my $this = bless( {}, $class );
+ $THIS = $this; # RD: HACK for debugging
+
$this->{_HTMLHEADERS} = {};
$this->{context} = $initialContext || {};
@@ -1359,9 +1418,9 @@
$web = $TWiki::cfg{UsersWebName} unless $web;
$this->{webName} = TWiki::Sandbox::untaintUnchecked( $web );
- # Convert UTF-8 web and topic name from URL into site charset
- # if necessary - no effect if URL is not in UTF-8
- # handle topic and web names seperately; encoding is not necessarily shared
+ # Convert UTF-8 web and topic name from URL into site charset if necessary
+ # SMELL: merge these two cases, browsers just don't mix two encodings in one URL
+ # - can also simplify into 2 lines by making function return unprocessed text if no conversion
my $webNameTemp = $this->UTF82SiteCharSet( $this->{webName} );
if ( $webNameTemp ) {
$this->{webName} = $webNameTemp;
@@ -1374,6 +1433,7 @@
# Item3270 - here's the appropriate place to enforce TWiki spec:
# All topic name sources are evaluated, site charset applied
+ # SMELL: This untaint unchecked is duplicate of one just above
$this->{topicName} =
TWiki::Sandbox::untaintUnchecked(ucfirst $this->{topicName});
@@ -2185,6 +2245,47 @@
=pod
+---++ StaticMethod urlEncodeAttachment ( $text )
+
+For attachments, URL-encode specially to 'freeze' any characters >127 in the
+site charset (e.g. ISO-8859-1 or KOI8-R), by doing URL encoding into native
+charset ($siteCharset) - used when generating attachment URLs, to enable the
+web server to serve attachments, including images, directly.
+
+This encoding is required to handle the cases of:
+
+ - browsers that generate UTF-8 URLs automatically from site charset URLs - now quite common
+ - web servers that directly serve attachments, using the site charset for
+ filenames, and cannot convert UTF-8 URLs into site charset filenames
+
+The aim is to prevent the browser from converting a site charset URL in the web
+page to a UTF-8 URL, which is the default. Hence we 'freeze' the URL into the
+site character set through URL encoding.
+
+In two cases, no URL encoding is needed: For EBCDIC mainframes, we assume that
+site charset URLs will be translated (outbound and inbound) by the web server to/from an
+EBCDIC character set. For sites running in UTF-8, there's no need for TWiki to
+do anything since all URLs and attachment filenames are already in UTF-8.
+
+=cut
+
+sub urlEncodeAttachment {
+ my( $text ) = @_;
+
+ my $usingEBCDIC = ( 'A' eq chr(193) ); # Only true on EBCDIC mainframes
+
+ if( $TWiki::cfg{Site}{CharSet} eq "utf-8" or $usingEBCDIC ) {
+ # Just let browser do UTF-8 URL encoding
+ return $text;
+ }
+
+ # Freeze into site charset through URL encoding
+ return urlEncode( $text );
+}
+
+
+=pod
+
---++ StaticMethod urlEncode( $string ) -> encoded string
Encode by converting characters that are illegal in URLs to
@@ -2194,18 +2295,22 @@
characters such as = and ?.
RFC 1738, Dec. '94:
->
-...Only alphanumerics [0-9a-zA-Z], the special
-characters $-_.+!*'(), and reserved characters used for their
-reserved purposes may be used unencoded within a URL.
-
+
+ ...Only alphanumerics [0-9a-zA-Z], the special
+ characters $-_.+!*'(), and reserved characters used for their
+ reserved purposes may be used unencoded within a URL.
+
+
Reserved characters are $&+,/:;=?@ - these are _also_ encoded by
this method.
-SMELL: For non-ISO-8859-1 $TWiki::cfg{Site}{CharSet}, need to convert to
-UTF-8 before URL encoding. This encoding only supports 8-bit
-character codes.
+This URL-encoding handles all character encodings including ISO-8859-*,
+KOI8-R, EUC-* and UTF-8.
+This may not handle EBCDIC properly, as it generates an EBCDIC URL-encoded
+URL, but mainframe web servers seem to translate this outbound before it hits browser
+- see CGI::Util::escape for another approach.
+
=cut
sub urlEncode {
@@ -3367,7 +3472,7 @@
# TWiki Feb2003 release - encoding is no longer needed since UTF-URLs are now
# directly supported, but it is provided for backward compatibility with
# skins that may still be using the deprecated %INTURLENCODE%.
-sub INTURLENCODE {
+sub INTURLENCODE_deprecated {
my( $this, $params ) = @_;
# Just strip double quotes, no URL encoding - Mozilla UTF-8 URLs
# directly supported now
Index: lib/TWiki/Attach.pm
===================================================================
--- lib/TWiki/Attach.pm (revision 14213)
+++ lib/TWiki/Attach.pm (working copy)
@@ -275,6 +275,8 @@
my $imgSize = '';
my $prefs = $this->{session}->{prefs};
my $store = $this->{session}->{store};
+ # I18N: URL-encode the attachment filename
+ my $fileURL = TWiki::urlEncodeAttachment( $attName );
if( $attName =~ /\.(gif|jpg|jpeg|png)$/i ) {
# inline image
@@ -296,7 +298,7 @@
$fileLink = $prefs->getPreferencesValue( 'ATTACHEDIMAGEFORMAT' );
unless( $fileLink ) {
- push( @attrs, src=>"%ATTACHURLPATH%/$attName" );
+ push( @attrs, src=>"%ATTACHURLPATH%/$fileURL" );
push( @attrs, alt=>$attName );
return " * $fileComment: ".CGI::br().CGI::img({ @attrs });
}
@@ -304,11 +306,15 @@
# normal attached file
$fileLink = $prefs->getPreferencesValue( 'ATTACHEDFILELINKFORMAT' );
unless( $fileLink ) {
- return " * [[%ATTACHURL%/$attName][$attName]]: $fileComment";
+ return " * [[%ATTACHURL%/$fileURL][$attName]]: $fileComment";
}
}
- $fileLink =~ s/\$name/$attName/g;
+ # I18N: Site specified %ATTACHEDIMAGEFORMAT% or %ATTACHEDFILELINKFORMAT%,
+ # ensure that filename is URL encoded - first $name must be URL.
+ $fileLink =~ s/\$name/$fileURL/;
+ $fileLink =~ s/\$name/$attName/;
+
$fileLink =~ s/\$comment/$fileComment/g;
$fileLink =~ s/\$size/$imgSize/g;
$fileLink =~ s/\\t/\t/go;
@@ -328,8 +334,7 @@
my( $x, $y) = ( 0, 0 );
if( defined( $file ) ) {
- # for crappy MS OSes - Win/Dos/NT use is NOT SUPPORTED
- binmode( $file );
+ binmode( $file ); # For Windows
my $s;
return ( 0, 0 ) unless ( read( $file, $s, 4 ) == 4 );
seek( $file, 0, 0 );
Index: lib/TWiki/Store/RcsFile.pm
===================================================================
--- lib/TWiki/Store/RcsFile.pm (revision 14213)
+++ lib/TWiki/Store/RcsFile.pm (working copy)
@@ -59,18 +59,14 @@
my( $class, $session, $web, $topic, $attachment ) = @_;
my $this = bless( { session => $session }, $class );
- utf8::downgrade( $web ) if( $web && $] >= 5.008 );
$this->{web} = $web;
if( $topic ) {
my $rcsSubDir = ( $TWiki::cfg{RCS}{useSubDir} ? '/RCS' : '' );
- utf8::downgrade( $topic ) if( $] >= 5.008 );
$this->{topic} = $topic;
if( $attachment ) {
- utf8::downgrade( $attachment ) if( $] >= 5.008 );
-
$this->{attachment} = $attachment;
$this->{file} = $TWiki::cfg{PubDir}.'/'.$web.'/'.
Index: lib/TWiki/Configure/Checkers/Site/Locale.pm
===================================================================
--- lib/TWiki/Configure/Checkers/Site/Locale.pm (revision 14213)
+++ lib/TWiki/Configure/Checkers/Site/Locale.pm (working copy)
@@ -56,7 +56,7 @@
# Set the default site charset
unless( defined( $TWiki::cfg{Site}{CharSet}) ) {
- $TWiki::cfg{Site}{CharSet} = 'iso-8859-15';
+ $TWiki::cfg{Site}{CharSet} = 'iso-8859-1';
}
# Extract the default site language - ignores '@euro' part of
Index: lib/TWiki/Sandbox.pm
===================================================================
--- lib/TWiki/Sandbox.pm (revision 14213)
+++ lib/TWiki/Sandbox.pm (working copy)
@@ -35,6 +35,14 @@
use Error qw( :try );
use File::Spec;
+BEGIN {
+ # Do a dynamic 'use locale' for this module
+ if( $TWiki::cfg{UseLocale} ) {
+ require locale;
+ import locale();
+ }
+}
+
# TODO: Sandbox module should probably use custom 'die' handler so that
# output goes only to web server error log - otherwise it might give
# useful debugging information to someone developing an exploit.
@@ -180,58 +188,77 @@
=cut
sub sanitizeAttachmentName {
- my $fileName = shift;
+ my $fileName = shift; # Full pathname if browser is IE
- # homegrown split because File::Spec functions will assume that directory path
- # is using / in UNIX and \ in Windows as defined in the HOST environment.
- # And we don't know the client OS. Problem is specific to IE which sends the full
- # original client path when you upload files. See Item2859 and Item2225 before
- # trying again to use File::Spec functions and remember to test with IE.
- # Cut path from filepath name (Windows '\' and Unix "/" format)
- my @pathz = ( split( /\\/, $fileName ) );
- my $filetemp = $pathz[$#pathz];
- my @pathza = ( split( '/', $filetemp ) );
- $filetemp = $pathza[$#pathza];
+ # Homegrown split equivalent because File::Spec functions will assume that
+ # directory path is using / in UNIX and \ in Windows as defined in the HOST
+ # environment. And we don't know the client OS. Problem is specific to IE
+ # which sends the full original client path when you upload files. See
+ # Item2859 and Item2225 before trying again to use File::Spec functions and
+ # remember to test with IE.
+ $fileName =~ s{[\\/]+$}{}; # Get rid of trailing slash/backslash (unlikely)
+ $fileName =~ s!^.*[\\/]!!; # Get rid of directory part
+ # FIXME: Should do filtering before untainting...
+ # untaint
+ $fileName = untaintUnchecked($fileName);
my $origName = $fileName;
- # Change spaces to underscore
- $fileName =~ s/ /_/go;
- # Strip dots and slashes at start
- # untaint at the same time
- $fileName =~ s/^([\.\/\\]*)*(.*?)$/$2/go;
- # If in iso8859 surroundings and Unicode::Normalize is available, let's get rid of 8-bit chars in filenames
- if ( defined $TWiki::cfg{Site}{CharSet} &&
- $TWiki::cfg{Site}{CharSet} =~ /^iso-?8859-?15?$/i ) {
- if( $] >= 5.008 && eval { require Unicode::Normalize } ) {
- require Encode;
- eval 'use Unicode::Normalize';
- # Some normalizations need to be intercepted early
- $fileName =~ s/\xc4/AE/g;
- $fileName =~ s/\xc5/AA/g;
- $fileName =~ s/\xd6/OE/g;
- $fileName =~ s/\xdc/UE/g;
- $fileName =~ s/\xe4/ae/g;
- $fileName =~ s/\xe5/aa/g;
- $fileName =~ s/\xf6/oe/g;
- $fileName =~ s/\xfc/ue/g;
- # convert to Unicode
- $fileName = NFD( $fileName ); # decompose (Unicode Normalization Form D)
- $fileName =~ s/\pM//g; # strip combining characters
- # normalizations, Latin-1
- $fileName =~ s/\x{00c6}/AE/g;
- $fileName =~ s/\x{00d8}/OE/g;
- $fileName =~ s/\x{00df}/ss/g;
- $fileName =~ s/\x{00e6}/ae/g;
- $fileName =~ s/\x{00f8}/oe/g;
- $fileName =~ s/\x{0152}/OE/g;
- $fileName =~ s/\x{0153}/ae/g;
- # clear everything left that is 8-bit
- $fileName =~ s/[^\0-\x80]//g;
- }
+ # WARNING - the code from here to ENDWARNING is NOT working yet - not sure
+ # why, spent some time on this but couldn't get it work (on Ubuntu Linux
+ # 5.10). Alternative approach would be good. Note that there's lots of
+ # debugging-only code still left in this section - the real working code
+ # would be just one s/// line.
+ $TWiki::THIS->writeDebug( "Filename now '$fileName' before regex");
+
+ if ( 1 ) {
+ require locale;
+ import locale ();
+
+ # Set environment variables for grep
+ $ENV{LC_CTYPE} = $TWiki::cfg{Site}{Locale};
+ $TWiki::THIS->writeDebug( "Site Locale is '$TWiki::cfg{Site}{Locale}'");
+
+ if ( exists $INC{'locale.pm'} ) {
+ $TWiki::THIS->writeDebug( "locale.pm loaded");
+ } else {
+ $TWiki::THIS->writeDebug( "locale.pm not loaded'");
+ }
+
+ # Load POSIX for I18N support.
+ require POSIX;
+ import POSIX qw( locale_h LC_CTYPE LC_COLLATE );
+
+ setlocale(&LC_CTYPE, $TWiki::cfg{Site}{Locale});
+ setlocale(&LC_COLLATE, $TWiki::cfg{Site}{Locale});
+
+ my $locale = setlocale(&LC_CTYPE);
+ $TWiki::THIS->writeDebug( "Perl Locale is '$locale'");
+
+ my $locale_bit = ($^H & 0x4) ? 1 : 0;
+ $TWiki::THIS->writeDebug( "Locale bit is $locale_bit" );
+
+ $TWiki::THIS->writeDebug( "Regex applied is '$TWiki::regex{filenameInvalidCharRegex}' ");
+
+ # NOTE: did get this working briefly using code not unlike that here,
+ # but then software rot set in.... Perhaps a Perl locale bug as setup
+ # looks exactly right?
+ $fileName =~ s/$TWiki::regex{filenameInvalidCharRegex}//g;
}
- # Remove problematic chars
+
+ # Remove non-filename characters - filtering-in of alphanumeric for security
+ # TODO: Make this configurable to off for Chinese/Japanese
+ $fileName = TWiki::cleanFilename( $fileName );
+ $TWiki::THIS->writeDebug( "Filename now '$fileName' after regex");
+
+ # ENDWARNING
+
+
+ # NOTE: This code below is filtering-out, should not be needed if the above 'WARNING' locale-based code worked.. However,
+ # using this filtering-out approach is really the only option unless/until the locale code works.
+ # Remove problematic chars (filtering-out)
$fileName =~ s/$TWiki::cfg{NameFilter}//goi;
+
# Append .txt to some files
$fileName =~ s/$TWiki::cfg{UploadFilter}/$1\.txt/goi;