Index: lib/TWiki.pm =================================================================== --- lib/TWiki.pm (revision 14213) +++ lib/TWiki.pm (working copy) @@ -219,7 +219,7 @@ ICONURLPATH => \&ICONURLPATH, IF => \&IF, INCLUDE => \&INCLUDE, - INTURLENCODE => \&INTURLENCODE, + INTURLENCODE => \&INTURLENCODE_deprecated, LANGUAGES => \&LANGUAGES, MAKETEXT => \&MAKETEXT, META => \&META, @@ -341,12 +341,13 @@ # Load POSIX for I18N support. require POSIX; - import POSIX qw( locale_h LC_CTYPE ); + import POSIX qw( locale_h LC_CTYPE LC_COLLATE ); # SMELL: mod_perl compatibility note: If TWiki is running under Apache, # won't this play with the Apache process's locale settings too? # What effects would this have? setlocale(&LC_CTYPE, $TWiki::cfg{Site}{Locale}); + setlocale(&LC_COLLATE, $TWiki::cfg{Site}{Locale}); } $functionTags{CHARSET} = sub { $TWiki::cfg{Site}{CharSet} || @@ -420,8 +421,10 @@ # characters allowed $regex{emailAddrRegex} = qr/([A-Za-z0-9\.\+\-\_]+\@[A-Za-z0-9\.\-]+)/; - # Filename regex, for attachments - $regex{filenameRegex} = qr/[$regex{mixedAlphaNum}\.]+/o; + # Filename regex to filter out invalid characters in attachments - allow + # I18N characters, spaces, etc + $regex{filenameInvalidCharRegex} = qr/[^$regex{mixedAlphaNum}\. _-]/o; + # $regex{filenameInvalidCharRegex} = "[^$regex{mixedAlphaNum}\. _-]"; # Multi-character alpha-based regexes $regex{mixedAlphaNumRegex} = qr/[$regex{mixedAlphaNum}]*/o; @@ -532,17 +535,18 @@ if ( $TWiki::cfg{Site}{CharSet} =~ /^utf-?8$/i ) { # warn if using Perl older than 5.8 if( $] < 5.008 ) { - $this->writeWarning( 'UTF-8 not supported on Perl '.$]. + $this->writeWarning( 'UTF-8 not remotely supported on Perl '.$]. ' - use Perl 5.8 or higher..' ); } - # SMELL: is this true yet? + # We still don't have Codev.UnicodeSupport $this->writeWarning( 'UTF-8 not yet supported as site charset -'. 'TWiki is likely to have problems' ); return $text; } - # Convert into ISO-8859-1 if it is the site charset + # Convert into ISO-8859-1 if it is the site charset. This conversion + # is *not valid for ISO-8859-15*. if ( $TWiki::cfg{Site}{CharSet} =~ /^iso-?8859-?1$/i ) { # ISO-8859-1 maps onto first 256 codepoints of Unicode # (conversion from 'perldoc perluniintro') @@ -629,9 +633,9 @@ } unless( $this->inContext('command_line')) { - # can't use simple length() in case we have UNICODE - # see perldoc -f length - my $len = do { use bytes; length( $text ); }; + # FIXME: Defer next line until we have Codev.UnicodeSupport - too 5.8 dependent + # my $len = do { use bytes; length( $text ); }; + my $len = length( $text ); $this->writePageHeader( undef, $pageType, $contentType, $len ); } print $text; @@ -955,6 +959,50 @@ =pod +=pod + +---++ StaticMethod cleanFilename( $filename ) -> $cleanedFilename + +Clean up attachment filename, including only alphanumeric filename +characters (including I18N characters). Located in TWiki.pm because +it's hard to get I18N regexes to work in Sandbox.pm. + +=cut + +sub cleanFilename { + my( $filename ) = @_; + + my $this = $TWiki::THIS; + + if ( $TWiki::cfg{UseLocale} ) { + # Set environment variables for grep + $ENV{LC_CTYPE} = $TWiki::cfg{Site}{Locale}; + + require POSIX; + import POSIX qw( locale_h LC_CTYPE LC_COLLATE ); + + setlocale(&LC_CTYPE, $TWiki::cfg{Site}{Locale}); + setlocale(&LC_COLLATE, $TWiki::cfg{Site}{Locale}); + } + if ( exists $INC{'locale.pm'} ) { + $TWiki::THIS->writeDebug( "locale.pm loaded'"); + } else { + $TWiki::THIS->writeDebug( "locale.pm not loaded'"); + } + + $this->writeDebug( "Regex applied is '$TWiki::regex{filenameInvalidCharRegex}' "); + + # TODO: Make this configurable to no validation for Chinese/Japanese + $filename =~ s/$regex{filenameInvalidCharRegex}//g; + return $filename; +} + + +=pod + + +=pod + ---++ StaticMethod isValidWebName( $name, $system ) -> $boolean STATIC Check for a valid web name. If $system is true, then @@ -1158,8 +1206,14 @@ $this->normalizeWebTopicName( $web, $topic ); my $path = '/'.$web.'/'.$topic; - $path .= '/'.$attachment if $attachment; - $url .= urlEncode( $path ); + if( $attachment ) { + $path .= '/'.$attachment; + # Attachments are served directly by web server, need to handle + # URL encoding specially + $url .= urlEncodeAttachment ( $path ); + } else { + $url .= urlEncode( $path ); + } } return $url; @@ -1264,6 +1318,9 @@ =cut +# RD: HACK for debugging +our $THIS; + sub new { my( $class, $login, $query, $initialContext ) = @_; @@ -1271,6 +1328,8 @@ $query ||= new CGI( {} ); my $this = bless( {}, $class ); + $THIS = $this; # RD: HACK for debugging + $this->{_HTMLHEADERS} = {}; $this->{context} = $initialContext || {}; @@ -1359,9 +1418,9 @@ $web = $TWiki::cfg{UsersWebName} unless $web; $this->{webName} = TWiki::Sandbox::untaintUnchecked( $web ); - # Convert UTF-8 web and topic name from URL into site charset - # if necessary - no effect if URL is not in UTF-8 - # handle topic and web names seperately; encoding is not necessarily shared + # Convert UTF-8 web and topic name from URL into site charset if necessary + # SMELL: merge these two cases, browsers just don't mix two encodings in one URL + # - can also simplify into 2 lines by making function return unprocessed text if no conversion my $webNameTemp = $this->UTF82SiteCharSet( $this->{webName} ); if ( $webNameTemp ) { $this->{webName} = $webNameTemp; @@ -1374,6 +1433,7 @@ # Item3270 - here's the appropriate place to enforce TWiki spec: # All topic name sources are evaluated, site charset applied + # SMELL: This untaint unchecked is duplicate of one just above $this->{topicName} = TWiki::Sandbox::untaintUnchecked(ucfirst $this->{topicName}); @@ -2185,6 +2245,47 @@ =pod +---++ StaticMethod urlEncodeAttachment ( $text ) + +For attachments, URL-encode specially to 'freeze' any characters >127 in the +site charset (e.g. ISO-8859-1 or KOI8-R), by doing URL encoding into native +charset ($siteCharset) - used when generating attachment URLs, to enable the +web server to serve attachments, including images, directly. + +This encoding is required to handle the cases of: + + - browsers that generate UTF-8 URLs automatically from site charset URLs - now quite common + - web servers that directly serve attachments, using the site charset for + filenames, and cannot convert UTF-8 URLs into site charset filenames + +The aim is to prevent the browser from converting a site charset URL in the web +page to a UTF-8 URL, which is the default. Hence we 'freeze' the URL into the +site character set through URL encoding. + +In two cases, no URL encoding is needed: For EBCDIC mainframes, we assume that +site charset URLs will be translated (outbound and inbound) by the web server to/from an +EBCDIC character set. For sites running in UTF-8, there's no need for TWiki to +do anything since all URLs and attachment filenames are already in UTF-8. + +=cut + +sub urlEncodeAttachment { + my( $text ) = @_; + + my $usingEBCDIC = ( 'A' eq chr(193) ); # Only true on EBCDIC mainframes + + if( $TWiki::cfg{Site}{CharSet} eq "utf-8" or $usingEBCDIC ) { + # Just let browser do UTF-8 URL encoding + return $text; + } + + # Freeze into site charset through URL encoding + return urlEncode( $text ); +} + + +=pod + ---++ StaticMethod urlEncode( $string ) -> encoded string Encode by converting characters that are illegal in URLs to @@ -2194,18 +2295,22 @@ characters such as = and ?. RFC 1738, Dec. '94: -> -...Only alphanumerics [0-9a-zA-Z], the special -characters $-_.+!*'(), and reserved characters used for their -reserved purposes may be used unencoded within a URL. - + + ...Only alphanumerics [0-9a-zA-Z], the special + characters $-_.+!*'(), and reserved characters used for their + reserved purposes may be used unencoded within a URL. + + Reserved characters are $&+,/:;=?@ - these are _also_ encoded by this method. -SMELL: For non-ISO-8859-1 $TWiki::cfg{Site}{CharSet}, need to convert to -UTF-8 before URL encoding. This encoding only supports 8-bit -character codes. +This URL-encoding handles all character encodings including ISO-8859-*, +KOI8-R, EUC-* and UTF-8. +This may not handle EBCDIC properly, as it generates an EBCDIC URL-encoded +URL, but mainframe web servers seem to translate this outbound before it hits browser +- see CGI::Util::escape for another approach. + =cut sub urlEncode { @@ -3367,7 +3472,7 @@ # TWiki Feb2003 release - encoding is no longer needed since UTF-URLs are now # directly supported, but it is provided for backward compatibility with # skins that may still be using the deprecated %INTURLENCODE%. -sub INTURLENCODE { +sub INTURLENCODE_deprecated { my( $this, $params ) = @_; # Just strip double quotes, no URL encoding - Mozilla UTF-8 URLs # directly supported now Index: lib/TWiki/Attach.pm =================================================================== --- lib/TWiki/Attach.pm (revision 14213) +++ lib/TWiki/Attach.pm (working copy) @@ -275,6 +275,8 @@ my $imgSize = ''; my $prefs = $this->{session}->{prefs}; my $store = $this->{session}->{store}; + # I18N: URL-encode the attachment filename + my $fileURL = TWiki::urlEncodeAttachment( $attName ); if( $attName =~ /\.(gif|jpg|jpeg|png)$/i ) { # inline image @@ -296,7 +298,7 @@ $fileLink = $prefs->getPreferencesValue( 'ATTACHEDIMAGEFORMAT' ); unless( $fileLink ) { - push( @attrs, src=>"%ATTACHURLPATH%/$attName" ); + push( @attrs, src=>"%ATTACHURLPATH%/$fileURL" ); push( @attrs, alt=>$attName ); return " * $fileComment: ".CGI::br().CGI::img({ @attrs }); } @@ -304,11 +306,15 @@ # normal attached file $fileLink = $prefs->getPreferencesValue( 'ATTACHEDFILELINKFORMAT' ); unless( $fileLink ) { - return " * [[%ATTACHURL%/$attName][$attName]]: $fileComment"; + return " * [[%ATTACHURL%/$fileURL][$attName]]: $fileComment"; } } - $fileLink =~ s/\$name/$attName/g; + # I18N: Site specified %ATTACHEDIMAGEFORMAT% or %ATTACHEDFILELINKFORMAT%, + # ensure that filename is URL encoded - first $name must be URL. + $fileLink =~ s/\$name/$fileURL/; + $fileLink =~ s/\$name/$attName/; + $fileLink =~ s/\$comment/$fileComment/g; $fileLink =~ s/\$size/$imgSize/g; $fileLink =~ s/\\t/\t/go; @@ -328,8 +334,7 @@ my( $x, $y) = ( 0, 0 ); if( defined( $file ) ) { - # for crappy MS OSes - Win/Dos/NT use is NOT SUPPORTED - binmode( $file ); + binmode( $file ); # For Windows my $s; return ( 0, 0 ) unless ( read( $file, $s, 4 ) == 4 ); seek( $file, 0, 0 ); Index: lib/TWiki/Store/RcsFile.pm =================================================================== --- lib/TWiki/Store/RcsFile.pm (revision 14213) +++ lib/TWiki/Store/RcsFile.pm (working copy) @@ -59,18 +59,14 @@ my( $class, $session, $web, $topic, $attachment ) = @_; my $this = bless( { session => $session }, $class ); - utf8::downgrade( $web ) if( $web && $] >= 5.008 ); $this->{web} = $web; if( $topic ) { my $rcsSubDir = ( $TWiki::cfg{RCS}{useSubDir} ? '/RCS' : '' ); - utf8::downgrade( $topic ) if( $] >= 5.008 ); $this->{topic} = $topic; if( $attachment ) { - utf8::downgrade( $attachment ) if( $] >= 5.008 ); - $this->{attachment} = $attachment; $this->{file} = $TWiki::cfg{PubDir}.'/'.$web.'/'. Index: lib/TWiki/Configure/Checkers/Site/Locale.pm =================================================================== --- lib/TWiki/Configure/Checkers/Site/Locale.pm (revision 14213) +++ lib/TWiki/Configure/Checkers/Site/Locale.pm (working copy) @@ -56,7 +56,7 @@ # Set the default site charset unless( defined( $TWiki::cfg{Site}{CharSet}) ) { - $TWiki::cfg{Site}{CharSet} = 'iso-8859-15'; + $TWiki::cfg{Site}{CharSet} = 'iso-8859-1'; } # Extract the default site language - ignores '@euro' part of Index: lib/TWiki/Sandbox.pm =================================================================== --- lib/TWiki/Sandbox.pm (revision 14213) +++ lib/TWiki/Sandbox.pm (working copy) @@ -35,6 +35,14 @@ use Error qw( :try ); use File::Spec; +BEGIN { + # Do a dynamic 'use locale' for this module + if( $TWiki::cfg{UseLocale} ) { + require locale; + import locale(); + } +} + # TODO: Sandbox module should probably use custom 'die' handler so that # output goes only to web server error log - otherwise it might give # useful debugging information to someone developing an exploit. @@ -180,58 +188,77 @@ =cut sub sanitizeAttachmentName { - my $fileName = shift; + my $fileName = shift; # Full pathname if browser is IE - # homegrown split because File::Spec functions will assume that directory path - # is using / in UNIX and \ in Windows as defined in the HOST environment. - # And we don't know the client OS. Problem is specific to IE which sends the full - # original client path when you upload files. See Item2859 and Item2225 before - # trying again to use File::Spec functions and remember to test with IE. - # Cut path from filepath name (Windows '\' and Unix "/" format) - my @pathz = ( split( /\\/, $fileName ) ); - my $filetemp = $pathz[$#pathz]; - my @pathza = ( split( '/', $filetemp ) ); - $filetemp = $pathza[$#pathza]; + # Homegrown split equivalent because File::Spec functions will assume that + # directory path is using / in UNIX and \ in Windows as defined in the HOST + # environment. And we don't know the client OS. Problem is specific to IE + # which sends the full original client path when you upload files. See + # Item2859 and Item2225 before trying again to use File::Spec functions and + # remember to test with IE. + $fileName =~ s{[\\/]+$}{}; # Get rid of trailing slash/backslash (unlikely) + $fileName =~ s!^.*[\\/]!!; # Get rid of directory part + # FIXME: Should do filtering before untainting... + # untaint + $fileName = untaintUnchecked($fileName); my $origName = $fileName; - # Change spaces to underscore - $fileName =~ s/ /_/go; - # Strip dots and slashes at start - # untaint at the same time - $fileName =~ s/^([\.\/\\]*)*(.*?)$/$2/go; - # If in iso8859 surroundings and Unicode::Normalize is available, let's get rid of 8-bit chars in filenames - if ( defined $TWiki::cfg{Site}{CharSet} && - $TWiki::cfg{Site}{CharSet} =~ /^iso-?8859-?15?$/i ) { - if( $] >= 5.008 && eval { require Unicode::Normalize } ) { - require Encode; - eval 'use Unicode::Normalize'; - # Some normalizations need to be intercepted early - $fileName =~ s/\xc4/AE/g; - $fileName =~ s/\xc5/AA/g; - $fileName =~ s/\xd6/OE/g; - $fileName =~ s/\xdc/UE/g; - $fileName =~ s/\xe4/ae/g; - $fileName =~ s/\xe5/aa/g; - $fileName =~ s/\xf6/oe/g; - $fileName =~ s/\xfc/ue/g; - # convert to Unicode - $fileName = NFD( $fileName ); # decompose (Unicode Normalization Form D) - $fileName =~ s/\pM//g; # strip combining characters - # normalizations, Latin-1 - $fileName =~ s/\x{00c6}/AE/g; - $fileName =~ s/\x{00d8}/OE/g; - $fileName =~ s/\x{00df}/ss/g; - $fileName =~ s/\x{00e6}/ae/g; - $fileName =~ s/\x{00f8}/oe/g; - $fileName =~ s/\x{0152}/OE/g; - $fileName =~ s/\x{0153}/ae/g; - # clear everything left that is 8-bit - $fileName =~ s/[^\0-\x80]//g; - } + # WARNING - the code from here to ENDWARNING is NOT working yet - not sure + # why, spent some time on this but couldn't get it work (on Ubuntu Linux + # 5.10). Alternative approach would be good. Note that there's lots of + # debugging-only code still left in this section - the real working code + # would be just one s/// line. + $TWiki::THIS->writeDebug( "Filename now '$fileName' before regex"); + + if ( 1 ) { + require locale; + import locale (); + + # Set environment variables for grep + $ENV{LC_CTYPE} = $TWiki::cfg{Site}{Locale}; + $TWiki::THIS->writeDebug( "Site Locale is '$TWiki::cfg{Site}{Locale}'"); + + if ( exists $INC{'locale.pm'} ) { + $TWiki::THIS->writeDebug( "locale.pm loaded"); + } else { + $TWiki::THIS->writeDebug( "locale.pm not loaded'"); + } + + # Load POSIX for I18N support. + require POSIX; + import POSIX qw( locale_h LC_CTYPE LC_COLLATE ); + + setlocale(&LC_CTYPE, $TWiki::cfg{Site}{Locale}); + setlocale(&LC_COLLATE, $TWiki::cfg{Site}{Locale}); + + my $locale = setlocale(&LC_CTYPE); + $TWiki::THIS->writeDebug( "Perl Locale is '$locale'"); + + my $locale_bit = ($^H & 0x4) ? 1 : 0; + $TWiki::THIS->writeDebug( "Locale bit is $locale_bit" ); + + $TWiki::THIS->writeDebug( "Regex applied is '$TWiki::regex{filenameInvalidCharRegex}' "); + + # NOTE: did get this working briefly using code not unlike that here, + # but then software rot set in.... Perhaps a Perl locale bug as setup + # looks exactly right? + $fileName =~ s/$TWiki::regex{filenameInvalidCharRegex}//g; } - # Remove problematic chars + + # Remove non-filename characters - filtering-in of alphanumeric for security + # TODO: Make this configurable to off for Chinese/Japanese + $fileName = TWiki::cleanFilename( $fileName ); + $TWiki::THIS->writeDebug( "Filename now '$fileName' after regex"); + + # ENDWARNING + + + # NOTE: This code below is filtering-out, should not be needed if the above 'WARNING' locale-based code worked.. However, + # using this filtering-out approach is really the only option unless/until the locale code works. + # Remove problematic chars (filtering-out) $fileName =~ s/$TWiki::cfg{NameFilter}//goi; + # Append .txt to some files $fileName =~ s/$TWiki::cfg{UploadFilter}/$1\.txt/goi;