@@ -2,7 +2,7 @@ package CSS::Inliner;
2
2
use strict;
3
3
use warnings;
4
4
5
- our $VERSION = ' 4001 ' ;
5
+ our $VERSION = ' 4002 ' ;
6
6
7
7
use Carp;
8
8
use Encode;
@@ -39,7 +39,7 @@ support top level <style> declarations.
39
39
=cut
40
40
41
41
BEGIN {
42
- my $members = [' stylesheet' ,' css' ,' html' ,' html_tree' ,' query' ,' strip_attrs' ,' relaxed' ,' leave_style' ,' warns_as_errors' ,' content_warnings' ,' agent' ,' charset ' ];
42
+ my $members = [' stylesheet' ,' css' ,' html' ,' html_tree' ,' query' ,' strip_attrs' ,' relaxed' ,' leave_style' ,' warns_as_errors' ,' content_warnings' ,' agent' ,' fixlatin ' ];
43
43
44
44
# generate all the getter/setter we need
45
45
foreach my $member (@{$members }) {
@@ -109,7 +109,7 @@ sub new {
109
109
leave_style => (defined ($$params {leave_style }) && $$params {leave_style }) ? 1 : 0,
110
110
warns_as_errors => (defined ($$params {warns_as_errors }) && $$params {warns_as_errors }) ? 1 : 0,
111
111
agent => (defined ($$params {agent }) && $$params {agent }) ? $$params {agent } : ' Mozilla/4.0' ,
112
- charset => undef
112
+ fixlatin => eval { require Encoding::FixLatin; return 1; } ? 1 : 0
113
113
};
114
114
115
115
bless $self , $class ;
@@ -122,7 +122,7 @@ sub new {
122
122
=head2 fetch_file
123
123
124
124
Fetches a remote HTML file that supposedly contains both HTML and a
125
- style declaration, properly tags the data with the proper characterset
125
+ style declaration, tags the data with the proper charset
126
126
as provided by the remote webserver (if any). Subsequently calls the
127
127
read method automatically.
128
128
@@ -158,7 +158,21 @@ sub fetch_file {
158
158
159
159
my $charset = $self -> detect_charset({ content => $content , charset => $$params {charset }, ctcharset => $ctcharset });
160
160
161
- my $decoded_html = $self -> decode_characters({ content => $content , charset => $charset });
161
+ my $decoded_html ;
162
+ if ($charset ) {
163
+ $decoded_html = $self -> decode_characters({ content => $content , charset => $charset });
164
+ }
165
+ else {
166
+ # no good hints found, do the best we can
167
+
168
+ if ($self -> _fixlatin()) {
169
+ Encoding::FixLatin-> import (' fix_latin' );
170
+ $decoded_html = fix_latin($content );
171
+ }
172
+ else {
173
+ $decoded_html = $self -> decode_characters({ content => $content , charset => ' ascii' });
174
+ }
175
+ }
162
176
163
177
my $html = $self -> _absolutize_references({ content => $decoded_html , baseref => $baseref });
164
178
@@ -170,8 +184,8 @@ sub fetch_file {
170
184
=head2 read_file
171
185
172
186
Opens and reads an HTML file that supposedly contains both HTML and a
173
- style declaration. It subsequently calls the read() method
174
- automatically.
187
+ style declaration, tags the data with the proper charset
188
+ if specified. It subsequently calls the read() method automatically.
175
189
176
190
This method requires you to pass in a params hash that contains a
177
191
filename argument. For example:
@@ -203,7 +217,21 @@ sub read_file {
203
217
204
218
my $charset = $self -> detect_charset({ content => $content , charset => $$params {charset } });
205
219
206
- my $decoded_html = $self -> decode_characters({ content => $content , charset => $charset });
220
+ my $decoded_html ;
221
+ if ($charset ) {
222
+ $decoded_html = $self -> decode_characters({ content => $content , charset => $charset });
223
+ }
224
+ else {
225
+ # no good hints found, do the best we can
226
+
227
+ if ($self -> _fixlatin()) {
228
+ Encoding::FixLatin-> import (' fix_latin' );
229
+ $decoded_html = fix_latin($content );
230
+ }
231
+ else {
232
+ $decoded_html = $self -> decode_characters({ content => $content , charset => ' ascii' });
233
+ }
234
+ }
207
235
208
236
$self -> read ({ html => $decoded_html , charset => $charset });
209
237
@@ -249,7 +277,6 @@ sub read {
249
277
$self -> _html_tree-> parse_content($$params {html });
250
278
251
279
$self -> _init_query();
252
- $self -> _charset($$params {charset });
253
280
254
281
# suck in the styles for later use from the head section - stylesheets anywhere else are invalid
255
282
my $stylesheet = $self -> _parse_stylesheet();
@@ -270,6 +297,11 @@ which lays out a recommendation for determining the character set of a received
270
297
can be seen here under the "determining the character encoding" section:
271
298
http://www.w3.org/TR/html5/syntax.html
272
299
300
+ NOTE: In the event that no charset can be identified, the library will handle the content as a mix of
301
+ UTF-8/CP-1252/ISO-8859-1/ASCII by attempting to use the Encoding::FixLatin module, as this combination
302
+ is relatively common in the wild. If Encoding::FixLatin is unavailable, the content will be
303
+ treated as ASCII.
304
+
273
305
Input Parameters:
274
306
content - scalar presumably containing both html and css
275
307
charset - (optional) programmer specified charset for the passed content
@@ -304,8 +336,7 @@ sub detect_charset {
304
336
$charset = $meta_charset ;
305
337
}
306
338
else {
307
- # no hints found, assume ascii until support for additional steps from the working group is added
308
- $charset = ' ascii' ;
339
+ # no hints found...
309
340
}
310
341
}
311
342
@@ -938,21 +969,23 @@ sub _extract_meta_charset {
938
969
939
970
my $head = $doc -> look_down(" _tag" , " head" ); # there should only be one
940
971
941
- # pull key header meta elements
942
- my $meta_charset_elem = $head -> look_down(' _tag' ,' meta' ,' charset' ,qr / ./ );
943
- my $meta_equiv_charset_elem = $head -> look_down(' _tag' ,' meta' ,' http-equiv' ,qr / content-type/ i ,' content' ,qr / ./ );
944
-
945
- # assign meta charset, we give precedence to meta http_equiv content type
946
972
my $meta_charset ;
947
- if ($meta_equiv_charset_elem ) {
948
- my $meta_equiv_content = $meta_equiv_charset_elem -> attr(' content' );
973
+ if ($head ) {
974
+ # pull key header meta elements
975
+ my $meta_charset_elem = $head -> look_down(' _tag' ,' meta' ,' charset' ,qr / ./ );
976
+ my $meta_equiv_charset_elem = $head -> look_down(' _tag' ,' meta' ,' http-equiv' ,qr / content-type/ i ,' content' ,qr / ./ );
977
+
978
+ # assign meta charset, we give precedence to meta http_equiv content type
979
+ if ($meta_equiv_charset_elem ) {
980
+ my $meta_equiv_content = $meta_equiv_charset_elem -> attr(' content' );
949
981
950
- if ($meta_equiv_content =~ / charset=(.*)(?:[";,]?)/i ) {
951
- $meta_charset = $1 ;
982
+ if ($meta_equiv_content =~ / charset=(.*)(?:[";,]?)/i ) {
983
+ $meta_charset = $1 ;
984
+ }
985
+ }
986
+ elsif ($meta_charset_elem ) {
987
+ $meta_charset = $meta_charset_elem -> attr(' charset' );
952
988
}
953
- }
954
- elsif ($meta_charset_elem ) {
955
- $meta_charset = $meta_charset_elem -> attr(' charset' );
956
989
}
957
990
958
991
return $meta_charset ;
0 commit comments