From 764b85f5192eb60a3d8a02c3a93461adee03024b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tina=20M=C3=BCller?= Date: Fri, 24 Jan 2020 22:16:18 +0100 Subject: [PATCH] Fix some control character escaping and encoding issues See * #17 dump_string must not care about Perl's internal representation of a variable * #18 Dump() can create invalid YAML 1.1 --- examples/schema-perl.pm | 2 -- lib/YAML/PP.pm | 17 ++++++++++------ lib/YAML/PP/Emitter.pm | 39 +++++++++++++++++++++++++++++++----- lib/YAML/PP/Schema/Binary.pm | 13 +++++++++--- lib/YAML/PP/Schema/Perl.pm | 2 -- t/45.binary.t | 35 ++++++++++++++++++++------------ 6 files changed, 77 insertions(+), 31 deletions(-) diff --git a/examples/schema-perl.pm b/examples/schema-perl.pm index 147b7253..8da72a2a 100644 --- a/examples/schema-perl.pm +++ b/examples/schema-perl.pm @@ -75,7 +75,6 @@ EOM regexp => [ <<'EOM', my $string = 'unblessed'; - utf8::upgrade($string); qr{$string} EOM <<"EOM", @@ -88,7 +87,6 @@ EOM regexp_blessed => [ <<'EOM', my $string = 'blessed'; - utf8::upgrade($string); bless qr{$string}, "Foo" EOM <<"EOM", diff --git a/lib/YAML/PP.pm b/lib/YAML/PP.pm index 13a945ed..88f90ee1 100644 --- a/lib/YAML/PP.pm +++ b/lib/YAML/PP.pm @@ -579,14 +579,17 @@ The layout is like libyaml output: my $doc = $ypp->load_string("foo: bar"); my @docs = $ypp->load_string("foo: bar\n---\n- a"); -Input should be Unicode characters (decoded). +Input should be Unicode characters. + +So if you read from a file, you should decode it, for example with +C. =item load_file my $doc = $ypp->load_file("file.yaml"); my @docs = $ypp->load_file("file.yaml"); -Strings will be loaded as unicode characters (decoded). +Strings will be loaded as unicode characters. =item dump_string @@ -594,10 +597,13 @@ Strings will be loaded as unicode characters (decoded). my $yaml = $ypp->dump_string($doc1, $doc2); my $yaml = $ypp->dump_string(@docs); -Input strings should be Unicode characters. If not, they will be upgraded with +Input strings should be Unicode characters. C. -Output will return Unicode characters (decoded). +Output will return Unicode characters. + +So if you want to write that to a file (or pass to YAML::XS, for example), +you typically encode it via C. =item dump_file @@ -605,8 +611,7 @@ Output will return Unicode characters (decoded). $ypp->dump_file("file.yaml", $doc1, $doc2); $ypp->dump_file("file.yaml", @docs); -Input data should be UTF-8 decoded. If not, it will be upgraded with -C. +Input data should be Unicode characters. =item dump diff --git a/lib/YAML/PP/Emitter.pm b/lib/YAML/PP/Emitter.pm index 0c75cb0a..f7f640fa 100644 --- a/lib/YAML/PP/Emitter.pm +++ b/lib/YAML/PP/Emitter.pm @@ -335,13 +335,45 @@ my %control = ( "\x1d" => '\x1d', "\x1e" => '\x1e', "\x1f" => '\x1f', + "\x7f" => '\x7f', + "\x80" => '\x80', + "\x81" => '\x81', + "\x82" => '\x82', + "\x83" => '\x83', + "\x84" => '\x84', + "\x86" => '\x86', + "\x87" => '\x87', + "\x88" => '\x88', + "\x89" => '\x89', + "\x8a" => '\x8a', + "\x8b" => '\x8b', + "\x8c" => '\x8c', + "\x8d" => '\x8d', + "\x8e" => '\x8e', + "\x8f" => '\x8f', + "\x90" => '\x90', + "\x91" => '\x91', + "\x92" => '\x92', + "\x93" => '\x93', + "\x94" => '\x94', + "\x95" => '\x95', + "\x96" => '\x96', + "\x97" => '\x97', + "\x98" => '\x98', + "\x99" => '\x99', + "\x9a" => '\x9a', + "\x9b" => '\x9b', + "\x9c" => '\x9c', + "\x9d" => '\x9d', + "\x9e" => '\x9e', + "\x9f" => '\x9f', "\x{2029}" => '\P', "\x{2028}" => '\L', "\x85" => '\N', "\xa0" => '\_', ); -my $control_re = '\x00-\x08\x0b\x0c\x0e-\x1f\x{2029}\x{2028}\x85\xa0'; +my $control_re = '\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x84\x86-\x9f\x{d800}-\x{dfff}\x{fffe}\x{ffff}\x{2028}\x{2029}\x85\xa0'; my %to_escape = ( "\n" => '\n', "\t" => '\t', @@ -361,9 +393,6 @@ sub scalar_event { my $last = $stack->[-1]; my $indent = $last->{indent}; my $value = $info->{value}; - unless (utf8::is_utf8($value)) { - utf8::upgrade($value); - } my $props = ''; my $anchor = $info->{anchor}; @@ -506,7 +535,7 @@ sub scalar_event { } } else { - $value =~ s/([$escape_re"\\])/$to_escape{ $1 }/g; + $value =~ s/([$escape_re"\\])/$to_escape{ $1 } || sprintf '\\u%04x', ord($1)/eg; $value = '"' . $value . '"'; } diff --git a/lib/YAML/PP/Schema/Binary.pm b/lib/YAML/PP/Schema/Binary.pm index 360d30f9..1561295b 100644 --- a/lib/YAML/PP/Schema/Binary.pm +++ b/lib/YAML/PP/Schema/Binary.pm @@ -63,13 +63,18 @@ YAML::PP::Schema::Binary - Schema for loading and binary data my $yp = YAML::PP->new( schema => [qw/ JSON Binary /] ); # or - my $binary = $yp->load_string(<<'EOM'); + my ($binary, $same_binary) = $yp->load_string(<<'EOM'); # The binary value a tiny arrow encoded as a gif image. --- !!binary "\ R0lGODlhDAAMAIQAAP//9/X17unp5WZmZgAAAOfn515eXvPz7Y6OjuDg4J+fn5\ OTk6enp56enmlpaWNjY6Ojo4SEhP/++f/++f/++f/++f/++f/++f/++f/++f/+\ +f/++f/++f/++f/++f/++SH+Dk1hZGUgd2l0aCBHSU1QACwAAAAADAAMAAAFLC\ AgjoEwnuNAFOhpEMTRiggcz4BNJHrv/zCFcLiwMWYNG84BwwEeECcgggoBADs=" + --- !!binary | + R0lGODlhDAAMAIQAAP//9/X17unp5WZmZgAAAOfn515eXvPz7Y6OjuDg4J+fn5 + OTk6enp56enmlpaWNjY6Ojo4SEhP/++f/++f/++f/++f/++f/++f/++f/++f/+ + +f/++f/++f/++f/++f/++SH+Dk1hZGUgd2l0aCBHSU1QACwAAAAADAAMAAAFLC + AgjoEwnuNAFOhpEMTRiggcz4BNJHrv/zCFcLiwMWYNG84BwwEeECcgggoBADs= EOM =head1 DESCRIPTION @@ -77,8 +82,10 @@ YAML::PP::Schema::Binary - Schema for loading and binary data By prepending a base64 encoded binary string with the C tag, it can be automatically decoded when loading. -If you are using this schema, any string containing C<[\x{7F}-\x{10FFFF}]> -will be dumped as binary. That also includes encoded utf8. +Note that the logic for dumping is probably broken, see +L. + +Suggestions welcome. =head1 METHODS diff --git a/lib/YAML/PP/Schema/Perl.pm b/lib/YAML/PP/Schema/Perl.pm index 33b824e1..9cc3aecd 100644 --- a/lib/YAML/PP/Schema/Perl.pm +++ b/lib/YAML/PP/Schema/Perl.pm @@ -715,7 +715,6 @@ YAML: # Code my $string = 'unblessed'; - utf8::upgrade($string); qr{$string} @@ -727,7 +726,6 @@ YAML: # Code my $string = 'blessed'; - utf8::upgrade($string); bless qr{$string}, "Foo" diff --git a/t/45.binary.t b/t/45.binary.t index 840ebdac..caed5696 100644 --- a/t/45.binary.t +++ b/t/45.binary.t @@ -59,35 +59,38 @@ my @tests = ( [binary => "\xE0\x83\xBF"], [binary => "\xF0\x80\x83\xBF"], [binary => "\xF0\x80\xA3\x80"], - [binary => $gif,], + [binary => [$gif, decode_utf8("รค")],], + [binary => [$gif, 'foo'],], ); subtest roundtrip => sub { for my $item (@tests) { select undef, undef, undef, 0.1; - my ($type, $string) = @$item; + my ($type, $data) = @$item; local $Data::Dumper::Useqq = 1; - my $label = Data::Dumper->Dump([$string], ['string']); + my $label = Data::Dumper->Dump([$data], ['data']); chomp $label; note("\n\n\n=============== $type: $label"); - my $dump = $yp->dump_string($string); + my $dump = $yp->dump_string($data); + #note("========= YAML:\n$dump"); my $reload = $yp->load_string($dump); - if ($type eq 'binary') { - if (utf8::is_utf8($reload)) { - utf8::downgrade($reload); - } + if (ref $reload eq 'ARRAY') { + cmp_ok($reload->[0], 'eq', $data->[0], "Reload binary->[0] ok ($label)"); + cmp_ok($reload->[1], 'eq', $data->[1], "Reload binary->[1] ok ($label)"); + } + else { + cmp_ok($reload, 'eq', $data, "Reload binary ok ($label)"); } - cmp_ok($reload, 'eq', $string, "Reload binary ok ($label)"); } }; subtest roundtrip_binary => sub { for my $item (@tests) { - my ($type, $string) = @$item; + my ($type, $data) = @$item; local $Data::Dumper::Useqq = 1; - my $label = Data::Dumper->Dump([$string], ['string']); + my $label = Data::Dumper->Dump([$data], ['data']); note("=============== $type: $label"); - my $dump = $yp_binary->dump_string($string); + my $dump = $yp_binary->dump_string($data); if ($type eq 'binary') { like($dump, qr{!!binary}, "Output YAML contains !!binary"); } @@ -95,7 +98,13 @@ subtest roundtrip_binary => sub { unlike($dump, qr{!!binary}, "Output YAML does not contain !!binary"); } my $reload = $yp_binary->load_string($dump); - cmp_ok($reload, 'eq', $string, "Reload binary ok ($label)"); + if (ref $reload eq 'ARRAY') { + cmp_ok($reload->[0], 'eq', $data->[0], "Reload binary->[0] ok ($label)"); + cmp_ok($reload->[1], 'eq', $data->[1], "Reload binary->[1] ok ($label)"); + } + else { + cmp_ok($reload, 'eq', $data, "Reload binary ok ($label)"); + } } };