Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use pdfScanner in ReadVerticalDisplacements and fix #693 and return 0… #928

Merged
merged 1 commit into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
30 changes: 30 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,36 @@

public class GithubIssuesTests
{
/// <summary>
/// Regression test for GitHub issue #693: opens "reference-2-numeric-error.pdf" with
/// lenient parsing and missing fonts skipped, and checks the letter count of page 1.
/// </summary>
[Fact]
public void Issue693()
{
    var path = IntegrationHelpers.GetDocumentPath("reference-2-numeric-error.pdf");
    var options = new ParsingOptions
    {
        UseLenientParsing = true,
        SkipMissingFonts = true
    };

    using var pdf = PdfDocument.Open(path, options);

    var firstPage = pdf.GetPage(1);

    Assert.Equal(1269, firstPage.Letters.Count);
}

/// <summary>
/// Regression test for GitHub issue #692 using "cmap-parsing-exception.pdf".
/// With lenient parsing (and missing fonts skipped) page 1 must yield 796 letters;
/// with strict parsing, reading page 1 must throw an <see cref="InvalidOperationException"/>
/// whose message starts with the "Read byte called on input bytes..." text.
/// </summary>
[Fact]
public void Issue692()
{
    var path = IntegrationHelpers.GetDocumentPath("cmap-parsing-exception.pdf");

    // Lenient pass: the document is expected to parse and expose its letters.
    var lenientOptions = new ParsingOptions
    {
        UseLenientParsing = true,
        SkipMissingFonts = true
    };

    using (var lenientDocument = PdfDocument.Open(path, lenientOptions))
    {
        var firstPage = lenientDocument.GetPage(1);
        Assert.Equal(796, firstPage.Letters.Count);
    }

    // Strict pass: the same page is expected to fail while being read.
    var strictOptions = new ParsingOptions
    {
        UseLenientParsing = false,
        SkipMissingFonts = false
    };

    using (var strictDocument = PdfDocument.Open(path, strictOptions))
    {
        var thrown = Assert.Throws<InvalidOperationException>(() => strictDocument.GetPage(1));
        Assert.StartsWith("Read byte called on input bytes which was at end of byte set.", thrown.Message);
    }
}

[Fact]
public void Issue874()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ public class IntegrationDocumentTests
[
"issue_671.pdf",
"GHOSTSCRIPT-698363-0.pdf",
"ErcotFacts.pdf"
"ErcotFacts.pdf",
"cmap-parsing-exception.pdf"
];

[Theory]
Expand Down
2 changes: 1 addition & 1 deletion src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ private static PdfDocument OpenDocument(
cidFontFactory,
filterProvider,
pdfScanner,
parsingOptions.Logger);
parsingOptions);

var type1Handler = new Type1FontHandler(pdfScanner, filterProvider, encodingReader);

Expand Down
5 changes: 2 additions & 3 deletions src/UglyToad.PdfPig/PdfExtensions.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
namespace UglyToad.PdfPig
{
using System;
using System.Collections.Generic;
using System;
using System.Diagnostics.CodeAnalysis;
using Core;
using Filters;
using Parser.Parts;
using Parser.Parts;
using Tokenization.Scanner;
using Tokens;

Expand Down
18 changes: 11 additions & 7 deletions src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
/// The CMap (character code map) maps character codes to character identifiers (CIDs).
/// The set of characters which a CMap refers to is the "character set" (charset).
/// </summary>
internal class CMap
internal sealed class CMap
{
public CharacterIdentifierSystemInfo Info { get; }

Expand Down Expand Up @@ -140,13 +140,12 @@ public int ConvertToCid(int code)
return 0;
}


/// <summary>
/// Returns the value of <c>Name</c> as the string representation of this instance.
/// </summary>
public override string ToString() => Name;

public int ReadCode(IInputBytes bytes)
public int ReadCode(IInputBytes bytes, bool useLenientParsing)
{
if (hasEmptyCodespace)
{
Expand All @@ -166,7 +165,7 @@ public int ReadCode(IInputBytes bytes)
break;
}

result[i] = ReadByte(bytes);
result[i] = ReadByte(bytes, useLenientParsing);
}

for (int i = minCodeLength - 1; i < maxCodeLength; i++)
Expand All @@ -181,17 +180,23 @@ public int ReadCode(IInputBytes bytes)
}
if (byteCount < maxCodeLength)
{
result[byteCount] = ReadByte(bytes);
result[byteCount] = ReadByte(bytes, useLenientParsing);
}
}

throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}.");
}

private static byte ReadByte(IInputBytes bytes)
private static byte ReadByte(IInputBytes bytes, bool useLenientParsing)
{
if (!bytes.MoveNext())
{
if (useLenientParsing)
{
// See issue #692
return 0;
}

throw new InvalidOperationException("Read byte called on input bytes which was at end of byte set. Current offset: " + bytes.CurrentOffset);
}

Expand All @@ -208,6 +213,5 @@ private static int ByteArrayToInt(ReadOnlySpan<byte> data)
}
return code;
}

}
}
6 changes: 3 additions & 3 deletions src/UglyToad.PdfPig/PdfFonts/Composite/ToUnicodeCMap.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
/// Defines the information content (actual text) of the font
/// as opposed to the display format.
/// </summary>
internal class ToUnicodeCMap
internal sealed class ToUnicodeCMap
{
private readonly CMap? cMap;

Expand Down Expand Up @@ -45,9 +45,9 @@ public bool TryGet(int code, [NotNullWhen(true)] out string? value)
return cMap.TryConvertToUnicode(code, out value);
}

public int ReadCode(IInputBytes inputBytes)
public int ReadCode(IInputBytes inputBytes, bool useLenientParsing)
{
return cMap!.ReadCode(inputBytes);
return cMap!.ReadCode(inputBytes, useLenientParsing);
}
}
}
7 changes: 6 additions & 1 deletion src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ internal sealed class Type0Font : IFont, IVerticalWritingSupported
private readonly Dictionary<int, CharacterBoundingBox> boundingBoxCache
= new Dictionary<int, CharacterBoundingBox>();

private readonly bool useLenientParsing;

public NameToken Name => BaseFont;

public NameToken BaseFont { get; }
Expand All @@ -41,6 +43,7 @@ public Type0Font(
CMap cmap,
CMap? toUnicodeCMap,
CMap? ucs2CMap,
ParsingOptions parsingOptions,
bool isChineseJapaneseOrKorean)
{
this.ucs2CMap = ucs2CMap;
Expand All @@ -52,13 +55,15 @@ public Type0Font(
ToUnicode = new ToUnicodeCMap(toUnicodeCMap);
Details = cidFont.Details?.WithName(Name.Data)
?? FontDetails.GetDefault(Name.Data);

useLenientParsing = parsingOptions.UseLenientParsing;
}

public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
{
var current = bytes.CurrentOffset;

var code = CMap.ReadCode(bytes);
var code = CMap.ReadCode(bytes, useLenientParsing);

codeLength = (int)(bytes.CurrentOffset - current);

Expand Down
10 changes: 6 additions & 4 deletions src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type0FontHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,25 @@
using Tokens;
using Util;

internal class Type0FontHandler : IFontHandler
internal sealed class Type0FontHandler : IFontHandler
{
private readonly CidFontFactory cidFontFactory;
private readonly ILookupFilterProvider filterProvider;
private readonly IPdfTokenScanner scanner;
private readonly ILog logger;
private readonly ParsingOptions parsingOptions;

public Type0FontHandler(
CidFontFactory cidFontFactory,
ILookupFilterProvider filterProvider,
IPdfTokenScanner scanner,
ILog logger)
ParsingOptions parsingOptions)
{
this.cidFontFactory = cidFontFactory;
this.filterProvider = filterProvider;
this.scanner = scanner;
this.logger = logger;
logger = parsingOptions.Logger;
this.parsingOptions = parsingOptions;
}

public IFont Generate(DictionaryToken dictionary)
Expand Down Expand Up @@ -91,7 +93,7 @@ public IFont Generate(DictionaryToken dictionary)
}
}

var font = new Type0Font(baseFont!, cidFont, cMap, toUnicodeCMap, ucs2CMap, isChineseJapaneseOrKorean);
var font = new Type0Font(baseFont!, cidFont, cMap, toUnicodeCMap, ucs2CMap, parsingOptions, isChineseJapaneseOrKorean);

return font;
}
Expand Down
27 changes: 13 additions & 14 deletions src/UglyToad.PdfPig/PdfFonts/Parser/Parts/CidFontFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
using UglyToad.PdfPig.Logging;
using Util;

internal class CidFontFactory
internal sealed class CidFontFactory
{
private readonly ILookupFilterProvider filterProvider;
private readonly IPdfTokenScanner pdfScanner;
Expand Down Expand Up @@ -46,7 +46,7 @@ public CidFontFactory(ILog log, IPdfTokenScanner pdfScanner, ILookupFilterProvid
defaultWidth = defaultWidthToken.Double;
}

var verticalWritingMetrics = ReadVerticalDisplacements(dictionary);
var verticalWritingMetrics = ReadVerticalDisplacements(dictionary, pdfScanner);

FontDescriptor? descriptor = null;
if (TryGetFontDescriptor(dictionary, out var descriptorDictionary))
Expand Down Expand Up @@ -190,7 +190,7 @@ private IReadOnlyDictionary<int, double> ReadWidths(DictionaryToken dict)
return widths;
}

private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken dict)
private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken dict, IPdfTokenScanner pdfScanner)
{
var verticalDisplacements = new Dictionary<int, double>();
var positionVectors = new Dictionary<int, PdfVector>();
Expand All @@ -210,22 +210,21 @@ private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken
}

// vertical metrics for individual CIDs.
if (dict.TryGet(NameToken.W2, out var w2Token) && w2Token is ArrayToken w2)
if (dict.TryGet(NameToken.W2, pdfScanner, out ArrayToken? w2))
{
for (var i = 0; i < w2.Data.Count; i++)
{
var c = (NumericToken)w2.Data[i];
var c = DirectObjectFinder.Get<NumericToken>(w2.Data[i], pdfScanner);
var next = w2.Data[++i];

if (next is ArrayToken array)
if (DirectObjectFinder.TryGet(next, pdfScanner, out ArrayToken? array))
{
for (var j = 0; j < array.Data.Count; j++)
{
var cid = c.Int + j;
// ReSharper disable InconsistentNaming
var w1y = (NumericToken)array.Data[j];
var v1x = (NumericToken)array.Data[++j];
var v1y = (NumericToken)array.Data[++j];
var w1y = DirectObjectFinder.Get<NumericToken>(array.Data[j], pdfScanner);
var v1x = DirectObjectFinder.Get<NumericToken>(array.Data[++j], pdfScanner);
var v1y = DirectObjectFinder.Get<NumericToken>(array.Data[++j], pdfScanner);

verticalDisplacements[cid] = w1y.Double;

Expand All @@ -236,9 +235,9 @@ private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken
{
var first = c.Int;
var last = ((NumericToken)next).Int;
var w1y = (NumericToken)w2.Data[++i];
var v1x = (NumericToken)w2.Data[++i];
var v1y = (NumericToken)w2.Data[++i];
var w1y = DirectObjectFinder.Get<NumericToken>(w2.Data[++i], pdfScanner);
var v1x = DirectObjectFinder.Get<NumericToken>(w2.Data[++i], pdfScanner);
var v1y = DirectObjectFinder.Get<NumericToken>(w2.Data[++i], pdfScanner);
// ReSharper restore InconsistentNaming

for (var cid = first; cid <= last; cid++)
Expand All @@ -250,7 +249,7 @@ private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken
}
}
}

return new VerticalWritingMetrics(dw2, verticalDisplacements, positionVectors);
}

Expand Down
Loading