{-# LANGUAGE OverloadedStrings #-}

-- | PDF document page

module Pdf.Document.Page
(
  Page,
  pageParentNode,
  pageContents,
  pageMediaBox,
  pageFontDicts,
  pageExtractText,
  pageExtractGlyphs,
  glyphsToText
)
where

import Pdf.Core.Object
import Pdf.Core.Object.Util
import Pdf.Core.Exception
import Pdf.Core.Util
import Pdf.Content

import Pdf.Document.Pdf
import Pdf.Document.Types
import Pdf.Document.PageNode
import Pdf.Document.FontDict
import Pdf.Document.Internal.Types
import Pdf.Document.Internal.Util

import Data.Maybe
import qualified Data.List as List
import qualified Data.Traversable as Traversable
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import qualified Data.ByteString.Lazy as Lazy.ByteString
import Data.Text (Text)
import qualified Data.Text.Lazy as Lazy.Text
import qualified Data.Text.Lazy.Builder as Text.Builder
import Data.Map (Map)
import qualified Data.Map as Map
import qualified Data.Vector as Vector
import qualified Data.HashMap.Strict as HashMap
import Control.Monad
import Control.Monad.IO.Class
import Control.Exception hiding (throw)
import System.IO.Streams (InputStream)
import qualified System.IO.Streams as Streams
import qualified System.IO.Streams.Attoparsec as Streams

-- | Page's parent node
pageParentNode :: Page -> IO PageNode
pageParentNode :: Page -> IO PageNode
pageParentNode (Page Pdf
pdf Ref
_ Dict
dict) = do
  Ref
ref <- Either String Ref -> IO Ref
forall a. Either String a -> IO a
sure (Either String Ref -> IO Ref) -> Either String Ref -> IO Ref
forall a b. (a -> b) -> a -> b
$ (Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Parent" Dict
dict Maybe Object -> (Object -> Maybe Ref) -> Maybe Ref
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Object -> Maybe Ref
refValue)
      Maybe Ref -> String -> Either String Ref
forall a. Maybe a -> String -> Either String a
`notice` String
"Parent should be a reference"
  PageTree
node <- Pdf -> Ref -> IO PageTree
loadPageNode Pdf
pdf Ref
ref
  case PageTree
node of
    PageTreeNode PageNode
n -> PageNode -> IO PageNode
forall (m :: * -> *) a. Monad m => a -> m a
return PageNode
n
    PageTreeLeaf Page
_ -> Corrupted -> IO PageNode
forall e a. Exception e => e -> IO a
throwIO (Corrupted -> IO PageNode) -> Corrupted -> IO PageNode
forall a b. (a -> b) -> a -> b
$ String -> [String] -> Corrupted
Corrupted
      String
"page parent should be a note, but leaf found" []

-- | List of references to page's content streams
pageContents :: Page -> IO [Ref]
pageContents :: Page -> IO [Ref]
pageContents (Page Pdf
pdf Ref
pageRef Dict
dict) =
  String -> IO [Ref] -> IO [Ref]
forall a. String -> IO a -> IO a
message (String
"contents for page: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Ref -> String
forall a. Show a => a -> String
show Ref
pageRef) (IO [Ref] -> IO [Ref]) -> IO [Ref] -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ do
  case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Contents" Dict
dict of
    Maybe Object
Nothing -> [Ref] -> IO [Ref]
forall (m :: * -> *) a. Monad m => a -> m a
return []
    Just (Ref Ref
ref) -> do
      -- it could be reference to the only content stream,
      -- or to an array of content streams
      Object
o <- Pdf -> Ref -> IO Object
lookupObject Pdf
pdf Ref
ref IO Object -> (Object -> IO Object) -> IO Object
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Pdf -> Object -> IO Object
deref Pdf
pdf
      case Object
o of
        Stream Stream
_ -> [Ref] -> IO [Ref]
forall (m :: * -> *) a. Monad m => a -> m a
return [Ref
ref]
        Array Array
objs -> [Object] -> (Object -> IO Ref) -> IO [Ref]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (Array -> [Object]
forall a. Vector a -> [a]
Vector.toList Array
objs) ((Object -> IO Ref) -> IO [Ref]) -> (Object -> IO Ref) -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ \Object
obj ->
          Either String Ref -> IO Ref
forall a. Either String a -> IO a
sure (Either String Ref -> IO Ref) -> Either String Ref -> IO Ref
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Ref
refValue Object
obj Maybe Ref -> String -> Either String Ref
forall a. Maybe a -> String -> Either String a
`notice` String
"Content should be a reference"
        Object
_ -> Corrupted -> IO [Ref]
forall e a. Exception e => e -> IO a
throwIO (Corrupted -> IO [Ref]) -> Corrupted -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ String -> [String] -> Corrupted
Corrupted
          (String
"Unexpected value in page content ref: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Object -> String
forall a. Show a => a -> String
show Object
o) []
    Just (Array Array
objs) -> [Object] -> (Object -> IO Ref) -> IO [Ref]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (Array -> [Object]
forall a. Vector a -> [a]
Vector.toList Array
objs) ((Object -> IO Ref) -> IO [Ref]) -> (Object -> IO Ref) -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ \Object
obj ->
      Either String Ref -> IO Ref
forall a. Either String a -> IO a
sure (Either String Ref -> IO Ref) -> Either String Ref -> IO Ref
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Ref
refValue Object
obj Maybe Ref -> String -> Either String Ref
forall a. Maybe a -> String -> Either String a
`notice` String
"Content should be a reference"
    Maybe Object
_ -> Corrupted -> IO [Ref]
forall e a. Exception e => e -> IO a
throwIO (Corrupted -> IO [Ref]) -> Corrupted -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ String -> [String] -> Corrupted
Corrupted String
"Unexpected value in page contents" []

-- | Media box, inheritable
pageMediaBox :: Page -> IO (Rectangle Double)
pageMediaBox :: Page -> IO (Rectangle Double)
pageMediaBox Page
page = PageTree -> IO (Rectangle Double)
mediaBoxRec (Page -> PageTree
PageTreeLeaf Page
page)

mediaBoxRec :: PageTree -> IO (Rectangle Double)
mediaBoxRec :: PageTree -> IO (Rectangle Double)
mediaBoxRec PageTree
tree = do
  let (Pdf
pdf, Dict
dict) =
        case PageTree
tree of
          PageTreeNode (PageNode Pdf
p Ref
_ Dict
d) -> (Pdf
p, Dict
d)
          PageTreeLeaf (Page Pdf
p Ref
_ Dict
d) -> (Pdf
p, Dict
d)
  case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"MediaBox" Dict
dict of
    Just Object
box -> do
      Object
box' <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
box
      Array
arr <- Either String Array -> IO Array
forall a. Either String a -> IO a
sure (Either String Array -> IO Array)
-> Either String Array -> IO Array
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Array
arrayValue Object
box'
          Maybe Array -> String -> Either String Array
forall a. Maybe a -> String -> Either String a
`notice` String
"MediaBox should be an array"
      Either String (Rectangle Double) -> IO (Rectangle Double)
forall a. Either String a -> IO a
sure (Either String (Rectangle Double) -> IO (Rectangle Double))
-> Either String (Rectangle Double) -> IO (Rectangle Double)
forall a b. (a -> b) -> a -> b
$ Array -> Either String (Rectangle Double)
rectangleFromArray Array
arr
    Maybe Object
Nothing -> do
      PageTree
parent <-
        case PageTree
tree of
          PageTreeNode PageNode
node -> do
            Maybe PageNode
parent <- PageNode -> IO (Maybe PageNode)
pageNodeParent PageNode
node
            case Maybe PageNode
parent of
              Maybe PageNode
Nothing -> Corrupted -> IO PageTree
forall e a. Exception e => e -> IO a
throwIO (Corrupted -> IO PageTree) -> Corrupted -> IO PageTree
forall a b. (a -> b) -> a -> b
$ String -> [String] -> Corrupted
Corrupted String
"Media box not found" []
              Just PageNode
p -> PageTree -> IO PageTree
forall (m :: * -> *) a. Monad m => a -> m a
return (PageNode -> PageTree
PageTreeNode PageNode
p)
          PageTreeLeaf Page
page -> PageNode -> PageTree
PageTreeNode (PageNode -> PageTree) -> IO PageNode -> IO PageTree
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Page -> IO PageNode
pageParentNode Page
page
      PageTree -> IO (Rectangle Double)
mediaBoxRec PageTree
parent

-- | Font dictionaries for the page
pageFontDicts :: Page -> IO [(Name, FontDict)]
pageFontDicts :: Page -> IO [(Name, FontDict)]
pageFontDicts (Page Pdf
pdf Ref
_ Dict
dict) =
  case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Resources" Dict
dict of
    Maybe Object
Nothing -> [(Name, FontDict)] -> IO [(Name, FontDict)]
forall (m :: * -> *) a. Monad m => a -> m a
return []
    Just Object
res -> do
      Object
res' <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
res
      Dict
resDict <- Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
res'
          Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"Resources should be a dictionary"
      case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Font" Dict
resDict of
        Maybe Object
Nothing -> [(Name, FontDict)] -> IO [(Name, FontDict)]
forall (m :: * -> *) a. Monad m => a -> m a
return []
        Just Object
fonts -> do
          Object
fonts' <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
fonts
          Dict
fontsDict <- Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
fonts'
              Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"Font should be a dictionary"
          [(Name, Object)]
-> ((Name, Object) -> IO (Name, FontDict)) -> IO [(Name, FontDict)]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (Dict -> [(Name, Object)]
forall k v. HashMap k v -> [(k, v)]
HashMap.toList Dict
fontsDict) (((Name, Object) -> IO (Name, FontDict)) -> IO [(Name, FontDict)])
-> ((Name, Object) -> IO (Name, FontDict)) -> IO [(Name, FontDict)]
forall a b. (a -> b) -> a -> b
$ \(Name
name, Object
font) -> do
            Object
font' <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
font
            Dict
fontDict <- Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
font'
                Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"Each font should be a dictionary"
            Name -> Dict -> IO ()
ensureType Name
"Font" Dict
fontDict
            (Name, FontDict) -> IO (Name, FontDict)
forall (m :: * -> *) a. Monad m => a -> m a
return (Name
name, Pdf -> Dict -> FontDict
FontDict Pdf
pdf Dict
fontDict)

data XObject = XObject
  { XObject -> ByteString
xobjectContent :: Lazy.ByteString
  , XObject -> GlyphDecoder
xobjectGlyphDecoder :: GlyphDecoder
  , XObject -> Map Name XObject
xobjectChildren :: Map Name XObject
  }

instance Show XObject where
  show :: XObject -> String
show XObject
xobj = (ByteString, Map Name XObject) -> String
forall a. Show a => a -> String
show (XObject -> ByteString
xobjectContent XObject
xobj, XObject -> Map Name XObject
xobjectChildren XObject
xobj)

pageXObjects :: Page -> IO (Map Name XObject)
pageXObjects :: Page -> IO (Map Name XObject)
pageXObjects (Page Pdf
pdf Ref
_ Dict
dict) = Pdf -> Dict -> IO (Map Name XObject)
dictXObjects Pdf
pdf Dict
dict

dictXObjects :: Pdf -> Dict -> IO (Map Name XObject)
dictXObjects :: Pdf -> Dict -> IO (Map Name XObject)
dictXObjects Pdf
pdf Dict
dict =
  case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Resources" Dict
dict of
    Maybe Object
Nothing -> Map Name XObject -> IO (Map Name XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return Map Name XObject
forall k a. Map k a
Map.empty
    Just Object
res -> do
      Dict
resDict <- do
        Object
v <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
res
        Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
v
          Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"Resources should be a dict"

      case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"XObject" Dict
resDict of
        Maybe Object
Nothing -> Map Name XObject -> IO (Map Name XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return Map Name XObject
forall k a. Map k a
Map.empty
        Just Object
xo -> do
          Dict
xosDict <- do
            Object
v <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
xo
            Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
v
              Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"XObject should be a dict"
          [(Name, Maybe XObject)]
result <- [(Name, Object)]
-> ((Name, Object) -> IO (Name, Maybe XObject))
-> IO [(Name, Maybe XObject)]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (Dict -> [(Name, Object)]
forall k v. HashMap k v -> [(k, v)]
HashMap.toList Dict
xosDict) (((Name, Object) -> IO (Name, Maybe XObject))
 -> IO [(Name, Maybe XObject)])
-> ((Name, Object) -> IO (Name, Maybe XObject))
-> IO [(Name, Maybe XObject)]
forall a b. (a -> b) -> a -> b
$ \(Name
name, Object
o) -> do
            Ref
ref <- Either String Ref -> IO Ref
forall a. Either String a -> IO a
sure (Either String Ref -> IO Ref) -> Either String Ref -> IO Ref
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Ref
refValue Object
o
              Maybe Ref -> String -> Either String Ref
forall a. Maybe a -> String -> Either String a
`notice` String
"Not a ref"
            s :: Stream
s@(S Dict
xoDict Int64
_) <- do
              Object
v <- Pdf -> Ref -> IO Object
lookupObject Pdf
pdf Ref
ref
              Either String Stream -> IO Stream
forall a. Either String a -> IO a
sure (Either String Stream -> IO Stream)
-> Either String Stream -> IO Stream
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Stream
streamValue Object
v
                Maybe Stream -> String -> Either String Stream
forall a. Maybe a -> String -> Either String a
`notice` String
"Not a stream"

            case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Subtype" Dict
xoDict of
              Just (Name Name
"Form") -> do
                InputStream ByteString
is <- Pdf -> Ref -> Stream -> IO (InputStream ByteString)
streamContent Pdf
pdf Ref
ref Stream
s
                ByteString
cont <- [ByteString] -> ByteString
Lazy.ByteString.fromChunks ([ByteString] -> ByteString) -> IO [ByteString] -> IO ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> InputStream ByteString -> IO [ByteString]
forall a. InputStream a -> IO [a]
Streams.toList InputStream ByteString
is

                Map Name FontDict
fontDicts <- [(Name, FontDict)] -> Map Name FontDict
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Name, FontDict)] -> Map Name FontDict)
-> IO [(Name, FontDict)] -> IO (Map Name FontDict)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
                  Page -> IO [(Name, FontDict)]
pageFontDicts (Pdf -> Ref -> Dict -> Page
Page Pdf
pdf Ref
ref Dict
xoDict)

                Map Name (ByteString -> [(Glyph, Double)])
glyphDecoders <- Map Name FontDict
-> (FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)]))
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
Traversable.forM Map Name FontDict
fontDicts ((FontDict -> IO (ByteString -> [(Glyph, Double)]))
 -> IO (Map Name (ByteString -> [(Glyph, Double)])))
-> (FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)]))
forall a b. (a -> b) -> a -> b
$ \FontDict
fontDict ->
                  FontInfo -> ByteString -> [(Glyph, Double)]
fontInfoDecodeGlyphs (FontInfo -> ByteString -> [(Glyph, Double)])
-> IO FontInfo -> IO (ByteString -> [(Glyph, Double)])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FontDict -> IO FontInfo
fontDictLoadInfo FontDict
fontDict
                let glyphDecoder :: GlyphDecoder
glyphDecoder Name
fontName = \ByteString
str ->
                      case Name
-> Map Name (ByteString -> [(Glyph, Double)])
-> Maybe (ByteString -> [(Glyph, Double)])
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Name
fontName Map Name (ByteString -> [(Glyph, Double)])
glyphDecoders of
                        Maybe (ByteString -> [(Glyph, Double)])
Nothing -> []
                        Just ByteString -> [(Glyph, Double)]
decode -> ByteString -> [(Glyph, Double)]
decode ByteString
str

                Map Name XObject
children <- Pdf -> Dict -> IO (Map Name XObject)
dictXObjects Pdf
pdf Dict
xoDict

                let xobj :: XObject
xobj = XObject :: ByteString -> GlyphDecoder -> Map Name XObject -> XObject
XObject
                      { xobjectContent :: ByteString
xobjectContent = ByteString
cont
                      , xobjectGlyphDecoder :: GlyphDecoder
xobjectGlyphDecoder = GlyphDecoder
glyphDecoder
                      , xobjectChildren :: Map Name XObject
xobjectChildren = Map Name XObject
children
                      }
                (Name, Maybe XObject) -> IO (Name, Maybe XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return (Name
name, XObject -> Maybe XObject
forall a. a -> Maybe a
Just XObject
xobj)

              Maybe Object
_ -> (Name, Maybe XObject) -> IO (Name, Maybe XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return (Name
name, Maybe XObject
forall a. Maybe a
Nothing)

          Map Name XObject -> IO (Map Name XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return (Map Name XObject -> IO (Map Name XObject))
-> Map Name XObject -> IO (Map Name XObject)
forall a b. (a -> b) -> a -> b
$ [(Name, XObject)] -> Map Name XObject
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Name, XObject)] -> Map Name XObject)
-> [(Name, XObject)] -> Map Name XObject
forall a b. (a -> b) -> a -> b
$ (((Name, Maybe XObject) -> Maybe (Name, XObject))
 -> [(Name, Maybe XObject)] -> [(Name, XObject)])
-> [(Name, Maybe XObject)]
-> ((Name, Maybe XObject) -> Maybe (Name, XObject))
-> [(Name, XObject)]
forall a b c. (a -> b -> c) -> b -> a -> c
flip ((Name, Maybe XObject) -> Maybe (Name, XObject))
-> [(Name, Maybe XObject)] -> [(Name, XObject)]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe [(Name, Maybe XObject)]
result (((Name, Maybe XObject) -> Maybe (Name, XObject))
 -> [(Name, XObject)])
-> ((Name, Maybe XObject) -> Maybe (Name, XObject))
-> [(Name, XObject)]
forall a b. (a -> b) -> a -> b
$ \(Name
n, Maybe XObject
mo) -> do
            XObject
o <- Maybe XObject
mo
            (Name, XObject) -> Maybe (Name, XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return (Name
n, XObject
o)

-- | Extract text from the page
--
-- It tries to add spaces between chars if they don't present
-- as actual characters in content stream.
pageExtractText :: Page -> IO Text
pageExtractText :: Page -> IO Text
pageExtractText Page
page = [Span] -> Text
glyphsToText ([Span] -> Text) -> IO [Span] -> IO Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Page -> IO [Span]
pageExtractGlyphs Page
page

pageExtractGlyphs :: Page -> IO [Span]
pageExtractGlyphs :: Page -> IO [Span]
pageExtractGlyphs Page
page = do
  Map Name FontDict
fontDicts <- [(Name, FontDict)] -> Map Name FontDict
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Name, FontDict)] -> Map Name FontDict)
-> IO [(Name, FontDict)] -> IO (Map Name FontDict)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Page -> IO [(Name, FontDict)]
pageFontDicts Page
page
  Map Name (ByteString -> [(Glyph, Double)])
glyphDecoders <- Map Name FontDict
-> (FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)]))
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
Traversable.forM Map Name FontDict
fontDicts ((FontDict -> IO (ByteString -> [(Glyph, Double)]))
 -> IO (Map Name (ByteString -> [(Glyph, Double)])))
-> (FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)]))
forall a b. (a -> b) -> a -> b
$ \FontDict
fontDict ->
    FontInfo -> ByteString -> [(Glyph, Double)]
fontInfoDecodeGlyphs (FontInfo -> ByteString -> [(Glyph, Double)])
-> IO FontInfo -> IO (ByteString -> [(Glyph, Double)])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FontDict -> IO FontInfo
fontDictLoadInfo FontDict
fontDict
  let glyphDecoder :: GlyphDecoder
glyphDecoder Name
fontName = \ByteString
str ->
        case Name
-> Map Name (ByteString -> [(Glyph, Double)])
-> Maybe (ByteString -> [(Glyph, Double)])
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Name
fontName Map Name (ByteString -> [(Glyph, Double)])
glyphDecoders of
          Maybe (ByteString -> [(Glyph, Double)])
Nothing -> []
          Just ByteString -> [(Glyph, Double)]
decode -> ByteString -> [(Glyph, Double)]
decode ByteString
str

  Map Name XObject
xobjects <- Page -> IO (Map Name XObject)
pageXObjects Page
page

  InputStream Expr
is <- do
    [Ref]
contents <- Page -> IO [Ref]
pageContents Page
page
    let Page Pdf
pdf Ref
_ Dict
_ = Page
page
    InputStream ByteString
is <- Pdf -> [Ref] -> IO (InputStream ByteString)
combinedContent Pdf
pdf [Ref]
contents
    Parser (Maybe Expr)
-> InputStream ByteString -> IO (InputStream Expr)
forall r.
Parser (Maybe r) -> InputStream ByteString -> IO (InputStream r)
Streams.parserToInputStream Parser (Maybe Expr)
parseContent InputStream ByteString
is

  -- use content stream processor to extract text
  let loop :: Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop Map Name XObject
xobjs InputStream Expr
s Processor
p = do
        Maybe Operator
next <- InputStream Expr -> IO (Maybe Operator)
readNextOperator InputStream Expr
s
        case Maybe Operator
next of
          Just (Op
Op_Do, [Name Name
name]) -> Map Name XObject -> Name -> Processor -> IO Processor
processDo Map Name XObject
xobjs Name
name Processor
p IO Processor -> (Processor -> IO Processor) -> IO Processor
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop Map Name XObject
xobjs InputStream Expr
s
          Just Operator
op ->
            case Operator -> Processor -> Either String Processor
processOp Operator
op Processor
p of
              Left String
err -> Unexpected -> IO Processor
forall e a. Exception e => e -> IO a
throwIO (String -> [String] -> Unexpected
Unexpected String
err [])
              Right  Processor
p' -> Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop Map Name XObject
xobjs InputStream Expr
s Processor
p'
          Maybe Operator
Nothing -> Processor -> IO Processor
forall (m :: * -> *) a. Monad m => a -> m a
return Processor
p

      processDo :: Map Name XObject -> Name -> Processor -> IO Processor
processDo Map Name XObject
xobjs Name
name Processor
p = do
        case Name -> Map Name XObject -> Maybe XObject
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Name
name Map Name XObject
xobjs of
          Maybe XObject
Nothing -> Processor -> IO Processor
forall (m :: * -> *) a. Monad m => a -> m a
return Processor
p
          Just XObject
xobj -> do
            InputStream Expr
s <- do
              InputStream ByteString
s <- ByteString -> IO (InputStream ByteString)
Streams.fromLazyByteString (XObject -> ByteString
xobjectContent XObject
xobj)
              Parser (Maybe Expr)
-> InputStream ByteString -> IO (InputStream Expr)
forall r.
Parser (Maybe r) -> InputStream ByteString -> IO (InputStream r)
Streams.parserToInputStream Parser (Maybe Expr)
parseContent InputStream ByteString
s

            let gdec' :: GlyphDecoder
gdec' = Processor -> GlyphDecoder
prGlyphDecoder Processor
p
            Processor
p' <- Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop (XObject -> Map Name XObject
xobjectChildren XObject
xobj) InputStream Expr
s
              (Processor
p {prGlyphDecoder :: GlyphDecoder
prGlyphDecoder = XObject -> GlyphDecoder
xobjectGlyphDecoder XObject
xobj})
            Processor -> IO Processor
forall (m :: * -> *) a. Monad m => a -> m a
return (Processor
p' {prGlyphDecoder :: GlyphDecoder
prGlyphDecoder = GlyphDecoder
gdec'})

  Processor
p <- Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop Map Name XObject
xobjects InputStream Expr
is (Processor -> IO Processor) -> Processor -> IO Processor
forall a b. (a -> b) -> a -> b
$ Processor
mkProcessor {
    prGlyphDecoder :: GlyphDecoder
prGlyphDecoder = GlyphDecoder
glyphDecoder
    }
  [Span] -> IO [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return ([Span] -> [Span]
forall a. [a] -> [a]
List.reverse (Processor -> [Span]
prSpans Processor
p))

combinedContent :: Pdf -> [Ref] -> IO (InputStream ByteString)
combinedContent :: Pdf -> [Ref] -> IO (InputStream ByteString)
combinedContent Pdf
pdf [Ref]
refs = do
  [(Ref, Stream)]
allStreams <- [Ref] -> (Ref -> IO (Ref, Stream)) -> IO [(Ref, Stream)]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [Ref]
refs ((Ref -> IO (Ref, Stream)) -> IO [(Ref, Stream)])
-> (Ref -> IO (Ref, Stream)) -> IO [(Ref, Stream)]
forall a b. (a -> b) -> a -> b
$ \Ref
ref -> do
    Object
o <- Pdf -> Ref -> IO Object
lookupObject Pdf
pdf Ref
ref
    case Object
o of
      Stream Stream
s -> (Ref, Stream) -> IO (Ref, Stream)
forall (m :: * -> *) a. Monad m => a -> m a
return (Ref
ref, Stream
s)
      Object
_ -> Corrupted -> IO (Ref, Stream)
forall e a. Exception e => e -> IO a
throwIO (String -> [String] -> Corrupted
Corrupted String
"Page content is not a stream" [])

  Generator ByteString () -> IO (InputStream ByteString)
forall r a. Generator r a -> IO (InputStream r)
Streams.fromGenerator (Generator ByteString () -> IO (InputStream ByteString))
-> Generator ByteString () -> IO (InputStream ByteString)
forall a b. (a -> b) -> a -> b
$ [(Ref, Stream)]
-> ((Ref, Stream) -> Generator ByteString ())
-> Generator ByteString ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [(Ref, Stream)]
allStreams (((Ref, Stream) -> Generator ByteString ())
 -> Generator ByteString ())
-> ((Ref, Stream) -> Generator ByteString ())
-> Generator ByteString ()
forall a b. (a -> b) -> a -> b
$ \(Ref
ref, Stream
stream) -> do
    InputStream ByteString
is <- IO (InputStream ByteString)
-> Generator ByteString (InputStream ByteString)
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO (InputStream ByteString)
 -> Generator ByteString (InputStream ByteString))
-> IO (InputStream ByteString)
-> Generator ByteString (InputStream ByteString)
forall a b. (a -> b) -> a -> b
$ Pdf -> Ref -> Stream -> IO (InputStream ByteString)
streamContent Pdf
pdf Ref
ref Stream
stream
    InputStream ByteString -> Generator ByteString ()
forall r. InputStream r -> Generator r ()
yield InputStream ByteString
is
  where
  yield :: InputStream r -> Generator r ()
yield InputStream r
is = do
    Maybe r
chunk <- IO (Maybe r) -> Generator r (Maybe r)
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO (Maybe r) -> Generator r (Maybe r))
-> IO (Maybe r) -> Generator r (Maybe r)
forall a b. (a -> b) -> a -> b
$ InputStream r -> IO (Maybe r)
forall a. InputStream a -> IO (Maybe a)
Streams.read InputStream r
is
    case Maybe r
chunk of
      Maybe r
Nothing -> () -> Generator r ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
      Just r
c -> do
        r -> Generator r ()
forall r. r -> Generator r ()
Streams.yield r
c
        InputStream r -> Generator r ()
yield InputStream r
is

-- | Convert glyphs to text, trying to add spaces and newlines
--
-- It takes list of spans. Each span is a list of glyphs that are outputed in one shot.
-- So we don't need to add space inside span, only between them.
glyphsToText :: [Span] -> Text
glyphsToText :: [Span] -> Text
glyphsToText
  = Text -> Text
Lazy.Text.toStrict
  (Text -> Text) -> ([Span] -> Text) -> [Span] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> Text
Text.Builder.toLazyText
  (Builder -> Text) -> ([Span] -> Builder) -> [Span] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Vector Double, Bool), Builder) -> Builder
forall a b. (a, b) -> b
snd
  (((Vector Double, Bool), Builder) -> Builder)
-> ([Span] -> ((Vector Double, Bool), Builder))
-> [Span]
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Vector Double, Bool), Builder)
 -> [Glyph] -> ((Vector Double, Bool), Builder))
-> ((Vector Double, Bool), Builder)
-> [[Glyph]]
-> ((Vector Double, Bool), Builder)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl ((Vector Double, Bool), Builder)
-> [Glyph] -> ((Vector Double, Bool), Builder)
step ((Double -> Double -> Vector Double
forall a. a -> a -> Vector a
Vector Double
0 Double
0, Bool
False), Builder
forall a. Monoid a => a
mempty)
  ([[Glyph]] -> ((Vector Double, Bool), Builder))
-> ([Span] -> [[Glyph]])
-> [Span]
-> ((Vector Double, Bool), Builder)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Span -> [Glyph]) -> [Span] -> [[Glyph]]
forall a b. (a -> b) -> [a] -> [b]
List.map Span -> [Glyph]
spGlyphs
  where
  step :: ((Vector Double, Bool), Builder)
-> [Glyph] -> ((Vector Double, Bool), Builder)
step ((Vector Double, Bool), Builder)
acc [] = ((Vector Double, Bool), Builder)
acc
  step ((Vector Double
lx2 Double
ly2, Bool
wasSpace), Builder
res) [Glyph]
sp =
    let Vector Double
x1 Double
y1 = Glyph -> Vector Double
glyphTopLeft ([Glyph] -> Glyph
forall a. [a] -> a
head [Glyph]
sp)
        Vector Double
x2 Double
_ = Glyph -> Vector Double
glyphBottomRight ([Glyph] -> Glyph
forall a. [a] -> a
last [Glyph]
sp)
        Vector Double
_ Double
y2 = Glyph -> Vector Double
glyphTopLeft ([Glyph] -> Glyph
forall a. [a] -> a
last [Glyph]
sp)
        space :: Builder
space =
          if Double -> Double
forall a. Num a => a -> a
abs (Double
ly2 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
y1) Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
1.8
            then  if Bool
wasSpace Bool -> Bool -> Bool
|| Double -> Double
forall a. Num a => a -> a
abs (Double
lx2 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
x1) Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
1.8
                    then Builder
forall a. Monoid a => a
mempty
                    else Char -> Builder
Text.Builder.singleton Char
' '
            else Char -> Builder
Text.Builder.singleton Char
'\n'
        txt :: Builder
txt = Text -> Builder
Text.Builder.fromLazyText (Text -> Builder) -> Text -> Builder
forall a b. (a -> b) -> a -> b
$ [Text] -> Text
Lazy.Text.fromChunks ([Text] -> Text) -> [Text] -> Text
forall a b. (a -> b) -> a -> b
$ (Glyph -> Maybe Text) -> [Glyph] -> [Text]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe Glyph -> Maybe Text
glyphText [Glyph]
sp
        endWithSpace :: Bool
endWithSpace = Glyph -> Maybe Text
glyphText ([Glyph] -> Glyph
forall a. [a] -> a
last [Glyph]
sp) Maybe Text -> Maybe Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> Maybe Text
forall a. a -> Maybe a
Just Text
" "
    in ((Double -> Double -> Vector Double
forall a. a -> a -> Vector a
Vector Double
x2 Double
y2, Bool
endWithSpace), [Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat [Builder
res, Builder
space, Builder
txt])