{-# LANGUAGE OverloadedStrings #-}
module Pdf.Document.Page
(
Page,
pageParentNode,
pageContents,
pageMediaBox,
pageFontDicts,
pageExtractText,
pageExtractGlyphs,
glyphsToText
)
where
import Pdf.Core.Object
import Pdf.Core.Object.Util
import Pdf.Core.Exception
import Pdf.Core.Util
import Pdf.Content
import Pdf.Document.Pdf
import Pdf.Document.Types
import Pdf.Document.PageNode
import Pdf.Document.FontDict
import Pdf.Document.Internal.Types
import Pdf.Document.Internal.Util
import Data.Maybe
import qualified Data.List as List
import qualified Data.Traversable as Traversable
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import qualified Data.ByteString.Lazy as Lazy.ByteString
import Data.Text (Text)
import qualified Data.Text.Lazy as Lazy.Text
import qualified Data.Text.Lazy.Builder as Text.Builder
import Data.Map (Map)
import qualified Data.Map as Map
import qualified Data.Vector as Vector
import qualified Data.HashMap.Strict as HashMap
import Control.Monad
import Control.Monad.IO.Class
import Control.Exception hiding (throw)
import System.IO.Streams (InputStream)
import qualified System.IO.Streams as Streams
import qualified System.IO.Streams.Attoparsec as Streams
pageParentNode :: Page -> IO PageNode
pageParentNode :: Page -> IO PageNode
pageParentNode (Page Pdf
pdf Ref
_ Dict
dict) = do
Ref
ref <- Either String Ref -> IO Ref
forall a. Either String a -> IO a
sure (Either String Ref -> IO Ref) -> Either String Ref -> IO Ref
forall a b. (a -> b) -> a -> b
$ (Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Parent" Dict
dict Maybe Object -> (Object -> Maybe Ref) -> Maybe Ref
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Object -> Maybe Ref
refValue)
Maybe Ref -> String -> Either String Ref
forall a. Maybe a -> String -> Either String a
`notice` String
"Parent should be a reference"
PageTree
node <- Pdf -> Ref -> IO PageTree
loadPageNode Pdf
pdf Ref
ref
case PageTree
node of
PageTreeNode PageNode
n -> PageNode -> IO PageNode
forall (m :: * -> *) a. Monad m => a -> m a
return PageNode
n
PageTreeLeaf Page
_ -> Corrupted -> IO PageNode
forall e a. Exception e => e -> IO a
throwIO (Corrupted -> IO PageNode) -> Corrupted -> IO PageNode
forall a b. (a -> b) -> a -> b
$ String -> [String] -> Corrupted
Corrupted
String
"page parent should be a note, but leaf found" []
pageContents :: Page -> IO [Ref]
pageContents :: Page -> IO [Ref]
pageContents (Page Pdf
pdf Ref
pageRef Dict
dict) =
String -> IO [Ref] -> IO [Ref]
forall a. String -> IO a -> IO a
message (String
"contents for page: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Ref -> String
forall a. Show a => a -> String
show Ref
pageRef) (IO [Ref] -> IO [Ref]) -> IO [Ref] -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ do
case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Contents" Dict
dict of
Maybe Object
Nothing -> [Ref] -> IO [Ref]
forall (m :: * -> *) a. Monad m => a -> m a
return []
Just (Ref Ref
ref) -> do
Object
o <- Pdf -> Ref -> IO Object
lookupObject Pdf
pdf Ref
ref IO Object -> (Object -> IO Object) -> IO Object
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Pdf -> Object -> IO Object
deref Pdf
pdf
case Object
o of
Stream Stream
_ -> [Ref] -> IO [Ref]
forall (m :: * -> *) a. Monad m => a -> m a
return [Ref
ref]
Array Array
objs -> [Object] -> (Object -> IO Ref) -> IO [Ref]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (Array -> [Object]
forall a. Vector a -> [a]
Vector.toList Array
objs) ((Object -> IO Ref) -> IO [Ref]) -> (Object -> IO Ref) -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ \Object
obj ->
Either String Ref -> IO Ref
forall a. Either String a -> IO a
sure (Either String Ref -> IO Ref) -> Either String Ref -> IO Ref
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Ref
refValue Object
obj Maybe Ref -> String -> Either String Ref
forall a. Maybe a -> String -> Either String a
`notice` String
"Content should be a reference"
Object
_ -> Corrupted -> IO [Ref]
forall e a. Exception e => e -> IO a
throwIO (Corrupted -> IO [Ref]) -> Corrupted -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ String -> [String] -> Corrupted
Corrupted
(String
"Unexpected value in page content ref: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Object -> String
forall a. Show a => a -> String
show Object
o) []
Just (Array Array
objs) -> [Object] -> (Object -> IO Ref) -> IO [Ref]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (Array -> [Object]
forall a. Vector a -> [a]
Vector.toList Array
objs) ((Object -> IO Ref) -> IO [Ref]) -> (Object -> IO Ref) -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ \Object
obj ->
Either String Ref -> IO Ref
forall a. Either String a -> IO a
sure (Either String Ref -> IO Ref) -> Either String Ref -> IO Ref
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Ref
refValue Object
obj Maybe Ref -> String -> Either String Ref
forall a. Maybe a -> String -> Either String a
`notice` String
"Content should be a reference"
Maybe Object
_ -> Corrupted -> IO [Ref]
forall e a. Exception e => e -> IO a
throwIO (Corrupted -> IO [Ref]) -> Corrupted -> IO [Ref]
forall a b. (a -> b) -> a -> b
$ String -> [String] -> Corrupted
Corrupted String
"Unexpected value in page contents" []
pageMediaBox :: Page -> IO (Rectangle Double)
pageMediaBox :: Page -> IO (Rectangle Double)
pageMediaBox Page
page = PageTree -> IO (Rectangle Double)
mediaBoxRec (Page -> PageTree
PageTreeLeaf Page
page)
mediaBoxRec :: PageTree -> IO (Rectangle Double)
mediaBoxRec :: PageTree -> IO (Rectangle Double)
mediaBoxRec PageTree
tree = do
let (Pdf
pdf, Dict
dict) =
case PageTree
tree of
PageTreeNode (PageNode Pdf
p Ref
_ Dict
d) -> (Pdf
p, Dict
d)
PageTreeLeaf (Page Pdf
p Ref
_ Dict
d) -> (Pdf
p, Dict
d)
case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"MediaBox" Dict
dict of
Just Object
box -> do
Object
box' <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
box
Array
arr <- Either String Array -> IO Array
forall a. Either String a -> IO a
sure (Either String Array -> IO Array)
-> Either String Array -> IO Array
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Array
arrayValue Object
box'
Maybe Array -> String -> Either String Array
forall a. Maybe a -> String -> Either String a
`notice` String
"MediaBox should be an array"
Either String (Rectangle Double) -> IO (Rectangle Double)
forall a. Either String a -> IO a
sure (Either String (Rectangle Double) -> IO (Rectangle Double))
-> Either String (Rectangle Double) -> IO (Rectangle Double)
forall a b. (a -> b) -> a -> b
$ Array -> Either String (Rectangle Double)
rectangleFromArray Array
arr
Maybe Object
Nothing -> do
PageTree
parent <-
case PageTree
tree of
PageTreeNode PageNode
node -> do
Maybe PageNode
parent <- PageNode -> IO (Maybe PageNode)
pageNodeParent PageNode
node
case Maybe PageNode
parent of
Maybe PageNode
Nothing -> Corrupted -> IO PageTree
forall e a. Exception e => e -> IO a
throwIO (Corrupted -> IO PageTree) -> Corrupted -> IO PageTree
forall a b. (a -> b) -> a -> b
$ String -> [String] -> Corrupted
Corrupted String
"Media box not found" []
Just PageNode
p -> PageTree -> IO PageTree
forall (m :: * -> *) a. Monad m => a -> m a
return (PageNode -> PageTree
PageTreeNode PageNode
p)
PageTreeLeaf Page
page -> PageNode -> PageTree
PageTreeNode (PageNode -> PageTree) -> IO PageNode -> IO PageTree
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Page -> IO PageNode
pageParentNode Page
page
PageTree -> IO (Rectangle Double)
mediaBoxRec PageTree
parent
pageFontDicts :: Page -> IO [(Name, FontDict)]
pageFontDicts :: Page -> IO [(Name, FontDict)]
pageFontDicts (Page Pdf
pdf Ref
_ Dict
dict) =
case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Resources" Dict
dict of
Maybe Object
Nothing -> [(Name, FontDict)] -> IO [(Name, FontDict)]
forall (m :: * -> *) a. Monad m => a -> m a
return []
Just Object
res -> do
Object
res' <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
res
Dict
resDict <- Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
res'
Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"Resources should be a dictionary"
case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Font" Dict
resDict of
Maybe Object
Nothing -> [(Name, FontDict)] -> IO [(Name, FontDict)]
forall (m :: * -> *) a. Monad m => a -> m a
return []
Just Object
fonts -> do
Object
fonts' <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
fonts
Dict
fontsDict <- Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
fonts'
Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"Font should be a dictionary"
[(Name, Object)]
-> ((Name, Object) -> IO (Name, FontDict)) -> IO [(Name, FontDict)]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (Dict -> [(Name, Object)]
forall k v. HashMap k v -> [(k, v)]
HashMap.toList Dict
fontsDict) (((Name, Object) -> IO (Name, FontDict)) -> IO [(Name, FontDict)])
-> ((Name, Object) -> IO (Name, FontDict)) -> IO [(Name, FontDict)]
forall a b. (a -> b) -> a -> b
$ \(Name
name, Object
font) -> do
Object
font' <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
font
Dict
fontDict <- Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
font'
Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"Each font should be a dictionary"
Name -> Dict -> IO ()
ensureType Name
"Font" Dict
fontDict
(Name, FontDict) -> IO (Name, FontDict)
forall (m :: * -> *) a. Monad m => a -> m a
return (Name
name, Pdf -> Dict -> FontDict
FontDict Pdf
pdf Dict
fontDict)
data XObject = XObject
{ XObject -> ByteString
xobjectContent :: Lazy.ByteString
, XObject -> GlyphDecoder
xobjectGlyphDecoder :: GlyphDecoder
, XObject -> Map Name XObject
xobjectChildren :: Map Name XObject
}
instance Show XObject where
show :: XObject -> String
show XObject
xobj = (ByteString, Map Name XObject) -> String
forall a. Show a => a -> String
show (XObject -> ByteString
xobjectContent XObject
xobj, XObject -> Map Name XObject
xobjectChildren XObject
xobj)
pageXObjects :: Page -> IO (Map Name XObject)
pageXObjects :: Page -> IO (Map Name XObject)
pageXObjects (Page Pdf
pdf Ref
_ Dict
dict) = Pdf -> Dict -> IO (Map Name XObject)
dictXObjects Pdf
pdf Dict
dict
dictXObjects :: Pdf -> Dict -> IO (Map Name XObject)
dictXObjects :: Pdf -> Dict -> IO (Map Name XObject)
dictXObjects Pdf
pdf Dict
dict =
case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Resources" Dict
dict of
Maybe Object
Nothing -> Map Name XObject -> IO (Map Name XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return Map Name XObject
forall k a. Map k a
Map.empty
Just Object
res -> do
Dict
resDict <- do
Object
v <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
res
Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
v
Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"Resources should be a dict"
case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"XObject" Dict
resDict of
Maybe Object
Nothing -> Map Name XObject -> IO (Map Name XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return Map Name XObject
forall k a. Map k a
Map.empty
Just Object
xo -> do
Dict
xosDict <- do
Object
v <- Pdf -> Object -> IO Object
deref Pdf
pdf Object
xo
Either String Dict -> IO Dict
forall a. Either String a -> IO a
sure (Either String Dict -> IO Dict) -> Either String Dict -> IO Dict
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Dict
dictValue Object
v
Maybe Dict -> String -> Either String Dict
forall a. Maybe a -> String -> Either String a
`notice` String
"XObject should be a dict"
[(Name, Maybe XObject)]
result <- [(Name, Object)]
-> ((Name, Object) -> IO (Name, Maybe XObject))
-> IO [(Name, Maybe XObject)]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (Dict -> [(Name, Object)]
forall k v. HashMap k v -> [(k, v)]
HashMap.toList Dict
xosDict) (((Name, Object) -> IO (Name, Maybe XObject))
-> IO [(Name, Maybe XObject)])
-> ((Name, Object) -> IO (Name, Maybe XObject))
-> IO [(Name, Maybe XObject)]
forall a b. (a -> b) -> a -> b
$ \(Name
name, Object
o) -> do
Ref
ref <- Either String Ref -> IO Ref
forall a. Either String a -> IO a
sure (Either String Ref -> IO Ref) -> Either String Ref -> IO Ref
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Ref
refValue Object
o
Maybe Ref -> String -> Either String Ref
forall a. Maybe a -> String -> Either String a
`notice` String
"Not a ref"
s :: Stream
s@(S Dict
xoDict Int64
_) <- do
Object
v <- Pdf -> Ref -> IO Object
lookupObject Pdf
pdf Ref
ref
Either String Stream -> IO Stream
forall a. Either String a -> IO a
sure (Either String Stream -> IO Stream)
-> Either String Stream -> IO Stream
forall a b. (a -> b) -> a -> b
$ Object -> Maybe Stream
streamValue Object
v
Maybe Stream -> String -> Either String Stream
forall a. Maybe a -> String -> Either String a
`notice` String
"Not a stream"
case Name -> Dict -> Maybe Object
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Name
"Subtype" Dict
xoDict of
Just (Name Name
"Form") -> do
InputStream ByteString
is <- Pdf -> Ref -> Stream -> IO (InputStream ByteString)
streamContent Pdf
pdf Ref
ref Stream
s
ByteString
cont <- [ByteString] -> ByteString
Lazy.ByteString.fromChunks ([ByteString] -> ByteString) -> IO [ByteString] -> IO ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> InputStream ByteString -> IO [ByteString]
forall a. InputStream a -> IO [a]
Streams.toList InputStream ByteString
is
Map Name FontDict
fontDicts <- [(Name, FontDict)] -> Map Name FontDict
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Name, FontDict)] -> Map Name FontDict)
-> IO [(Name, FontDict)] -> IO (Map Name FontDict)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
Page -> IO [(Name, FontDict)]
pageFontDicts (Pdf -> Ref -> Dict -> Page
Page Pdf
pdf Ref
ref Dict
xoDict)
Map Name (ByteString -> [(Glyph, Double)])
glyphDecoders <- Map Name FontDict
-> (FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)]))
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
Traversable.forM Map Name FontDict
fontDicts ((FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)])))
-> (FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)]))
forall a b. (a -> b) -> a -> b
$ \FontDict
fontDict ->
FontInfo -> ByteString -> [(Glyph, Double)]
fontInfoDecodeGlyphs (FontInfo -> ByteString -> [(Glyph, Double)])
-> IO FontInfo -> IO (ByteString -> [(Glyph, Double)])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FontDict -> IO FontInfo
fontDictLoadInfo FontDict
fontDict
let glyphDecoder :: GlyphDecoder
glyphDecoder Name
fontName = \ByteString
str ->
case Name
-> Map Name (ByteString -> [(Glyph, Double)])
-> Maybe (ByteString -> [(Glyph, Double)])
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Name
fontName Map Name (ByteString -> [(Glyph, Double)])
glyphDecoders of
Maybe (ByteString -> [(Glyph, Double)])
Nothing -> []
Just ByteString -> [(Glyph, Double)]
decode -> ByteString -> [(Glyph, Double)]
decode ByteString
str
Map Name XObject
children <- Pdf -> Dict -> IO (Map Name XObject)
dictXObjects Pdf
pdf Dict
xoDict
let xobj :: XObject
xobj = XObject :: ByteString -> GlyphDecoder -> Map Name XObject -> XObject
XObject
{ xobjectContent :: ByteString
xobjectContent = ByteString
cont
, xobjectGlyphDecoder :: GlyphDecoder
xobjectGlyphDecoder = GlyphDecoder
glyphDecoder
, xobjectChildren :: Map Name XObject
xobjectChildren = Map Name XObject
children
}
(Name, Maybe XObject) -> IO (Name, Maybe XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return (Name
name, XObject -> Maybe XObject
forall a. a -> Maybe a
Just XObject
xobj)
Maybe Object
_ -> (Name, Maybe XObject) -> IO (Name, Maybe XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return (Name
name, Maybe XObject
forall a. Maybe a
Nothing)
Map Name XObject -> IO (Map Name XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return (Map Name XObject -> IO (Map Name XObject))
-> Map Name XObject -> IO (Map Name XObject)
forall a b. (a -> b) -> a -> b
$ [(Name, XObject)] -> Map Name XObject
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Name, XObject)] -> Map Name XObject)
-> [(Name, XObject)] -> Map Name XObject
forall a b. (a -> b) -> a -> b
$ (((Name, Maybe XObject) -> Maybe (Name, XObject))
-> [(Name, Maybe XObject)] -> [(Name, XObject)])
-> [(Name, Maybe XObject)]
-> ((Name, Maybe XObject) -> Maybe (Name, XObject))
-> [(Name, XObject)]
forall a b c. (a -> b -> c) -> b -> a -> c
flip ((Name, Maybe XObject) -> Maybe (Name, XObject))
-> [(Name, Maybe XObject)] -> [(Name, XObject)]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe [(Name, Maybe XObject)]
result (((Name, Maybe XObject) -> Maybe (Name, XObject))
-> [(Name, XObject)])
-> ((Name, Maybe XObject) -> Maybe (Name, XObject))
-> [(Name, XObject)]
forall a b. (a -> b) -> a -> b
$ \(Name
n, Maybe XObject
mo) -> do
XObject
o <- Maybe XObject
mo
(Name, XObject) -> Maybe (Name, XObject)
forall (m :: * -> *) a. Monad m => a -> m a
return (Name
n, XObject
o)
pageExtractText :: Page -> IO Text
Page
page = [Span] -> Text
glyphsToText ([Span] -> Text) -> IO [Span] -> IO Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Page -> IO [Span]
pageExtractGlyphs Page
page
pageExtractGlyphs :: Page -> IO [Span]
Page
page = do
Map Name FontDict
fontDicts <- [(Name, FontDict)] -> Map Name FontDict
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Name, FontDict)] -> Map Name FontDict)
-> IO [(Name, FontDict)] -> IO (Map Name FontDict)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Page -> IO [(Name, FontDict)]
pageFontDicts Page
page
Map Name (ByteString -> [(Glyph, Double)])
glyphDecoders <- Map Name FontDict
-> (FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)]))
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
Traversable.forM Map Name FontDict
fontDicts ((FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)])))
-> (FontDict -> IO (ByteString -> [(Glyph, Double)]))
-> IO (Map Name (ByteString -> [(Glyph, Double)]))
forall a b. (a -> b) -> a -> b
$ \FontDict
fontDict ->
FontInfo -> ByteString -> [(Glyph, Double)]
fontInfoDecodeGlyphs (FontInfo -> ByteString -> [(Glyph, Double)])
-> IO FontInfo -> IO (ByteString -> [(Glyph, Double)])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FontDict -> IO FontInfo
fontDictLoadInfo FontDict
fontDict
let glyphDecoder :: GlyphDecoder
glyphDecoder Name
fontName = \ByteString
str ->
case Name
-> Map Name (ByteString -> [(Glyph, Double)])
-> Maybe (ByteString -> [(Glyph, Double)])
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Name
fontName Map Name (ByteString -> [(Glyph, Double)])
glyphDecoders of
Maybe (ByteString -> [(Glyph, Double)])
Nothing -> []
Just ByteString -> [(Glyph, Double)]
decode -> ByteString -> [(Glyph, Double)]
decode ByteString
str
Map Name XObject
xobjects <- Page -> IO (Map Name XObject)
pageXObjects Page
page
InputStream Expr
is <- do
[Ref]
contents <- Page -> IO [Ref]
pageContents Page
page
let Page Pdf
pdf Ref
_ Dict
_ = Page
page
InputStream ByteString
is <- Pdf -> [Ref] -> IO (InputStream ByteString)
combinedContent Pdf
pdf [Ref]
contents
Parser (Maybe Expr)
-> InputStream ByteString -> IO (InputStream Expr)
forall r.
Parser (Maybe r) -> InputStream ByteString -> IO (InputStream r)
Streams.parserToInputStream Parser (Maybe Expr)
parseContent InputStream ByteString
is
let loop :: Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop Map Name XObject
xobjs InputStream Expr
s Processor
p = do
Maybe Operator
next <- InputStream Expr -> IO (Maybe Operator)
readNextOperator InputStream Expr
s
case Maybe Operator
next of
Just (Op
Op_Do, [Name Name
name]) -> Map Name XObject -> Name -> Processor -> IO Processor
processDo Map Name XObject
xobjs Name
name Processor
p IO Processor -> (Processor -> IO Processor) -> IO Processor
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop Map Name XObject
xobjs InputStream Expr
s
Just Operator
op ->
case Operator -> Processor -> Either String Processor
processOp Operator
op Processor
p of
Left String
err -> Unexpected -> IO Processor
forall e a. Exception e => e -> IO a
throwIO (String -> [String] -> Unexpected
Unexpected String
err [])
Right Processor
p' -> Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop Map Name XObject
xobjs InputStream Expr
s Processor
p'
Maybe Operator
Nothing -> Processor -> IO Processor
forall (m :: * -> *) a. Monad m => a -> m a
return Processor
p
processDo :: Map Name XObject -> Name -> Processor -> IO Processor
processDo Map Name XObject
xobjs Name
name Processor
p = do
case Name -> Map Name XObject -> Maybe XObject
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Name
name Map Name XObject
xobjs of
Maybe XObject
Nothing -> Processor -> IO Processor
forall (m :: * -> *) a. Monad m => a -> m a
return Processor
p
Just XObject
xobj -> do
InputStream Expr
s <- do
InputStream ByteString
s <- ByteString -> IO (InputStream ByteString)
Streams.fromLazyByteString (XObject -> ByteString
xobjectContent XObject
xobj)
Parser (Maybe Expr)
-> InputStream ByteString -> IO (InputStream Expr)
forall r.
Parser (Maybe r) -> InputStream ByteString -> IO (InputStream r)
Streams.parserToInputStream Parser (Maybe Expr)
parseContent InputStream ByteString
s
let gdec' :: GlyphDecoder
gdec' = Processor -> GlyphDecoder
prGlyphDecoder Processor
p
Processor
p' <- Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop (XObject -> Map Name XObject
xobjectChildren XObject
xobj) InputStream Expr
s
(Processor
p {prGlyphDecoder :: GlyphDecoder
prGlyphDecoder = XObject -> GlyphDecoder
xobjectGlyphDecoder XObject
xobj})
Processor -> IO Processor
forall (m :: * -> *) a. Monad m => a -> m a
return (Processor
p' {prGlyphDecoder :: GlyphDecoder
prGlyphDecoder = GlyphDecoder
gdec'})
Processor
p <- Map Name XObject -> InputStream Expr -> Processor -> IO Processor
loop Map Name XObject
xobjects InputStream Expr
is (Processor -> IO Processor) -> Processor -> IO Processor
forall a b. (a -> b) -> a -> b
$ Processor
mkProcessor {
prGlyphDecoder :: GlyphDecoder
prGlyphDecoder = GlyphDecoder
glyphDecoder
}
[Span] -> IO [Span]
forall (m :: * -> *) a. Monad m => a -> m a
return ([Span] -> [Span]
forall a. [a] -> [a]
List.reverse (Processor -> [Span]
prSpans Processor
p))
combinedContent :: Pdf -> [Ref] -> IO (InputStream ByteString)
combinedContent :: Pdf -> [Ref] -> IO (InputStream ByteString)
combinedContent Pdf
pdf [Ref]
refs = do
[(Ref, Stream)]
allStreams <- [Ref] -> (Ref -> IO (Ref, Stream)) -> IO [(Ref, Stream)]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [Ref]
refs ((Ref -> IO (Ref, Stream)) -> IO [(Ref, Stream)])
-> (Ref -> IO (Ref, Stream)) -> IO [(Ref, Stream)]
forall a b. (a -> b) -> a -> b
$ \Ref
ref -> do
Object
o <- Pdf -> Ref -> IO Object
lookupObject Pdf
pdf Ref
ref
case Object
o of
Stream Stream
s -> (Ref, Stream) -> IO (Ref, Stream)
forall (m :: * -> *) a. Monad m => a -> m a
return (Ref
ref, Stream
s)
Object
_ -> Corrupted -> IO (Ref, Stream)
forall e a. Exception e => e -> IO a
throwIO (String -> [String] -> Corrupted
Corrupted String
"Page content is not a stream" [])
Generator ByteString () -> IO (InputStream ByteString)
forall r a. Generator r a -> IO (InputStream r)
Streams.fromGenerator (Generator ByteString () -> IO (InputStream ByteString))
-> Generator ByteString () -> IO (InputStream ByteString)
forall a b. (a -> b) -> a -> b
$ [(Ref, Stream)]
-> ((Ref, Stream) -> Generator ByteString ())
-> Generator ByteString ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [(Ref, Stream)]
allStreams (((Ref, Stream) -> Generator ByteString ())
-> Generator ByteString ())
-> ((Ref, Stream) -> Generator ByteString ())
-> Generator ByteString ()
forall a b. (a -> b) -> a -> b
$ \(Ref
ref, Stream
stream) -> do
InputStream ByteString
is <- IO (InputStream ByteString)
-> Generator ByteString (InputStream ByteString)
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO (InputStream ByteString)
-> Generator ByteString (InputStream ByteString))
-> IO (InputStream ByteString)
-> Generator ByteString (InputStream ByteString)
forall a b. (a -> b) -> a -> b
$ Pdf -> Ref -> Stream -> IO (InputStream ByteString)
streamContent Pdf
pdf Ref
ref Stream
stream
InputStream ByteString -> Generator ByteString ()
forall r. InputStream r -> Generator r ()
yield InputStream ByteString
is
where
yield :: InputStream r -> Generator r ()
yield InputStream r
is = do
Maybe r
chunk <- IO (Maybe r) -> Generator r (Maybe r)
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO (Maybe r) -> Generator r (Maybe r))
-> IO (Maybe r) -> Generator r (Maybe r)
forall a b. (a -> b) -> a -> b
$ InputStream r -> IO (Maybe r)
forall a. InputStream a -> IO (Maybe a)
Streams.read InputStream r
is
case Maybe r
chunk of
Maybe r
Nothing -> () -> Generator r ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
Just r
c -> do
r -> Generator r ()
forall r. r -> Generator r ()
Streams.yield r
c
InputStream r -> Generator r ()
yield InputStream r
is
glyphsToText :: [Span] -> Text
glyphsToText :: [Span] -> Text
glyphsToText
= Text -> Text
Lazy.Text.toStrict
(Text -> Text) -> ([Span] -> Text) -> [Span] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> Text
Text.Builder.toLazyText
(Builder -> Text) -> ([Span] -> Builder) -> [Span] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Vector Double, Bool), Builder) -> Builder
forall a b. (a, b) -> b
snd
(((Vector Double, Bool), Builder) -> Builder)
-> ([Span] -> ((Vector Double, Bool), Builder))
-> [Span]
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Vector Double, Bool), Builder)
-> [Glyph] -> ((Vector Double, Bool), Builder))
-> ((Vector Double, Bool), Builder)
-> [[Glyph]]
-> ((Vector Double, Bool), Builder)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl ((Vector Double, Bool), Builder)
-> [Glyph] -> ((Vector Double, Bool), Builder)
step ((Double -> Double -> Vector Double
forall a. a -> a -> Vector a
Vector Double
0 Double
0, Bool
False), Builder
forall a. Monoid a => a
mempty)
([[Glyph]] -> ((Vector Double, Bool), Builder))
-> ([Span] -> [[Glyph]])
-> [Span]
-> ((Vector Double, Bool), Builder)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Span -> [Glyph]) -> [Span] -> [[Glyph]]
forall a b. (a -> b) -> [a] -> [b]
List.map Span -> [Glyph]
spGlyphs
where
step :: ((Vector Double, Bool), Builder)
-> [Glyph] -> ((Vector Double, Bool), Builder)
step ((Vector Double, Bool), Builder)
acc [] = ((Vector Double, Bool), Builder)
acc
step ((Vector Double
lx2 Double
ly2, Bool
wasSpace), Builder
res) [Glyph]
sp =
let Vector Double
x1 Double
y1 = Glyph -> Vector Double
glyphTopLeft ([Glyph] -> Glyph
forall a. [a] -> a
head [Glyph]
sp)
Vector Double
x2 Double
_ = Glyph -> Vector Double
glyphBottomRight ([Glyph] -> Glyph
forall a. [a] -> a
last [Glyph]
sp)
Vector Double
_ Double
y2 = Glyph -> Vector Double
glyphTopLeft ([Glyph] -> Glyph
forall a. [a] -> a
last [Glyph]
sp)
space :: Builder
space =
if Double -> Double
forall a. Num a => a -> a
abs (Double
ly2 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
y1) Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
1.8
then if Bool
wasSpace Bool -> Bool -> Bool
|| Double -> Double
forall a. Num a => a -> a
abs (Double
lx2 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
x1) Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
1.8
then Builder
forall a. Monoid a => a
mempty
else Char -> Builder
Text.Builder.singleton Char
' '
else Char -> Builder
Text.Builder.singleton Char
'\n'
txt :: Builder
txt = Text -> Builder
Text.Builder.fromLazyText (Text -> Builder) -> Text -> Builder
forall a b. (a -> b) -> a -> b
$ [Text] -> Text
Lazy.Text.fromChunks ([Text] -> Text) -> [Text] -> Text
forall a b. (a -> b) -> a -> b
$ (Glyph -> Maybe Text) -> [Glyph] -> [Text]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe Glyph -> Maybe Text
glyphText [Glyph]
sp
endWithSpace :: Bool
endWithSpace = Glyph -> Maybe Text
glyphText ([Glyph] -> Glyph
forall a. [a] -> a
last [Glyph]
sp) Maybe Text -> Maybe Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> Maybe Text
forall a. a -> Maybe a
Just Text
" "
in ((Double -> Double -> Vector Double
forall a. a -> a -> Vector a
Vector Double
x2 Double
y2, Bool
endWithSpace), [Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat [Builder
res, Builder
space, Builder
txt])