module Codec.Epub.IO
( opfContents )
where
import Codec.Archive.Zip
import Control.Arrow.ListArrows ( (>>>), deep )
import Control.Monad.Error
import qualified Data.ByteString.Lazy.Char8 as B
import Text.Regex
import Text.XML.HXT.Arrow.XmlArrow ( getAttrValue, hasName, isElem )
import Text.XML.HXT.Arrow.XmlState ( no, runX, withValidate )
import Text.XML.HXT.Arrow.ReadDocument ( readString )
-- | Strip a UTF-8 @encoding@ pseudo-attribute (as written in an XML
-- declaration) from a document.
--
-- Every occurrence of the pattern is replaced with the empty string. The
-- match is case-insensitive, accepts either quote character and tolerates
-- spaces around @=@, so @ encoding="UTF-8"@, @ encoding='utf-8'@ and
-- @ encoding = "Utf-8"@ are all removed. Text naming any other encoding is
-- left untouched.
removeEncoding :: String -> String
removeEncoding doc = subRegex encodingRegex doc ""
  where
    -- mkRegexWithOpts flags: the first 'False' disables line-oriented
    -- matching, the second 'False' makes the match case-insensitive.
    encodingRegex =
      mkRegexWithOpts " +encoding *= *[\"']UTF-8[\"']" False False
-- | Delete every @<!DOCTYPE ...>@ declaration from a document.
--
-- A declaration is taken to run from the literal @<!DOCTYPE @ up to and
-- including the first @>@ that follows it. Matching is case-sensitive.
removeDoctype :: String -> String
removeDoctype doc = subRegex doctypeRegex doc ""
  where
    doctypeRegex = mkRegexWithOpts "<!DOCTYPE [^>]*>" False True
-- | Look up a path inside a zip archive and return that entry's contents
-- as a 'String'.
--
-- The entry's bytes are converted with the Char8 @unpack@, i.e. one 'Char'
-- per byte; no character-set decoding is performed here.
--
-- Fails through 'throwError' with a message naming the path when the
-- archive holds no entry for it.
fileFromArchive
  :: MonadError String m
  => FilePath -> Archive -> m String
fileFromArchive filePath archive =
  case findEntryByPath filePath archive of
    Nothing    -> throwError ("Unable to locate file " ++ filePath)
    Just entry -> return (B.unpack (fromEntry entry))
-- | Read the package (OPF) document out of the EPUB zip file at the given
-- path and return it as a 'String'.
--
-- The steps are:
--
-- 1. Read the zip file and parse it into an archive.
-- 2. Extract @META-INF/container.xml@ from the archive.
-- 3. Collect the @full-path@ attribute of each @rootfile@ element in that
--    document and pick the first non-empty one.
-- 4. Extract the file at that path and return it with any DOCTYPE
--    declaration and UTF-8 encoding declaration stripped.
--
-- Fails through 'throwError' when @META-INF/container.xml@ is absent, when
-- no @rootfile@ element carries a non-empty @full-path@, or when the
-- referenced file is not in the archive.
opfContents
  :: (MonadError String m, MonadIO m)
  => FilePath
  -> m String
opfContents zipPath = do
  zipFileBytes <- liftIO $ B.readFile zipPath
  let archive = toArchive zipFileBytes
      containerPath = "META-INF/container.xml"
  containerDoc <- fileFromArchive containerPath archive
  rootPaths <- liftIO $ runX
    (   readString [withValidate no] containerDoc
    >>> deep (isElem >>> hasName "rootfile")
    >>> getAttrValue "full-path"
    )
  -- A container may list several rootfile elements; the first one is the
  -- default rendition, so use it rather than rejecting the whole container.
  -- HXT's getAttrValue yields "" when the attribute is absent, so such
  -- entries are dropped before choosing.
  rootPath <- case filter (not . null) rootPaths of
    (p : _) -> return p
    []      -> throwError $
      "ERROR: rootfile full-path missing from " ++ containerPath
  rootDoc <- fileFromArchive rootPath archive
  return . removeEncoding . removeDoctype $ rootDoc