-- Copyright: 2010, 2011 Dino Morelli -- License: BSD3 (see LICENSE) -- Author: Dino Morelli {-# LANGUAGE FlexibleContexts #-} {- | Functions for doing some disk IO with ePub documents Note that these functions do their work by using the external unzip utility. -} module Codec.Epub.IO ( extractFileFromZip, opfPath ) where import Codec.Archive.LibZip import Control.Arrow.ListArrows ( (>>>), deep ) import Control.Monad.Error import Text.Regex import Text.XML.HXT.Arrow.XmlArrow ( getAttrValue, hasName, isElem ) import Text.XML.HXT.Arrow.XmlState ( no, runX, withValidate ) import Text.XML.HXT.Arrow.ReadDocument ( readString ) -- | An evil hack to remove encoding from the document removeEncoding :: String -> String removeEncoding = flip (subRegex (mkRegexWithOpts " +encoding=\"UTF-8\"" False True)) "" -- | An evil hack to remove any from the document removeDoctype :: String -> String removeDoctype = flip (subRegex (mkRegexWithOpts "]*>" False True)) "" {- | Extract a file from a zipfile. -} extractFileFromZip :: (MonadIO m, MonadError String m) => FilePath -- ^ path to zip file -> FilePath -- ^ path within zip file to extract -> m String -- ^ contents of expected file extractFileFromZip zipPath filePath = do result <- liftIO $ catchZipError (fmap Right $ withArchive [] zipPath $ fileContents [] filePath) (return . Left) output <- either (throwError . show) return result return . removeEncoding . removeDoctype $ output -- | Get the path within an ePub file to the OPF Package Document opfPath :: (MonadError String m, MonadIO m) => FilePath -- ^ path to ePub zip file -> m String -- ^ path within ePub to the OPF Package Document opfPath zipPath = do containerContents <- extractFileFromZip zipPath "META-INF/container.xml" result <- liftIO $ runX ( readString [withValidate no] containerContents >>> deep (isElem >>> hasName "rootfile") >>> getAttrValue "full-path" ) case result of (p : []) -> return p _ -> throwError "ERROR: rootfile full-path missing from META-INF/container.xml"