-- Copyright: 2010, 2011 Dino Morelli
-- License: BSD3 (see LICENSE)
-- Author: Dino Morelli

{-# LANGUAGE FlexibleContexts #-}

{- |
   Functions for doing some disk IO with ePub documents

   Note that these functions do their work by using the external
   unzip utility.
-}
module Codec.Epub.IO
   ( extractFileFromZip
   , opfPath
   )
   where

import Control.Arrow.ListArrows ( (>>>), deep )
import Control.Monad.Error
import System.Exit
import System.Process
import Text.Printf
import Text.Regex
import Text.XML.HXT.Arrow.XmlArrow ( getAttrValue, hasName, isElem )
import Text.XML.HXT.Arrow.XmlState ( no, runX, withValidate )
import Text.XML.HXT.Arrow.ReadDocument ( readString )


{- | An evil hack to remove the encoding from the document.

   Deletes every occurrence of one or more spaces followed by
   @encoding=\"UTF-8\"@. The match is case-sensitive, so other
   spellings (e.g. @utf-8@) are left untouched.
-}
removeEncoding :: String -> String
removeEncoding = flip (subRegex encodingRe) ""
   where
      encodingRe = mkRegexWithOpts " +encoding=\"UTF-8\"" False True


{- | An evil hack to remove any \<!DOCTYPE ...\> declaration from
   the document.

   Everything from @\<!DOCTYPE @ up to and including the next @>@
   is deleted. A declaration with an internal subset that itself
   contains @>@ is therefore only removed up to that first @>@.
-}
removeDoctype :: String -> String
removeDoctype = flip (subRegex doctypeRe) ""
   where
      -- The pattern must be anchored on the literal "<!DOCTYPE "
      -- prefix; an unanchored "]*>" would match (and delete) every
      -- '>' in the document and wreck the XML.
      doctypeRe = mkRegexWithOpts "<!DOCTYPE [^>]*>" False True


{- | Turn the exit code of the dearchiver into success or failure.

   GNU unzip has annoying non-zero exit codes that aren't fatal, so
   we need to check for those specially: only codes greater than 2
   are reported through 'throwError'; 'ExitSuccess' and the codes
   1 and 2 are accepted silently.

   The first argument is the beginning of the error message. It is
   expected to open a @[@ bracket, which is closed here after the
   status code has been appended.
-}
handleEC :: (MonadIO m, MonadError String m) => String -> ExitCode -> m ()
handleEC _   ExitSuccess = return ()
handleEC msg (ExitFailure c)
   | c > 2     = throwError $ printf "%s status: %s]\n" msg (show c)
   | otherwise = return ()


{- | Extract a file from a zipfile. This is here because ePub files
   are really just zip files.

   Yep, you saw right sports fans. This code is using the
   command-line unzip utility (@unzip -p@, which writes the member
   to stdout). In the future I'd like to make it use a library.

   The extracted text has any DOCTYPE declaration and any
   @encoding=\"UTF-8\"@ attribute stripped before it is returned.
   Whatever unzip writes to stderr is discarded.

   Fails with 'throwError' when unzip exits with a status greater
   than 2. NOTE(review): a failure to start the unzip process at all
   is not caught here and would surface as an IO exception - confirm
   whether callers need that mapped to 'throwError' as well.
-}
extractFileFromZip :: (MonadIO m, MonadError String m)
   => FilePath    -- ^ path to zip file
   -> FilePath    -- ^ path within zip file to extract
   -> m String    -- ^ contents of expected file
extractFileFromZip zipPath filePath = do
   let dearchiver = "unzip"
   (ec, output, _) <- liftIO $ readProcessWithExitCode dearchiver
      ["-p", zipPath, filePath] ""
   handleEC
      (printf "[ERROR %s zip file: %s path in zip: %s"
         dearchiver zipPath filePath)
      ec
   return . removeEncoding . removeDoctype $ output


{- | Get the path within an ePub file to the OPF Package Document.

   Reads @META-INF/container.xml@ from the archive and returns the
   @full-path@ attribute of its @rootfile@ element. Exactly one such
   value must be found; zero or several results are reported through
   'throwError'.
-}
opfPath :: (MonadError String m, MonadIO m)
   => FilePath    -- ^ path to ePub zip file
   -> m String    -- ^ path within ePub to the OPF Package Document
opfPath zipPath = do
   containerContents <- extractFileFromZip zipPath
      "META-INF/container.xml"

   result <- liftIO $ runX (
      readString [withValidate no] containerContents
      >>> deep (isElem >>> hasName "rootfile")
      >>> getAttrValue "full-path"
      )

   case result of
      [p] -> return p
      _   -> throwError
         "ERROR: rootfile full-path missing from META-INF/container.xml"