{-|
Module : Text.FromHTML
Description : Simple library for transformation of HTML to other formats
Copyright : (c) Marek Suchánek, 2018
License : MIT
Maintainer : marek.suchanek@fit.cvut.cz
Stability : experimental
Portability : POSIX
Simplified API for transformation of HTML to other formats with Pandoc
and wkhtmltopdf in Haskell code. It requires @wkhtmltopdf@ and @pandoc@
to be installed locally.
-}
module Text.FromHTML
( fromHTML
, ExportType(..)
) where
import qualified Data.Char as C
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import qualified Data.ByteString as B
import Control.Exception
import Data.Semigroup
import GHC.IO.Handle
import GHC.IO.Encoding
import System.Exit
import System.Process
import System.IO.Unsafe
-- | Allowed export types
data ExportType = HTML
| LaTeX
| RTF
| RST
| Markdown
| AsciiDoc
| Docx
| ODT
| DokuWiki
| MediaWiki
| EPUB2
| EPUB3
| PDF
deriving (Show, Read, Enum, Bounded, Eq)
type Input = String
type Output = B.ByteString
type Command = Input -> IO (Either Output Output)
type Process = IO (Maybe Handle, Maybe Handle, Maybe Handle, ProcessHandle)
str2BS :: String -> B.ByteString
str2BS = E.encodeUtf8 . T.pack
-- | Transform given HTML as String to selected format
fromHTML :: ExportType -> String -> Either Output Output
fromHTML HTML html = Right . str2BS $ html -- HTML is already provided!
fromHTML PDF html = makePDF html
fromHTML extp html = makePD extp html
makePDF :: Input -> Either Output Output
makePDF html = unsafePerformIO $ wkhtmltopdf html
makePD :: ExportType -> Input -> Either Output Output
makePD expt html = unsafePerformIO $ pandoc expt html
-- | Simple conversion of HTML to PDF using process wkhtmltopdf
wkhtmltopdf :: Command
wkhtmltopdf = perform cprocess
where
opts = ["--quiet", "--encoding", "utf-8", "-", "-"]
cprocess = procWith $ proc "wkhtmltopdf" opts
-- | Simple conversion of HTML to some format using process pandoc
pandoc :: ExportType -> Command
pandoc expt = perform cprocess
where
format = exportType2PD expt
opts = ["-s", "-f", "html", "-t", format, "-o", "-"]
cprocess = procWith $ proc "pandoc" opts
-- | Perform process (catched IOException)
perform :: CreateProcess -> Command
perform cprocess input = catch (performUnsafe cprocess input)
(\e -> do let err = show (e :: IOException)
return . Left $ "IOException: " <> str2BS err)
-- | Perform process (no caching exceptions)
performUnsafe :: CreateProcess -> Command
performUnsafe cprocess input = do
setLocaleEncoding utf8 -- don't know what was locales are there...
(Just stdin, Just stdout, Just stderr, p) <- createProcess cprocess
hPutStr stdin input >> hClose stdin
exitCode <- waitForProcess p
errors <- B.hGetContents stderr
output <- B.hGetContents stdout
case exitCode of
ExitSuccess -> return $ Right output
_ -> return . Left $ "Exit(" <> str2BS (show exitCode) <> "): " <> errors
procWith p = p { std_out = CreatePipe
, std_in = CreatePipe
, std_err = CreatePipe
}
exportType2PD :: ExportType -> String
exportType2PD = map C.toLower . show