module Text.FromHTML
( fromHTML
, ExportType(..)
) where
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Text.Pandoc as Pandoc
import qualified Text.Pandoc.Templates as PandocTemplates
import qualified Text.Pandoc.Writers as PandocWriters
import qualified Text.Pandoc.Error as PandocError
import qualified Text.Pandoc.PDF as PandocPDF
import GHC.IO.Handle
import System.Process
import System.IO.Unsafe
-- | Output formats that 'fromHTML' can produce.
--
-- The constructor order is significant: the derived 'Enum' and 'Bounded'
-- instances follow it.
data ExportType
  = HTML      -- ^ HTML
  | LaTeX     -- ^ LaTeX
  | RTF       -- ^ Rich Text Format
  | RST       -- ^ reStructuredText
  | Markdown  -- ^ Markdown
  | AsciiDoc  -- ^ AsciiDoc
  | Docx      -- ^ Word (docx)
  | ODT       -- ^ OpenDocument Text
  | DokuWiki  -- ^ DokuWiki markup
  | MediaWiki -- ^ MediaWiki markup
  | EPUB2     -- ^ EPUB version 2
  | EPUB3     -- ^ EPUB version 3
  | PDF       -- ^ PDF
  deriving (Show, Read, Enum, Bounded, Eq)
-- | A pandoc writer whose output is a strict 'B.ByteString', run in
-- 'Pandoc.PandocPure'.
type Writer =
  Pandoc.WriterOptions -> Pandoc.Pandoc -> Pandoc.PandocPure B.ByteString
-- | Discard the error side of an 'Either'.
--
-- 'Right' values become 'Just'; any 'Left' becomes 'Nothing'. The error
-- value is never inspected, so no constraint is placed on its type.
eitherToMaybe :: Either a b -> Maybe b
eitherToMaybe = either (const Nothing) Just
-- | Options used when parsing the input HTML: pandoc's defaults with
-- 'Pandoc.readerStandalone' switched on.
readerOptions :: Pandoc.ReaderOptions
readerOptions = Pandoc.def { Pandoc.readerStandalone = True }
-- | Base options handed to every pandoc writer: pandoc's defaults.
--
-- The explicit signature pins the type here instead of leaving it to be
-- inferred from a use site elsewhere in the module.
writerOptions :: Pandoc.WriterOptions
writerOptions = Pandoc.def
-- | Convert an HTML document into the requested export format.
--
-- * 'HTML' returns the input unchanged, UTF-8 encoded.
-- * 'PDF' is delegated to 'writerHTML2PDF'.
-- * Every other format is parsed with 'html2pd' and rendered with
--   'runWriter'; a failure in either step yields 'Nothing'.
fromHTML :: ExportType -> String -> Maybe B.ByteString
fromHTML extp html =
  case extp of
    HTML -> Just (E.encodeUtf8 (T.pack html))
    PDF -> writerHTML2PDF html
    _ -> html2pd html >>= eitherToMaybe . Pandoc.runPure . runWriter extp
-- | Render a pandoc document in the given format, using the template that
-- 'getTemplate' returns for that format.
runWriter :: ExportType -> Pandoc.Pandoc -> Pandoc.PandocPure B.ByteString
runWriter extp pd = getTemplate extp >>= render
  where
    -- Run the format's writer with the base options plus the template.
    render tpl =
      writer extp (writerOptions { Pandoc.writerTemplate = tpl }) pd
-- | Fetch pandoc's default template for an export type.
--
-- Yields 'Nothing' for 'PDF', which has no template name assigned here.
-- Every other format yields 'Just' the template returned by
-- 'PandocTemplates.getDefaultTemplate' for its pandoc format name.
getTemplate :: ExportType -> Pandoc.PandocPure (Maybe String)
getTemplate = traverse PandocTemplates.getDefaultTemplate . templateName
  where
    -- Pandoc format name whose default template is requested. The match is
    -- exhaustive so that a new constructor triggers a compiler warning
    -- instead of silently getting no template.
    templateName :: ExportType -> Maybe String
    templateName HTML      = Just "html5"
    templateName LaTeX     = Just "latex"
    templateName RTF       = Just "rtf"
    templateName RST       = Just "rst"
    templateName Markdown  = Just "markdown"
    -- Previously the empty string, which names no pandoc format.
    templateName AsciiDoc  = Just "asciidoc"
    templateName Docx      = Just "docx"
    templateName ODT       = Just "odt"
    templateName DokuWiki  = Just "dokuwiki"
    templateName MediaWiki = Just "mediawiki"
    templateName EPUB2     = Just "epub2"
    templateName EPUB3     = Just "epub3"
    templateName PDF       = Nothing
-- | Parse an HTML string into a pandoc document using 'readerOptions'.
--
-- Returns 'Nothing' when the pandoc reader reports an error.
html2pd :: String -> Maybe Pandoc.Pandoc
html2pd =
  eitherToMaybe . Pandoc.runPure . Pandoc.readHtml readerOptions . T.pack
-- | Render an HTML document to PDF through 'html2pdf', exposed as a pure
-- value so it fits the signature of 'fromHTML'.
--
-- The result is always wrapped in 'Just'; exceptions raised by 'html2pdf'
-- are not turned into 'Nothing'.
--
-- NOTE(review): this runs an external process behind 'unsafePerformIO'.
-- The pragma below follows the guidance for 'unsafePerformIO' so that the
-- call is not inlined and the action duplicated. Consider moving PDF
-- export into 'IO' in a future, interface-changing revision.
writerHTML2PDF :: String -> Maybe B.ByteString
writerHTML2PDF = Just . unsafePerformIO . html2pdf
{-# NOINLINE writerHTML2PDF #-}
-- | Convert an HTML document to PDF by piping it through the external
-- @wkhtmltopdf@ program.
--
-- The document is written to the child's stdin and the PDF is read from
-- its stdout. The child's exit code is not inspected, so a failing run
-- returns whatever bytes the tool managed to emit.
--
-- Throws an 'IOError' if the process cannot be started or if its pipes are
-- unexpectedly missing.
html2pdf :: String -> IO B.ByteString
html2pdf html = do
  spawned <- createProcess wkhtmltopdf
  case spawned of
    (Just hin, Just hout, _, ph) -> do
      -- Send explicit UTF-8 bytes: the tool is told "--encoding utf-8",
      -- whereas a String write would use the handle's own text encoding.
      B.hPut hin (E.encodeUtf8 (T.pack html))
      -- Closing stdin signals end of input to the child.
      hClose hin
      pdf <- B.hGetContents hout
      -- Reap the child so it does not linger after its output is consumed.
      _ <- waitForProcess ph
      return pdf
    _ -> ioError (userError "html2pdf: wkhtmltopdf pipes were not created")
  where
    -- The two "-" arguments are the input and output file positions;
    -- presumably they select stdin/stdout per wkhtmltopdf's CLI.
    args = ["--quiet", "--encoding", "utf-8", "-", "-"]
    wkhtmltopdf =
      (proc "wkhtmltopdf" args)
        { std_in = CreatePipe
        , std_out = CreatePipe
        }
-- | Pick the pandoc writer for an export type and adapt it to return a
-- strict 'B.ByteString'.
--
-- Output of text writers is UTF-8 encoded; output of binary writers is
-- converted from lazy to strict. 'PDF' reuses the 'HTML' writer.
writer :: ExportType -> Writer
writer extp opts pd =
  case pandocWriter extp of
    Pandoc.TextWriter renderText -> E.encodeUtf8 <$> renderText opts pd
    Pandoc.ByteStringWriter renderBytes -> BL.toStrict <$> renderBytes opts pd
  where
    -- Underlying pandoc writer for each export type.
    pandocWriter :: ExportType -> Pandoc.Writer Pandoc.PandocPure
    pandocWriter format =
      case format of
        HTML      -> Pandoc.TextWriter PandocWriters.writeHtml5String
        LaTeX     -> Pandoc.TextWriter PandocWriters.writeLaTeX
        RTF       -> Pandoc.TextWriter PandocWriters.writeRTF
        RST       -> Pandoc.TextWriter PandocWriters.writeRST
        Markdown  -> Pandoc.TextWriter PandocWriters.writeMarkdown
        AsciiDoc  -> Pandoc.TextWriter PandocWriters.writeAsciiDoc
        DokuWiki  -> Pandoc.TextWriter PandocWriters.writeDokuWiki
        MediaWiki -> Pandoc.TextWriter PandocWriters.writeMediaWiki
        Docx      -> Pandoc.ByteStringWriter PandocWriters.writeDocx
        ODT       -> Pandoc.ByteStringWriter PandocWriters.writeODT
        EPUB2     -> Pandoc.ByteStringWriter PandocWriters.writeEPUB2
        EPUB3     -> Pandoc.ByteStringWriter PandocWriters.writeEPUB3
        PDF       -> pandocWriter HTML