module Codec.Archive.Zim.Parser
(
getHeader
, getMimeList
, getDE
, getMainPageUrl
, getCluster
, getBlob
, getContent
, searchDE
, MimeList
, mkNsTitle
, mkNsTitlePrefix
, mkNsUrl
, RunZim
, ZimGetDE
, ZimSearchDE
, ZimGetContent
, ZimException(..)
, ZimHeader(..)
, ZimDirEntType(..)
, ZimDirEnt(..)
, UrlIndex(..)
, TitleIndex(..)
, ClusterNumber(..)
, BlobNumber(..)
, Cluster(..)
, Blob(..)
, Url(..)
, Title
, TitlePrefix
) where
import Control.Applicative ((<$>), (<*>))
import Control.Exception (Exception, throw)
import Control.Monad (when)
import Data.Char (chr)
import Data.Maybe (fromJust)
import Data.Typeable (Typeable)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy as BL
import System.IO (Handle, IOMode(ReadMode), withBinaryFile)
import Data.Conduit (($$), (=$), await, Sink)
import Control.Monad.Trans.Resource (runResourceT)
import Data.Conduit.Binary (sourceHandleRange, sourceLbs, sinkLbs)
import Data.Conduit.Serialization.Binary (sinkGet, conduitGet)
import Data.Conduit.Lzma (decompress)
import Data.Array.IArray ((!), listArray, Array)
import Data.Binary.Get (Get, skip, getWord8, getWord16le, getWord32le, getWord64le, getByteString, getLazyByteStringNul, getRemainingLazyByteString)
import Numeric (showHex)
data ZimException = ZimInvalidMagic
| ZimParseError String
| ZimIncompleteInput
| ZimInvalidIndex Int
deriving (Show, Typeable)
instance Exception ZimException
data ZimHeader = ZimHeader
{
zimMagicNumber :: Int
, zimVersion :: Int
, zimUuid :: B.ByteString
, zimArticleCount :: Int
, zimClusterCount :: Int
, zimUrlPtrPos :: Integer
, zimTitlePtrPos :: Integer
, zimClusterPtrPos :: Integer
, zimMimeListPos :: Integer
, zimMainPage :: Maybe Int
, zimLayoutPage :: Maybe Int
, zimChecksumPos :: Integer
} deriving (Show, Eq)
data ZimDirEntType = ZimArticleEntry
| ZimRedirectEntry
| ZimLinkTarget
| ZimDeletedEntry
deriving (Eq, Show)
data ZimDirEnt = ZimDirEnt
{
zimDeType :: ZimDirEntType
, zimDeMimeType :: Int
, zimDeParameterLen :: Int
, zimDeNamespace :: Char
, zimDeRevision :: Int
, zimDeRedirectIndex :: Maybe Int
, zimDeClusterNumber :: Maybe Int
, zimDeBlobNumber :: Maybe Int
, zimDeUrl :: B8.ByteString
, zimDeTitle :: B8.ByteString
} deriving (Eq, Show)
type MimeList = Array Int B8.ByteString
newtype UrlIndex = UrlIndex Int deriving (Eq, Ord, Show)
newtype TitleIndex = TitleIndex Int deriving (Eq, Ord, Show)
newtype ClusterNumber = ClusterNumber Int deriving (Eq, Ord, Show)
newtype BlobNumber = BlobNumber Int deriving (Eq, Ord, Show)
newtype Url = Url B.ByteString deriving (Eq, Ord, Show)
mkNsUrl :: Char -> B.ByteString -> Url
mkNsUrl c s = Url $ c `B8.cons` '/' `B8.cons` s
newtype Title = Title B.ByteString deriving (Eq, Ord, Show)
mkNsTitle :: Char -> B.ByteString -> Title
mkNsTitle c s = Title $ c `B8.cons` '/' `B8.cons` s
newtype TitlePrefix = TitlePrefix B.ByteString deriving (Eq, Ord, Show)
mkNsTitlePrefix :: Char -> B.ByteString -> TitlePrefix
mkNsTitlePrefix c s = TitlePrefix $ c `B8.cons` '/' `B8.cons` s
newtype Cluster = Cluster {unCluster :: BL.ByteString}
newtype Blob = Blob {unBlob :: BL.ByteString}
parseZimHeader :: Get ZimHeader
parseZimHeader = do
magicNumber <- fromIntegral <$> getWord32le
when (magicNumber /= 72173914) $ throw ZimInvalidMagic
version <- fromIntegral <$> getWord32le
uuid <- getByteString 16
articleCount <- fromIntegral <$> getWord32le
clusterCount <- fromIntegral <$> getWord32le
urlPtrPos <- fromIntegral <$> getWord64le
titlePtrPos <- fromIntegral <$> getWord64le
clusterPtrPos <- fromIntegral <$> getWord64le
mimeListPos <- fromIntegral <$> getWord64le
mainPage <- fromIntegral <$> getWord32le
layoutPage <- fromIntegral <$> getWord32le
checksumPos <- fromIntegral <$> getWord64le
return $ ZimHeader magicNumber version uuid articleCount clusterCount
urlPtrPos titlePtrPos clusterPtrPos mimeListPos
(if mainPage == 0xffffffff then Nothing else Just mainPage)
(if layoutPage == 0xffffffff then Nothing else Just layoutPage)
checksumPos
class RunZim h where
runZim :: h -> (Handle -> ZimHeader -> IO a) -> IO a
instance RunZim Handle where
runZim hdl f = do
hdr <- src $$ sinkGet parseZimHeader
f hdl hdr
where (pos, len) = (Just 0, Just 80)
src = sourceHandleRange hdl pos len
instance RunZim (Handle, ZimHeader) where
runZim x f = uncurry f x
instance RunZim FilePath where
runZim fp f = withBinaryFile fp ReadMode $ \hdl -> runZim hdl f
getHeader :: RunZim h => h -> IO ZimHeader
getHeader h = runZim h $ \_ hdr -> return hdr
parseByteStringsNul :: Sink B8.ByteString IO [B8.ByteString]
parseByteStringsNul = conduitGet getLazyByteStringNul =$ loop id
where loop :: ([B8.ByteString] -> [B8.ByteString]) -> Sink BL.ByteString IO [B8.ByteString]
loop front = await >>= maybe
(return $ front [])
(\x -> let bs = BL.toStrict x
in if B8.null bs then return (front []) else loop (front . (bs:))
)
getMimeList :: RunZim h => h -> IO MimeList
getMimeList h = runZim h $ \hdl hdr -> do
let (pos, len) = (Just $ zimMimeListPos hdr, Nothing)
src = sourceHandleRange hdl pos len
mimeList <- src $$ parseByteStringsNul
return $ listArray (0, length mimeList) mimeList
parseZimDirEnt :: Get ZimDirEnt
parseZimDirEnt = do
mimeType <- fromIntegral <$> getWord16le :: Get Int
parmLen <- fromIntegral <$> getWord8
namespace <- chr . fromIntegral <$> getWord8
revision <- fromIntegral <$> getWord32le
let deType = case mimeType of
0xffff -> ZimRedirectEntry
0xfffe -> ZimLinkTarget
0xfffd -> ZimDeletedEntry
_ -> ZimArticleEntry
(redirectIndex, clusterNumber, blobNumber ) <-
case deType of
ZimArticleEntry ->
(\x y -> (Nothing, Just $ fromIntegral x, Just $ fromIntegral y))
<$> getWord32le <*> getWord32le
ZimRedirectEntry ->
(\x -> (Just $ fromIntegral x, Nothing, Nothing))
<$> getWord32le
ZimLinkTarget -> skip 8 >> return (Nothing, Nothing, Nothing)
ZimDeletedEntry -> skip 8 >> return (Nothing, Nothing, Nothing)
:: Get (Maybe Int, Maybe Int, Maybe Int)
url <- BL.toStrict <$> getLazyByteStringNul
title <- BL.toStrict <$> getLazyByteStringNul
return $ ZimDirEnt deType mimeType parmLen namespace revision redirectIndex
clusterNumber blobNumber url
(if B.null title then url else title)
class ZimGetDE k where
getDE :: RunZim h => h -> k -> IO ZimDirEnt
instance ZimGetDE UrlIndex where
getDE h (UrlIndex i) = runZim h $ \hdl hdr -> do
let urlPtrPos = Just $ zimUrlPtrPos hdr + 8 * fromIntegral i
when (i < 0 || i >= zimArticleCount hdr) . throw $ ZimInvalidIndex i
dePos <- sourceHandleRange hdl urlPtrPos Nothing $$ sinkGet getWord64le
let srcDirEnt = sourceHandleRange hdl (Just $ fromIntegral dePos) Nothing
srcDirEnt $$ sinkGet parseZimDirEnt
instance ZimGetDE TitleIndex where
getDE h (TitleIndex i) = runZim h $ \hdl hdr -> do
let titlePtrPos = Just $ zimTitlePtrPos hdr + 4 * fromIntegral i
srcTitle = sourceHandleRange hdl titlePtrPos Nothing
when (i < 0 || i >= zimArticleCount hdr) . throw $ ZimInvalidIndex i
urlIndex <- srcTitle $$ sinkGet getWord32le
(hdl, hdr) `getDE` (UrlIndex $ fromIntegral urlIndex)
getCluster :: RunZim h => h -> ClusterNumber -> IO Cluster
getCluster h (ClusterNumber i) = runZim h $ \hdl hdr -> do
let limit = zimClusterCount hdr 1
when (i < 0 || i > limit) . throw $ ZimInvalidIndex i
let clusterPos = Just $ zimClusterPtrPos hdr + 8 * fromIntegral i
src = sourceHandleRange hdl clusterPos Nothing
(pos0, pos1) <- src $$ sinkGet $ (,) <$> getWord64le <*> getWord64le
let len = if i == limit
then fromIntegral (zimChecksumPos hdr) pos0
else pos1 pos0
toI = Just . fromIntegral
srcCluster = sourceHandleRange hdl (toI pos0) (toI len)
bs <- srcCluster $$ sinkGet getRemainingLazyByteString
case BL.uncons bs of
Just (0, cluster) -> return $ Cluster cluster
Just (1, cluster) -> return $ Cluster cluster
Just (4, cluster) ->
Cluster <$> (runResourceT $ sourceLbs cluster $$ decompress Nothing =$ sinkLbs)
Just (x, _) -> throw . ZimParseError $
"Cluster " ++ show i ++
" (offset: " ++ showHex pos0 "" ++ ", length: " ++ show len ++
") compressed with unsupported type: " ++ show x
Nothing -> throw . ZimParseError $
"Insufficient bytes for cluster " ++ show i
getBlob :: RunZim h => h -> (ClusterNumber, BlobNumber) -> IO Blob
getBlob h (c, BlobNumber b) = do
Cluster cluster <- h `getCluster` c
let src = sourceLbs (BL.drop (4 * fromIntegral b) cluster)
(pos0, pos1) <- src $$ sinkGet $ (,) <$> getWord32le <*> getWord32le
let len = pos1 pos0
return . Blob . BL.take (fromIntegral len) $ BL.drop (fromIntegral pos0) cluster
getMainPageUrl :: RunZim h => h -> IO (Maybe Url)
getMainPageUrl h = runZim h $ \hdl hdr ->
case zimMainPage hdr of
Nothing -> return Nothing
Just i -> do
de <- (hdl, hdr) `getDE` UrlIndex i
return . Just $ mkNsUrl (zimDeNamespace de) (zimDeUrl de)
class ZimGetContent k where
getContent :: RunZim h => h -> k -> IO (Maybe (B.ByteString, BL.ByteString))
instance ZimGetContent (MimeList, ZimDirEnt) where
getContent h (ml, de) = runZim h $ \hdl hdr -> do
case zimDeType de of
ZimRedirectEntry ->
let u = UrlIndex . fromJust $ zimDeRedirectIndex de
in (hdl, hdr) `getDE` u >>= ((hdl, hdr) `getContent`)
ZimArticleEntry -> do
let (Just c, Just b) = (zimDeClusterNumber de, zimDeBlobNumber de)
content <- unBlob <$> (hdl, hdr) `getBlob` (ClusterNumber c, BlobNumber b)
return $ Just (ml ! zimDeMimeType de, content)
_ ->
return $ Just (ml ! zimDeMimeType de, BL.empty)
instance ZimGetContent ZimDirEnt where
getContent h de = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, de)
instance ZimGetContent (MimeList, Url) where
getContent h (ml, url) = runZim h $ \hdl hdr -> do
des <- (hdl, hdr) `searchDE` url
case des of
[] -> return Nothing
((_, de) : _) -> (hdl, hdr) `getContent` (ml, de)
instance ZimGetContent Url where
getContent h url = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, url)
instance ZimGetContent (MimeList, Title) where
getContent h (ml, title) = runZim h $ \hdl hdr -> do
des <- (hdl, hdr) `searchDE` title
case des of
[] -> return Nothing
((_, de) : _) -> (hdl, hdr) `getContent` (ml, de)
instance ZimGetContent Title where
getContent h title = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, title)
instance ZimGetContent (MimeList, UrlIndex) where
getContent h (ml, u) = runZim h $ \hdl hdr -> do
de <- (hdl, hdr) `getDE` u
(hdl, hdr) `getContent` (ml, de)
instance ZimGetContent UrlIndex where
getContent h u = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, u)
instance ZimGetContent (MimeList, TitleIndex) where
getContent h (ml, t) = runZim h $ \hdl hdr -> do
de <- (hdl, hdr) `getDE` t
(hdl, hdr) `getContent` (ml, de)
instance ZimGetContent TitleIndex where
getContent h t = runZim h $ \hdl hdr -> do
ml <- getMimeList (hdl, hdr)
(hdl, hdr) `getContent` (ml, t)
binarySearch :: (Int -> IO (Ordering, a)) -> Int -> Int -> IO (Maybe a)
binarySearch f low high =
if high < low
then return Nothing
else do
let mid = (low + high) `div` 2
(o, x) <- f mid
case o of
LT -> binarySearch f low (mid 1)
GT -> binarySearch f (mid + 1) high
EQ -> return $ Just x
class ZimSearchDE k where
searchDE :: RunZim h => h -> k -> IO [(Int, ZimDirEnt)]
instance ZimSearchDE Url where
searchDE h url = runZim h $ \hdl hdr -> do
let f i = do
de <- (hdl, hdr) `getDE` UrlIndex i
let v = mkNsUrl (zimDeNamespace de) (zimDeUrl de)
return (compare url v, (UrlIndex i, de))
res <- binarySearch f 0 (zimArticleCount hdr 1)
return $ maybe [] (\(UrlIndex i, r) -> [(i, r)]) res
instance ZimSearchDE Title where
searchDE h title = runZim h $ \hdl hdr -> do
let f i = do
de <- (hdl, hdr) `getDE` TitleIndex i
let v = mkNsTitle (zimDeNamespace de) (zimDeTitle de)
return (compare title v, (TitleIndex i, de))
res <- binarySearch f 0 (zimArticleCount hdr 1)
return $ maybe [] (\(TitleIndex i, r) -> [(i, r)]) res
instance ZimSearchDE TitlePrefix where
searchDE h (TitlePrefix pre) = runZim h $ \hdl hdr -> do
let preLen = B8.length pre 2
limit = zimArticleCount hdr 1
mkT x = mkNsTitle (zimDeNamespace x) (B8.take preLen (zimDeTitle x))
g idx = (\x -> (x, mkT x)) <$> (hdl, hdr) `getDE` idx
lowerBound i = do
de <- (hdl, hdr) `getDE` TitleIndex i
case compare (Title pre) (mkT de) of
EQ -> if i == 0
then return (EQ, (TitleIndex i, de))
else return (LT, (TitleIndex i, de))
lgt -> do
(de', Title v') <- g $ TitleIndex (i + 1)
if pre `B8.isPrefixOf` v'
then return (EQ, (TitleIndex $ i + 1, de'))
else return (lgt, (TitleIndex i, de))
upperBound i = do
de <- (hdl, hdr) `getDE` TitleIndex i
case compare (Title pre) (mkT de) of
EQ -> if i == limit
then return (EQ, (TitleIndex i, de))
else return (GT, (TitleIndex i, de))
lgt -> do
(de', Title v') <- g $ TitleIndex (i 1)
if pre `B8.isPrefixOf` v'
then return (EQ, (TitleIndex $ i 1, de'))
else return (lgt, (TitleIndex i, de))
lb <- binarySearch lowerBound 0 limit
case lb of
Nothing -> return []
_ -> do
ub <- binarySearch upperBound 0 limit
let Just (TitleIndex lbi, lb') = lb
Just (TitleIndex ubi, ub') = ub
return [(lbi, lb'), (ubi, ub')]