{-# LANGUAGE FlexibleContexts #-}
module Bio.Glf (
        GlfSeq(..),
        GlfRec(..),
        enee_glf_file,
        enum_glf_file,
        enum_glf_handle
    ) where

import Bio.Iteratee
import Bio.Iteratee.Bgzf
import Control.Monad
import Data.Bits
import System.IO

import qualified Data.ByteString.Char8  as S
import qualified Data.Iteratee.ListLike as I


data GlfRec = SNP { glf_refbase :: {-# UNPACK #-} !Char
                  , glf_offset  :: {-# UNPACK #-} !Int
                  , glf_depth   :: {-# UNPACK #-} !Int
                  , glf_min_lk  :: {-# UNPACK #-} !Int
                  , glf_mapq    :: {-# UNPACK #-} !Int
                  , glf_lk      :: [Int] }
            | Indel { glf_refbase :: {-# UNPACK #-} !Char
                    , glf_offset  :: {-# UNPACK #-} !Int
                    , glf_depth   :: {-# UNPACK #-} !Int
                    , glf_min_lk  :: {-# UNPACK #-} !Int
                    , glf_mapq    :: {-# UNPACK #-} !Int
                    , glf_lk_hom1 :: {-# UNPACK #-} !Int
                    , glf_lk_hom2 :: {-# UNPACK #-} !Int
                    , glf_lk_het  :: {-# UNPACK #-} !Int
                    , glf_is_ins1 :: !Bool
                    , glf_is_ins2 :: !Bool
                    , glf_seq1    :: {-# UNPACK #-} !S.ByteString
                    , glf_seq2    :: {-# UNPACK #-} !S.ByteString }
    deriving Show

data GlfSeq = GlfSeq { glf_seqname :: {-# UNPACK #-} !S.ByteString
                     , glf_seqlen  :: {-# UNPACK #-} !Int }
    deriving Show


enee_glf_recs :: Monad m => Enumeratee S.ByteString [GlfRec] m b
enee_glf_recs = eneeCheckIfDone step
  where
    step  oit'       = I.isFinished >>= step' oit'

    step' oit'  True = return $ liftI oit'
    step' oit' False = do
        type_ref <- I.head
        let refbase = "XACMGRSVTWYHKDBN" !! fromIntegral (type_ref .&. 0xf)
        case type_ref `shiftR` 4 of
                0 -> return $ oit' $ EOF Nothing
                1 -> do r <- get_snp $ get_common (SNP refbase)
                        eneeCheckIfDone step . oit' $ Chunk [r]
                2 -> do r <- get_indel $ get_common (Indel refbase)
                        eneeCheckIfDone step . oit' $ Chunk [r]
                x -> fail $ "unknown GLF record #" ++ show x

    get_common f = return f
        `ap` (fromIntegral `liftM` endianRead4 LSB)
        `ap` (fromIntegral `liftM` endianRead3 LSB)
        `ap` (fromIntegral `liftM` I.head)
        `ap` (fromIntegral `liftM` I.head)

    get_snp f = f `ap` get_lk_arr
    get_lk_arr = replicateM 10 (fromIntegral `liftM` I.head)

    get_indel f = do
        f' <- f `ap` (fromIntegral `liftM` I.head)
                `ap` (fromIntegral `liftM` I.head)
                `ap` (fromIntegral `liftM` I.head)
        l1 <- getInt16le
        l2 <- getInt16le
        liftM2 (f' (l1 >= 0) (l2 >= 0)) (iGetString (abs l1)) (iGetString (abs l2))

    getInt16le = do i <- endianRead2 LSB
                    return $ if i > 0x7fff then fromIntegral i - 0x10000
                                           else fromIntegral i

enee_glf_seq :: Monad m => (GlfSeq -> Enumeratee [GlfRec] a m b) -> Enumeratee S.ByteString a m b
enee_glf_seq per_seq oit = do l <- endianRead4 LSB
                              s <- liftM2 GlfSeq (S.init `liftM` iGetString (fromIntegral l))
                                                 (fromIntegral `liftM` endianRead4 LSB)
                              enee_glf_recs ><> per_seq s $ oit

-- | Iterates over a GLF file.  In @get_glf_file per_seq per_file@, the
-- enumerator @per_file genome_name@, where @genome_name@ is the name
-- stored in the GLF header, is run once, then the enumeratee @per_seq
-- glfseq@ is iterated over the records in each sequence.
enee_glf_file :: Monad m => (GlfSeq -> Enumeratee [GlfRec] a m b)
                         -> (S.ByteString -> Enumerator a m b)
                         -> Enumeratee S.ByteString a m b
enee_glf_file per_seq per_file oit = do
    matched <- I.heads (S.pack "GLF\003")
    when (matched /= 4) (fail "GLF signature not found")
    nm <- endianRead4 LSB >>= iGetString . fromIntegral
    lift (per_file nm oit) >>= loop
  where
    -- loop :: Monad m => Iteratee a m b -> Iteratee S.ByteString m (Iteratee a m b)
    loop  it       = I.isFinished >>= loop' it
    loop' it  True = return it
    loop' it False = loop =<< enee_glf_seq per_seq it


-- | Enumerate the contents of a GLF file, apply suitable Enumeratees to
-- both sequences and records, resulting in an Enumerator of /whatever/,
-- typically output Strings or records...
--
-- This type is positively weird and I'm not entirely sure this is the
-- right way to go about it.
enum_glf_file :: (MonadIO m, MonadMask m)
              => FilePath
              -> (GlfSeq -> Enumeratee [GlfRec] a m b)
              -> (S.ByteString -> Enumerator a m b)
              -> Enumerator a m b
enum_glf_file fp per_seq per_file output =
    enumFile defaultBufSize fp >=> run $
    joinI $ decompressBgzf $
    enee_glf_file per_seq per_file output

enum_glf_handle :: (MonadIO m, MonadMask m)
                => Handle
                -> (GlfSeq -> Enumeratee [GlfRec] a m b)
                -> (S.ByteString -> Enumerator a m b)
                -> Enumerator a m b
enum_glf_handle hdl per_seq per_file output =
    enumHandle defaultBufSize hdl >=> run $
    joinI $ decompressBgzf $
    enee_glf_file per_seq per_file output