{-# LANGUAGE TemplateHaskell, OverloadedStrings #-}
module Bio.Genocall.AvroFile where

import Bio.Base
import Bio.Bam.Pileup
import Data.Aeson
import Data.Avro hiding ((.=))
import Data.Binary.Builder
import Data.Binary.Get
import Data.Monoid

import qualified Data.ByteString                as B
import qualified Data.Text                      as T
import qualified Data.Vector.Unboxed            as U

-- File format for genotype calls.

-- | One block of a container file.  To write such a file, calls are
-- converted into a stream of these objects.  To cut down on redundancy,
-- each block carries a header that names the reference sequence and the
-- start, followed by the calls.  The calls inside a block have contiguous
-- coordinates; a new block is started whenever positions have to be
-- skipped, and also when the current block is getting too large.
data GenoCallBlock = GenoCallBlock
    { reference_name :: T.Text             -- ^ name of the reference sequence
    , start_position :: Int                -- ^ start of the block
    , called_sites   :: [ GenoCallSite ]   -- ^ the calls, at contiguous coordinates
    }

-- | A single entry of 'called_sites'.  Judging by the field names, it
-- holds one set of statistics and likelihoods for the SNP call and one
-- for the indel call, plus the list of indel variants -- TODO confirm
-- against the code that produces these values.
data GenoCallSite = GenoCallSite
    { snp_stats :: CallStats
      -- ^ 'CallStats' belonging to the SNP call
    , snp_likelihoods :: [ Int ]
      -- ^ SNP likelihoods as a list of 'Int'; the original source notes
      -- @B.ByteString@ next to this field, presumably as an alternative
      -- representation (NOTE(review): confirm intent)
    , indel_stats :: CallStats
      -- ^ 'CallStats' belonging to the indel call
    , indel_variants :: [ IndelVariant ]
      -- ^ the indel variants
    , indel_likelihoods :: [ Int ]
      -- ^ indel likelihoods as a list of 'Int'; the original source notes
      -- @B.ByteString@ here as well (NOTE(review): confirm intent)
    }

-- Template Haskell splice: runs 'deriveAvros' over the record types used
-- in the file format (presumably generating their 'Avro' instances --
-- confirm against the definition of 'deriveAvros').  The order of the
-- names is kept as it was.
$( deriveAvros
     [ ''GenoCallBlock
     , ''GenoCallSite
     , ''CallStats
     , ''IndelVariant
     ] )

-- | Hand-written 'Avro' instance for 'V_Nuc': the vector is written as
-- its element count followed by one byte per element, and advertised in
-- the schema as Avro @bytes@.
instance Avro V_Nuc where
    -- The schema does not depend on the value, so the argument is ignored.
    toSchema _ =
        return (object [ "type" .= String "bytes"
                       , "doc"  .= String "A,C,G,T" ])

    -- Length prefix (via 'encodeIntBase128'), then the 'unN' byte of each
    -- element in order.
    toBin (V_Nuc nucs) = encodeIntBase128 (U.length nucs) <> payload
      where
        payload = U.foldr (\n rest -> singleton (unN n) <> rest) mempty nucs

    -- Inverse of 'toBin': read the length, then exactly that many bytes,
    -- and wrap each byte in 'N'.
    fromBin = do
        len <- decodeIntBase128
        raw <- getByteString len
        return (V_Nuc (U.fromList (map N (B.unpack raw))))

    -- JSON rendering: a string with one character per element, obtained
    -- by passing each element's 'unN' byte through 'w2c'.
    toAvron (V_Nuc nucs) =
        String (T.pack [ w2c (unN n) | n <- U.toList nucs ])