module Bio.Genocall.AvroFile where
import Bio.Base
import Bio.Bam.Header
import Bio.Bam.Pileup
import Control.Applicative
import Data.Aeson
import Data.Avro
import Data.Binary.Builder
import Data.Binary.Get
import Data.List ( intersperse )
import Data.MiniFloat
import Data.Scientific ( toBoundedInteger )
import Data.Text.Encoding ( encodeUtf8 )
import qualified Data.ByteString as B
import qualified Data.HashMap.Strict as H
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as U
import qualified Data.Sequence as Z
data GenoCallBlock = GenoCallBlock
{ reference_name :: !Refseq
, start_position :: !Int
, called_sites :: [ GenoCallSite ] }
deriving (Show, Eq)
data GenoCallSite = GenoCallSite
{ snp_stats :: !CallStats
, snp_likelihoods :: !(U.Vector Mini)
, ref_allele :: !Nucleotides
, indel_stats :: !CallStats
, indel_variants :: [ IndelVariant ]
, indel_likelihoods :: !(U.Vector Mini) }
deriving (Show, Eq)
compact_likelihoods :: U.Vector Prob -> U.Vector Mini
compact_likelihoods = U.map $ float2mini . negate . unPr
deriveAvros [ ''GenoCallBlock, ''GenoCallSite, ''CallStats, ''IndelVariant ]
instance Avro V_Nuc where
toSchema _ = return $ object [ "type" .= String "bytes", "doc" .= String doc ]
where doc = T.pack $ intersperse ',' $ show $ [minBound .. maxBound :: Nucleotide]
toBin (V_Nuc v) = encodeIntBase128 (U.length v) <> U.foldr ((<>) . singleton . unN) mempty v
fromBin = decodeIntBase128 >>= \l -> V_Nuc . U.fromListN l . map N . B.unpack <$> getByteString l
toAvron (V_Nuc v) = String . T.pack . map w2c . U.toList $ U.map unN v
instance Avro V_Nucs where
toSchema _ = return $ object [ "type" .= String "bytes", "doc" .= String doc ]
where doc = T.pack $ intersperse ',' $ show $ [minBound .. maxBound :: Nucleotides]
toBin (V_Nucs v) = encodeIntBase128 (U.length v) <> U.foldr ((<>) . singleton . unNs) mempty v
fromBin = decodeIntBase128 >>= \l -> V_Nucs . U.fromListN l . map Ns . B.unpack <$> getByteString l
toAvron (V_Nucs v) = String . T.pack . map w2c . U.toList $ U.map unNs v
instance Avro Nucleotides where
toSchema _ = return $ String "int"
toBin = encodeIntBase128 . unNs
fromBin = Ns <$> decodeIntBase128
toAvron = Number . fromIntegral . unNs
instance Avro Mini where
toSchema _ = return $ String "int"
toBin = encodeIntBase128 . unMini
fromBin = Mini <$> decodeIntBase128
toAvron = Number . fromIntegral . unMini
instance Avro Refseq where
toSchema _ = getNamedSchema "Refseq"
toBin = encodeIntBase128 . unRefseq
fromBin = Refseq <$> decodeIntBase128
toAvron = Number . fromIntegral . unRefseq
getRefseqs :: AvroMeta -> Refs
getRefseqs meta
| Object o <- findSchema "Refseq" meta
, Just (String "enum") <- H.lookup "type" o
, Just (Array syms) <- H.lookup "symbols" o
= Z.fromList [ BamSQ (encodeUtf8 nm) ln [] | (String nm, ln) <- V.toList syms `zip` lengths ]
| otherwise = Z.empty
where
lengths = case decodeStrict =<< H.lookup "biohazard.refseq_length" meta of
Just (Array lns) -> [ case l of Number n -> maybe 0 id $ toBoundedInteger n ; _ -> 0 | l <- V.toList lns ]
_ -> repeat 0