{-# LANGUAGE TemplateHaskell, QuasiQuotes #-} module Tests.Count ( tgroup_Count ) where import Test.Tasty.TH import Test.Tasty.HUnit import qualified Data.IntervalIntMap as IM import qualified Data.Set as S import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as B8 import qualified Data.Vector as V import qualified Data.Vector.Unboxed as VU import qualified Data.Map as M import qualified Data.Conduit.Combinators as C import qualified Data.Conduit.Binary as CB import qualified Data.Conduit.List as CL import qualified Data.Conduit as C import Data.Conduit ((.|)) import Control.Monad.IO.Class (liftIO) import Data.Maybe import Interpretation.Count import qualified Interpretation.Count.RefSeqInfoVector as RSV import FileOrStream (FileOrStream(..)) import Tests.Utils import Utils.Here import NGLess tgroup_Count = $(testGroupGenerator) readCountFile :: FilePath -> IO (M.Map B.ByteString Double) readCountFile fp = C.runConduitRes $ C.sourceFile fp .| CB.lines .| (C.await >> (C.awaitForever C.yield)) -- skip first line .| CL.foldMap parseLine where parseLine line = case B8.split '\t' line of [h,val] -> M.singleton h (read $ B8.unpack val) _ -> error ("Could not parse line: " ++ show line) runSamGffAnnotation:: B.ByteString -> B.ByteString -> CountOpts -> NGLessIO (M.Map B.ByteString Double) runSamGffAnnotation sam_content gff_content opts = do sam_fp <- asTempFile sam_content "sam" gff_fp <- asTempFile gff_content "gff" ann <- loadAnnotator (AnnotateGFF gff_fp) opts p <- performCount (File sam_fp) "testing" ann opts liftIO $ readCountFile p listNub :: (Ord a) => [a] -> [a] listNub = S.toList . S.fromList defCountOpts = CountOpts { optFeatures = [] , optSubFeatures = Nothing , optIntersectMode = annotationRule IntersectUnion , optAnnotationMode = AnnotateSeqName , optStrandMode = SMBoth , optMinCount = 0.0 , optMMMethod = MMUniqueOnly , optDelim = "\t" , optNormMode = NMRaw , optIncludeMinus1 = False } extractIds :: [AnnotationInfo] -> [Int] extractIds = map (\(AnnotationInfo _ ix) -> ix) very_short_gff = "test_samples/very_short.gtf" case_load_very_short = do [GFFAnnotator immap headers szmap] <- testNGLessIO $ loadAnnotator (AnnotateGFF very_short_gff) defCountOpts { optFeatures = ["gene"] } let usedIDs = extractIds $ concatMap IM.elems $ M.elems immap length (listNub usedIDs) @?= V.length headers minimum usedIDs @?= 0 maximum usedIDs @?= V.length headers - 1 VU.length szmap @?= V.length headers let mix = do ix <- V.elemIndex "WBGene00010199" headers return $! szmap VU.! ix mix @?= Just (721-119+1) short3 :: B.ByteString short3 = [here| V protein_coding gene 7322 8892 . - . gene_id "WBGene00008825"; gene_name "F14H3.6"; gene_source "ensembl"; gene_biotype "protein_coding"; X protein_coding gene 140 218 . + . gene_id "WBGene00020330"; gene_name "T07H6.1"; gene_source "ensembl"; gene_biotype "protein_coding"; X protein_coding gene 632 733 . + . gene_id "WBGene00000526"; gene_name "clc-5"; gene_source "ensembl"; gene_biotype "protein_coding"; |] -- this is a regression test case_load_gff_order = do fp <- testNGLessIO $ asTempFile short3 "gtf" [GFFAnnotator immap headers _] <- testNGLessIO $ loadAnnotator (AnnotateGFF fp) defCountOpts { optFeatures = ["gene"] } let [h] = extractIds . IM.elems . fromJust $ M.lookup "V" immap (headers V.! h) @?= "WBGene00008825" short1 :: B.ByteString short1 = [here| X protein_coding gene 610 1473 . + . gene_id "WBGene00002254"; gene_name "lbp-2"; gene_source "ensembl"; gene_biotype "protein_coding"; |] short_sam :: B.ByteString short_sam = [here| @SQ SN:X LN:18942 SRR070372.1096 0 X 1174 60 62S75M1D37M46D58M10S * 0 0 GTTCTACAACGTCCAGATCGGAAGCAAGTTCGAAGGAGAGGGTCTTGATAACACCAAGCACGAGGTTACCTTCACTCTCAAGGACGGACACTTGTTCGAACATCACAAGCCACTTGAAGAGGGAGAATCCAAGGAAGAACCTATGAGTATTACTTTGATGGAGATTTTCTTATTCAGAAGATGAGCTTCAACAATATCGAAGGCCGCAGATTCTACAAGAGACTCCCATAAAGTTAACTATC IIIIIIF@@@CIIIIIIIIIIIIIIIIIIIIIHHIIIB=5669CIIIIIIIIIIIIIIIIIIIIIIIIIIHHHIHIIIIIIIIIIIIIIIIIIIHIIIIIIIIIIIIIIIHIH>>>FIIGBB@E??;75444<<:62///1>?BAAAD?AE;72217