-- | Parser for @FastA/FastQ@, 'ByteStream' style, written such that it
-- works well with module "Bio.Bam".
--
-- Input streams are broken into numbered lines, then into records.
-- Records can start with empty lines, which are ignored, or random
-- junk, which is ignored, but results in a warning, followed by a
-- header indicating either a @FastA@ (begins with @\>@ or @;@) or
-- @FastQ@ record (begins with @\@@).  More description lines begining
-- with @;@ are allowed, and silently ignored.  All following lines not
-- starting with @+@, @\>@, @;@ or @\@@ are sequence lines.  (Only) in a
-- @FastQ@ record, this is followed by a separator line starting with a
-- @+@, which is ignored, and exactly as many quality lines as there
-- were sequence lines.  A missing separator results in a warning and
-- the record being parsed without quality scores.
--
-- In sequence lines, IUPAC-IUB ambiguity codes are converted to
-- 'Nucleotides', white space is skipped silently.  Any other character
-- becomes an unknown base ('=' in SAM) and a warning is emitted.  Note
-- that downstream tools are unlikely to handle the resulting unknown
-- bases and/or empty records gracefully.  If the quality lines do not
-- have the same total length as the sequence lines (this includes
-- missing quality lines due to end-of-stream), a warning is emitted,
-- and the record receives no quality scores (just as if it was a
-- @FastA@ record).  Else, if the quality lines have a different layout
-- than the sequence lines, a warning is emitted, but they are still
-- used.
--
-- Quality scores must be stored as raw bytes with offset 33.  (Other
-- variants, like 454's ASCII qualities and Solexa's raw bytes with
-- offset 64 are difficult to detect, and extinct in the wild anyway.)
-- If the second word of the header stores multiple fields, we try to
-- extract Illumina's \"QC failed\" flag and either an index sequence or
-- a read group name from it.
--
-- Other flags are commonly encoded into the sequence names.  We do not
-- handle those here, but most of the conventions at MPI EVAN are dealt
-- with by 'Bio.Bam.Evan.removeWarts'.

module Bio.Bam.Fastq
    ( parseFastq
    , EmptyRecord(..)
    , IncoherentQualities(..)
    , IncongruentQualities(..)
    , JunkFound(..)
    , QualitiesMissing(..)
    , SequenceHasGaps(..)
    ) where

import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Prelude
import Bio.Streaming
import Bio.Streaming.Bytes ( lines' )

import qualified Data.ByteString                    as B
import qualified Data.ByteString.Char8              as C
import qualified Data.ByteString.Unsafe             as B
import qualified Data.Vector.Storable               as W
import qualified Streaming.Prelude                  as Q

-- | Emitted when random text is found instead of a header.
data JunkFound = JunkFound !Int !Bytes deriving (Typeable, Show)

instance Exception JunkFound where
    displayException (JunkFound n s) = printf "junk found at line %d: %s" n (unpack s)

-- | Emitted when a quality separator was expected, but not found.
data QualitiesMissing = QualitiesMissing !Int !Bytes deriving (Typeable, Show)

instance Exception QualitiesMissing where
    displayException (QualitiesMissing 0 _) = printf "expected '+' symbol at end of file"
    displayException (QualitiesMissing n s) = printf "expected '+' symbol at line %d, but found %s" n (unpack s)

-- | Emitted when a quality record does not fit the sequence record.
data IncoherentQualities = IncoherentQualities !Int !Bytes deriving (Typeable, Show)

instance Exception IncoherentQualities where
    displayException (IncoherentQualities n s) = printf "quality record of incorrect length ignored at line %d (%s)" n (unpack s)

-- | Emitted when a quality record has different layout than the
-- sequence.
data IncongruentQualities = IncongruentQualities !Int !Bytes deriving (Typeable, Show)

instance Exception IncongruentQualities where
    displayException (IncongruentQualities n s) = printf "quality and sequence have different layouts at line %d (%s)" n (unpack s)

-- | Emitted when a sequence record contains strange characters
data SequenceHasGaps = SequenceHasGaps !Int !Bytes deriving (Typeable, Show)

instance Exception SequenceHasGaps where
    displayException (SequenceHasGaps n cs) = printf "undefined characters %s stored as unknown bases at line %d" (show cs) n

data EmptyRecord = EmptyRecord !Int !Bytes deriving (Typeable, Show)

instance Exception EmptyRecord where
    displayException (EmptyRecord n s) = printf "(effectively) empty record at line %d (%s)" n (unpack s)


{-# INLINE parseFastq #-}
parseFastq :: MonadLog m => ByteStream m r -> Stream (Of BamRec) m r
parseFastq = Q.unfoldr go . Q.zip (Q.enumFrom (1::Int)) . lines'
  where
    go = inspect >=> \case
        Left r                                    ->  return (Left r)
        Right ((i,h) :> ls)
            | B.null h                            ->  go ls
            | C.head h == '>' || C.head h == ';'  ->  goFasta (i, B.tail h) (Q.dropWhile isDescr ls)
            | C.head h == '@'                     ->  goFastq (i, B.tail h) (Q.dropWhile isDescr ls)
            | otherwise                           ->  logMsg Warning (JunkFound i h) >> go ls

    isDescr  (_,s) = not (B.null s) &&  C.head s == ';'
    isHeader (_,s) = not (B.null s) && (C.head s == '>' || C.head s == '@' || C.head s == ';')
    isSep    (_,s) = not (B.null s) &&  C.head s == '+'
    isSeq x = not (isHeader x || isSep x)

    goFasta h ls = do
        sq :> ls' <- Q.toList $ Q.span isSeq ls
        make_record h sq Nothing ls'

    goFastq h ls = do
        sq :> ls1  <- Q.toList $ Q.span isSeq ls
        Q.next ls1 >>= \case
            Right (sep, ls2)
                | isSep sep -> do
                    qs :> ls3 <- Q.toList $ Q.splitAt (length sq) ls2
                    if sum (map (B.length . snd) qs) /= sum (map (B.length . snd) sq) then do
                        logMsg Error $ uncurry IncoherentQualities  h
                        make_record h sq Nothing ls3
                      else do
                        when (map (B.length . snd) qs /= map (B.length . snd) sq) $
                            logMsg Warning $ uncurry IncongruentQualities h
                        make_record h sq (Just qs) ls3
                | otherwise -> do
                    logMsg Error $ uncurry QualitiesMissing sep
                    make_record h sq Nothing (Q.cons sep ls2)
            Left x -> do
                    logMsg Error $ QualitiesMissing 0 C.empty
                    make_record h sq Nothing (pure x)

    make_record h sq qs k = do
        when (ngaps > 0) . logMsg Warning $
            SequenceHasGaps (fst $ head sq) (B.filter (isGap . toNucleotides) $ B.concat $ map snd sq)
        when (l == 0) . logMsg Warning $ uncurry EmptyRecord h
        return $ Right (r,k)
      where
        !l     = sum $ map (B.length . snd) sq
        (!nseq, !ngaps) = mkSeq l sq
        !qual  = case qs of Nothing -> Nothing
                            Just [] -> Nothing
                            Just  q -> Just $! mkQual l q

        (!qname, !descr) = B.break (== 32) (snd h)
        !fflag = B.drop 1 . C.dropWhile (/= ':') $ descr

        !r = if B.length fflag < 2 || C.index fflag 1 /= ':' || (C.head fflag /= 'Y' && C.head fflag /= 'N')
             then nullBamRec { b_qname = qname, b_seq = nseq, b_qual = qual }
             else let !flag | C.head fflag /= 'Y' = b_flag nullBamRec .|. flagFailsQC
                            | otherwise           = b_flag nullBamRec

                      !sample = B.takeWhile (/=32). B.drop 1. C.dropWhile (/=':'). B.drop 1. C.dropWhile (/=':') $ fflag
                      !exts | B.null sample                    =  [                   ]
                            | C.all (`C.elem` "ACGTN") sample  =  [("XI", Text sample)]
                            | otherwise                        =  [("RG", Text sample)]
                  in nullBamRec { b_qname = qname, b_seq = nseq, b_qual = qual, b_flag = flag, b_exts = exts }


mkSeq :: Int -> [(Int,Bytes)] -> (Vector_Nucs_half Nucleotides, Int)
mkSeq ltot xs0 = unsafePerformIO $ do
    fp <- mallocForeignPtrBytes (shiftR (ltot+1) 1)
    g  <- withForeignPtr fp $ \p -> go_even p 0 xs0
    return ( Vector_Nucs_half 0 ltot fp, g )
  where
    go_even !p !g (s:ss)  =  B.unsafeUseAsCStringLen (snd s) $ \(q,l) -> go1_even ss p q g l
    go_even  _ !g [    ]  =  return g

    go_odd !p !g !a (s:ss)  =  B.unsafeUseAsCStringLen (snd s) $ \(q,l) -> go1_odd a ss p q g l
    go_odd !p !g !a [    ]  =  poke p a >> return g

    go1_odd a ss !p !q !g !l
        | l > 0      =  do !b <- unNs . toNucleotides <$> peekByteOff q 0
                           poke p $ a `shiftL` 4 .|. b
                           go1_even ss (plusPtr p 1) (plusPtr q 1) (g + fromEnum (b == 0)) (l-1)
        | otherwise  =  go_odd p g a ss

    go1_even ss !p !q !g !l
        | l > 1      =  do !a <- unNs . toNucleotides <$> peekByteOff q 0
                           !b <- unNs . toNucleotides <$> peekByteOff q 1
                           poke p $ a `shiftL` 4 .|. b
                           go1_even ss (plusPtr p 1) (plusPtr q 2) (g + fromEnum (a == 0) + fromEnum (b == 0)) (l-2)
        | l > 0      =  do !a <- unNs . toNucleotides <$> peekByteOff q 0
                           go_odd p (g + fromEnum (a == 0)) (a `shiftL` 4) ss
        | otherwise  =  go_even p g ss


mkQual :: Int -> [(Int, Bytes)] -> W.Vector Qual
mkQual ltot qs0 = unsafePerformIO $ do
    fp <- mallocForeignPtrBytes ltot
    withForeignPtr fp $ \p -> go (castPtr p) qs0
    return $! W.unsafeFromForeignPtr0 fp ltot
  where
    go !p (s:ss)  =  B.unsafeUseAsCStringLen (snd s) $ \(q,l) -> go1 ss p (castPtr q) l
    go  _ [    ]  =  return ()

    go1 ss !p !q !l
        | l > 0      =  do peek q >>= poke p . subtract (33::Word8)
                           go1 ss (plusPtr p 1) (plusPtr q 1) (pred l)
        | otherwise  =  go p ss