module Bio.GenbankTools (
extractSpecificFeatureSeqData,
extractSpecificFeatureSequence,
module Bio.GenbankData
) where
import Bio.GenbankData
import Control.Monad
import Data.List
import Data.Maybe
import Bio.Core.Sequence
import Data.Int
import Bio.Sequence.Fasta
import qualified Data.ByteString.Lazy.Char8 as L
extractSpecificFeatureSeqData :: Maybe String -> Genbank -> [SeqData]
extractSpecificFeatureSeqData specificFeature genbank = seqdatas
where currentFeatures = getCurrentFeatures specificFeature genbank
coordinates = map featureCoordinates currentFeatures
fullSequence = origin genbank
seqdatas = concatMap (extractSeqDataList fullSequence) coordinates
extractSpecificFeatureSequence :: Maybe String -> Genbank -> [Sequence]
extractSpecificFeatureSequence specificFeature genbank = sequences
where currentAccession = L.unpack (locus genbank)
currentFeatures = getCurrentFeatures specificFeature genbank
fields = [ x | x@(Field {}) <- concatMap attributes currentFeatures]
locusTags = map fieldValue (filter (\field -> fieldType field == L.pack "locus_tag") fields)
currentHeaders = map (\locus_tag -> L.pack (L.unpack locus_tag ++ " " ++ currentAccession)) locusTags
coordinates = map featureCoordinates currentFeatures
fullSequence = origin genbank
seqdata = concatMap (extractSeqDataList fullSequence) coordinates
sequences = map (\(header,seqdata) -> Seq (SeqLabel header) seqdata Nothing) $ zip currentHeaders seqdata
getCurrentFeatures :: Maybe String -> Genbank -> [Feature]
getCurrentFeatures specificFeature genbank
| isNothing specificFeature = features genbank
| otherwise = filter (\x -> featureType x == L.pack (fromJust specificFeature))(features genbank)
extractSeqDataList :: SeqData -> CoordinateSet -> [SeqData]
extractSeqDataList genbankSeq seqCoordinates
| isNothing (setType seqCoordinates) = [extractSeqData genbankSeq (head (setCoordinates seqCoordinates))]
| fromJust (setType seqCoordinates) == "join" = extractJoinSeqData genbankSeq seqCoordinates
| fromJust (setType seqCoordinates) == "order" = extractOrderSeqData genbankSeq seqCoordinates
extractJoinSeqData :: SeqData -> CoordinateSet -> [SeqData]
extractJoinSeqData genbankSeq seqCoordinates = joinSequence
where coordinateList = setCoordinates seqCoordinates
partialSequences = map (extractByteStringFromSeqData genbankSeq) coordinateList
joinSequence = [SeqData (L.concat partialSequences)]
extractOrderSeqData :: SeqData -> CoordinateSet -> [SeqData]
extractOrderSeqData fullSeq seqCoordinates = orderSequences
where coordinateList = setCoordinates seqCoordinates
orderSequences = map (extractSeqData fullSeq) coordinateList
extractSeqData :: SeqData -> Coordinates -> SeqData
extractSeqData fullSequence seqCoordinates
| complement seqCoordinates = SeqData (revcomplement' subsequence)
| otherwise = SeqData subsequence
where subsequence = extractByteStringFromSeqData fullSequence seqCoordinates
extractByteStringFromSeqData :: SeqData -> Coordinates -> L.ByteString
extractByteStringFromSeqData fullSequence seqCoordinates = substring
where endTruncatedSequence = L.take (fromIntegral (coordinatesFrom seqCoordinates + coordinatesTo seqCoordinates):: Int64) (unSD fullSequence)
substring = L.drop (fromIntegral (coordinatesFrom seqCoordinates) :: Int64) endTruncatedSequence
revcomplement' :: L.ByteString -> L.ByteString
revcomplement' = L.map complement' . L.reverse
complement' :: Char -> Char
complement' 'A' = 'T'
complement' 'T' = 'A'
complement' 'C' = 'G'
complement' 'G' = 'C'
complement' 'a' = 't'
complement' 't' = 'a'
complement' 'c' = 'g'
complement' 'g' = 'c'
complement' x = x