module Biobase.SubstMatrix where import Control.DeepSeq (NFData(..)) import Data.Aeson (FromJSON,ToJSON) import Data.Binary (Binary) import Data.Serialize (Serialize) import Data.Vector.Unboxed.Deriving import GHC.Generics (Generic) import qualified Data.Map.Strict as M import qualified Data.Vector.Unboxed as VU import Biobase.Primary.AA (AA,aaRange) import Biobase.Primary.Letter import Biobase.Primary.Nuc.DNA (DNA) import Biobase.Primary.Trans (dnaAAmap) import Biobase.Types.Odds import Data.PrimitiveArray import qualified Biobase.Primary.AA as AA import qualified Biobase.Primary.Nuc.DNA as D -- | Denotes that we are dealing with a similarity score. Higher is more -- similar. data Similarity -- | Denotes that we are dealing with a distance score. Lower is more -- similar. data Distance -- An amino-acid substitution matrix. Tagged with the type of scoring used. newtype AASubstMat t = AASubstMat { aaSubstMat :: Unboxed (Z:.Letter AA:.Letter AA) DLO } deriving (Generic,Eq,Read,Show) instance Binary (AASubstMat t) instance Serialize (AASubstMat t) --instance FromJSON (AASubstMat t) --instance ToJSON (AASubstMat t) instance NFData (AASubstMat t) -- | @PAM@ matrices are similarity matrices. type SubstPAM = AASubstMat Similarity -- | @BLOSUM@ matrices are distance matrices. type SubstBLOSUM = AASubstMat Distance -- | Substitution matrix from amino acids to nucleotide triplets. newtype ANuc3SubstMat t = ANuc3SubstMat { anuc3SubstMat :: Unboxed (Z:.Letter AA:.Letter DNA:.Letter DNA:.Letter DNA) DLO } deriving (Generic,Eq,Read,Show) instance Binary (ANuc3SubstMat t) instance Serialize (ANuc3SubstMat t) --instance FromJSON (ANuc3SubstMat t) --instance ToJSON (ANuc3SubstMat t) instance NFData (ANuc3SubstMat t) -- | Substitution matrix from amino acids to degenerate nucleotide -- 2-tuples. The third nucleotide letter is missing. newtype ANuc2SubstMat t = ANuc2SubstMat { anuc2SubstMat :: Unboxed (Z:.Letter AA:.Letter DNA:.Letter DNA) DLO } deriving (Generic,Eq,Read,Show) instance Binary (ANuc2SubstMat t) instance Serialize (ANuc2SubstMat t) --instance FromJSON (ANuc2SubstMat t) --instance ToJSON (ANuc2SubstMat t) instance NFData (ANuc2SubstMat t) -- | Substitution matrix from amino acids to degenerate nucleotide -- 1-tuples. Two out of three nucleotides in a triplet are missing. newtype ANuc1SubstMat t = ANuc1SubstMat { anuc1SubstMat :: Unboxed (Z:.Letter AA:.Letter DNA) DLO } deriving (Generic,Eq,Read,Show) instance Binary (ANuc1SubstMat t) instance Serialize (ANuc1SubstMat t) --instance FromJSON (ANuc1SubstMat t) --instance ToJSON (ANuc1SubstMat t) instance NFData (ANuc1SubstMat t) -- | The usual substitution matrix, but here with a codon and an amino acid -- to be compared. mkANuc3SubstMat :: AASubstMat t -> ANuc3SubstMat t mkANuc3SubstMat (AASubstMat m) = ANuc3SubstMat $ fromAssocs (Z:. AA.Stop :. D.A:.D.A:.D.A) (Z:. AA.Z :. D.N:.D.N:.D.N) (DLO $ -999) [ ( (Z:.a:.u:.v:.w) , maybe (DLO $ -999) (\b -> m!(Z:.a:.b)) $ M.lookup uvw dnaAAmap) | a <- aaRange , u <- [D.A .. D.N], v <- [D.A .. D.N], w <- [D.A .. D.N] , let uvw = VU.fromList [u,v,w] ] -- | Create a 2-tuple to amino acid substitution matrix. Here, @f@ combines -- all to entries that have the same 2-tuple index. mkANuc2SubstMat :: (DLO -> DLO -> DLO) -> AASubstMat t -> ANuc2SubstMat t mkANuc2SubstMat f (AASubstMat m) = ANuc2SubstMat $ fromAssocs (Z:. AA.Stop :. D.A:.D.A) (Z:. AA.Z :. D.N:.D.N) (DLO $ -999) . M.assocs . M.fromListWith f $ [ ((Z:.a:.x:.y), maybe (DLO $ -999) (\k -> m!(Z:.a:.k)) $ M.lookup uvw dnaAAmap) | a <- aaRange , u <- [D.A .. D.N], v <- [D.A .. D.N], w <- [D.A .. D.N] , (x,y) <- [ (u,v), (u,w), (v,w) ] , let uvw = VU.fromList [u,v,w] ] -- | The most degenerate case, where just a single nucleotide remains in -- the amino-acid / nucleotide substitution. Again, @f@ combines different -- entries. mkANuc1SubstMat :: (DLO -> DLO -> DLO) -> AASubstMat t -> ANuc1SubstMat t mkANuc1SubstMat f (AASubstMat m) = ANuc1SubstMat $ fromAssocs (Z:. AA.Stop :. D.A) (Z:. AA.Z :. D.N) (DLO $ -999) . M.assocs . M.fromListWith f $ [ ((Z:.a:.x), maybe (DLO $ -999) (\k -> m!(Z:.a:.k)) $ M.lookup uvw dnaAAmap) | a <- aaRange , u <- [D.A .. D.N], v <- [D.A .. D.N], w <- [D.A .. D.N] , x <- [u,v,w] , let uvw = VU.fromList [u,v,w] ]