{-# LANGUAGE DeriveGeneric #-}

-- |
-- Module      :  ELynx.Character.Codon
-- Description :  Codons are triplets of nucleotides
-- Copyright   :  2021 Dominik Schrempf
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Thu May 16 07:58:50 2019.
--
-- The different universal codes.
-- - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
-- - http://www.bioinformatics.org/sms2/genetic_code.html
-- - https://en.wikipedia.org/wiki/Genetic_code
module ELynx.Character.Codon
  ( Codon (Codon),
    fromVecUnsafe,
    UniversalCode (..),
    translate,
    translateX,
    translateI,
  )
where

import Data.Aeson
  ( FromJSON,
    ToJSON,
  )
import Data.List
import qualified Data.Map as M
import qualified Data.Vector.Generic as V
import qualified ELynx.Character.AminoAcidI as AI
import ELynx.Character.AminoAcidS
import qualified ELynx.Character.Character as C
import qualified ELynx.Character.Nucleotide as N
import qualified ELynx.Character.NucleotideI as NI
import qualified ELynx.Character.NucleotideX as NX
import GHC.Generics (Generic)

-- | Codons are triplets of characters.
newtype Codon a = Codon (a, a, a)
  deriving (Int -> Codon a -> ShowS
forall a. Show a => Int -> Codon a -> ShowS
forall a. Show a => [Codon a] -> ShowS
forall a. Show a => Codon a -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Codon a] -> ShowS
$cshowList :: forall a. Show a => [Codon a] -> ShowS
show :: Codon a -> String
$cshow :: forall a. Show a => Codon a -> String
showsPrec :: Int -> Codon a -> ShowS
$cshowsPrec :: forall a. Show a => Int -> Codon a -> ShowS
Show, ReadPrec [Codon a]
ReadPrec (Codon a)
ReadS [Codon a]
forall a. Read a => ReadPrec [Codon a]
forall a. Read a => ReadPrec (Codon a)
forall a. Read a => Int -> ReadS (Codon a)
forall a. Read a => ReadS [Codon a]
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [Codon a]
$creadListPrec :: forall a. Read a => ReadPrec [Codon a]
readPrec :: ReadPrec (Codon a)
$creadPrec :: forall a. Read a => ReadPrec (Codon a)
readList :: ReadS [Codon a]
$creadList :: forall a. Read a => ReadS [Codon a]
readsPrec :: Int -> ReadS (Codon a)
$creadsPrec :: forall a. Read a => Int -> ReadS (Codon a)
Read, Codon a -> Codon a -> Bool
forall a. Eq a => Codon a -> Codon a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Codon a -> Codon a -> Bool
$c/= :: forall a. Eq a => Codon a -> Codon a -> Bool
== :: Codon a -> Codon a -> Bool
$c== :: forall a. Eq a => Codon a -> Codon a -> Bool
Eq, Codon a -> Codon a -> Bool
Codon a -> Codon a -> Ordering
Codon a -> Codon a -> Codon a
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
forall {a}. Ord a => Eq (Codon a)
forall a. Ord a => Codon a -> Codon a -> Bool
forall a. Ord a => Codon a -> Codon a -> Ordering
forall a. Ord a => Codon a -> Codon a -> Codon a
min :: Codon a -> Codon a -> Codon a
$cmin :: forall a. Ord a => Codon a -> Codon a -> Codon a
max :: Codon a -> Codon a -> Codon a
$cmax :: forall a. Ord a => Codon a -> Codon a -> Codon a
>= :: Codon a -> Codon a -> Bool
$c>= :: forall a. Ord a => Codon a -> Codon a -> Bool
> :: Codon a -> Codon a -> Bool
$c> :: forall a. Ord a => Codon a -> Codon a -> Bool
<= :: Codon a -> Codon a -> Bool
$c<= :: forall a. Ord a => Codon a -> Codon a -> Bool
< :: Codon a -> Codon a -> Bool
$c< :: forall a. Ord a => Codon a -> Codon a -> Bool
compare :: Codon a -> Codon a -> Ordering
$ccompare :: forall a. Ord a => Codon a -> Codon a -> Ordering
Ord)

convert :: (C.Character a, C.Character b) => Codon a -> Codon b
convert :: forall a b. (Character a, Character b) => Codon a -> Codon b
convert (Codon (a
x, a
y, a
z)) = forall a. (a, a, a) -> Codon a
Codon (forall a b. (Character a, Character b) => a -> b
C.convert a
x, forall a b. (Character a, Character b) => a -> b
C.convert a
y, forall a b. (Character a, Character b) => a -> b
C.convert a
z)

-- | Unsafe conversion from vector with at least three elements; only the first
-- three elements are used, the rest is discarded.
fromVecUnsafe :: V.Vector v a => v a -> Codon a
fromVecUnsafe :: forall (v :: * -> *) a. Vector v a => v a -> Codon a
fromVecUnsafe v a
xs =
  forall a. (a, a, a) -> Codon a
Codon (forall (v :: * -> *) a. Vector v a => v a -> a
V.head v a
xs, forall (v :: * -> *) a. Vector v a => v a -> a
V.head forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (v :: * -> *) a. Vector v a => v a -> v a
V.tail forall a b. (a -> b) -> a -> b
$ v a
xs, forall (v :: * -> *) a. Vector v a => v a -> a
V.head forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (v :: * -> *) a. Vector v a => v a -> v a
V.tail forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (v :: * -> *) a. Vector v a => v a -> v a
V.tail forall a b. (a -> b) -> a -> b
$ v a
xs)

-- | Universal codes.
data UniversalCode = Standard | VertebrateMitochondrial
  deriving (Int -> UniversalCode -> ShowS
[UniversalCode] -> ShowS
UniversalCode -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [UniversalCode] -> ShowS
$cshowList :: [UniversalCode] -> ShowS
show :: UniversalCode -> String
$cshow :: UniversalCode -> String
showsPrec :: Int -> UniversalCode -> ShowS
$cshowsPrec :: Int -> UniversalCode -> ShowS
Show, ReadPrec [UniversalCode]
ReadPrec UniversalCode
Int -> ReadS UniversalCode
ReadS [UniversalCode]
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [UniversalCode]
$creadListPrec :: ReadPrec [UniversalCode]
readPrec :: ReadPrec UniversalCode
$creadPrec :: ReadPrec UniversalCode
readList :: ReadS [UniversalCode]
$creadList :: ReadS [UniversalCode]
readsPrec :: Int -> ReadS UniversalCode
$creadsPrec :: Int -> ReadS UniversalCode
Read, UniversalCode -> UniversalCode -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: UniversalCode -> UniversalCode -> Bool
$c/= :: UniversalCode -> UniversalCode -> Bool
== :: UniversalCode -> UniversalCode -> Bool
$c== :: UniversalCode -> UniversalCode -> Bool
Eq, Eq UniversalCode
UniversalCode -> UniversalCode -> Bool
UniversalCode -> UniversalCode -> Ordering
UniversalCode -> UniversalCode -> UniversalCode
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: UniversalCode -> UniversalCode -> UniversalCode
$cmin :: UniversalCode -> UniversalCode -> UniversalCode
max :: UniversalCode -> UniversalCode -> UniversalCode
$cmax :: UniversalCode -> UniversalCode -> UniversalCode
>= :: UniversalCode -> UniversalCode -> Bool
$c>= :: UniversalCode -> UniversalCode -> Bool
> :: UniversalCode -> UniversalCode -> Bool
$c> :: UniversalCode -> UniversalCode -> Bool
<= :: UniversalCode -> UniversalCode -> Bool
$c<= :: UniversalCode -> UniversalCode -> Bool
< :: UniversalCode -> UniversalCode -> Bool
$c< :: UniversalCode -> UniversalCode -> Bool
compare :: UniversalCode -> UniversalCode -> Ordering
$ccompare :: UniversalCode -> UniversalCode -> Ordering
Ord, Int -> UniversalCode
UniversalCode -> Int
UniversalCode -> [UniversalCode]
UniversalCode -> UniversalCode
UniversalCode -> UniversalCode -> [UniversalCode]
UniversalCode -> UniversalCode -> UniversalCode -> [UniversalCode]
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
enumFromThenTo :: UniversalCode -> UniversalCode -> UniversalCode -> [UniversalCode]
$cenumFromThenTo :: UniversalCode -> UniversalCode -> UniversalCode -> [UniversalCode]
enumFromTo :: UniversalCode -> UniversalCode -> [UniversalCode]
$cenumFromTo :: UniversalCode -> UniversalCode -> [UniversalCode]
enumFromThen :: UniversalCode -> UniversalCode -> [UniversalCode]
$cenumFromThen :: UniversalCode -> UniversalCode -> [UniversalCode]
enumFrom :: UniversalCode -> [UniversalCode]
$cenumFrom :: UniversalCode -> [UniversalCode]
fromEnum :: UniversalCode -> Int
$cfromEnum :: UniversalCode -> Int
toEnum :: Int -> UniversalCode
$ctoEnum :: Int -> UniversalCode
pred :: UniversalCode -> UniversalCode
$cpred :: UniversalCode -> UniversalCode
succ :: UniversalCode -> UniversalCode
$csucc :: UniversalCode -> UniversalCode
Enum, UniversalCode
forall a. a -> a -> Bounded a
maxBound :: UniversalCode
$cmaxBound :: UniversalCode
minBound :: UniversalCode
$cminBound :: UniversalCode
Bounded, forall x. Rep UniversalCode x -> UniversalCode
forall x. UniversalCode -> Rep UniversalCode x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep UniversalCode x -> UniversalCode
$cfrom :: forall x. UniversalCode -> Rep UniversalCode x
Generic)

instance FromJSON UniversalCode

instance ToJSON UniversalCode

-- It is important that the map is lazy, because some keys have errors as values.
mapFromLists :: Ord a => [a] -> [a] -> [a] -> [b] -> M.Map (Codon a) b
mapFromLists :: forall a b. Ord a => [a] -> [a] -> [a] -> [b] -> Map (Codon a) b
mapFromLists [a]
xs [a]
ys [a]
zs [b]
as =
  forall k a. Ord k => [(k, a)] -> Map k a
M.fromList forall a b. (a -> b) -> a -> b
$ forall a b c d e.
(a -> b -> c -> d -> e) -> [a] -> [b] -> [c] -> [d] -> [e]
zipWith4 (\a
f a
s a
t b
a -> (forall a. (a, a, a) -> Codon a
Codon (a
f, a
s, a
t), b
a)) [a]
xs [a]
ys [a]
zs [b]
as

nucs :: Enum a => [a]
nucs :: forall a. Enum a => [a]
nucs = forall a b. (a -> b) -> [a] -> [b]
map forall a. Enum a => Int -> a
toEnum [Int
3, Int
1, Int
0, Int
2] -- Order T, C, A , G.

-- Permutation of the triplets PLUS GAPS! I avoid 'Z' because I do not want to
-- translate DNAI.
base1, base2, base3 :: Enum a => [a]
base1 :: forall a. Enum a => [a]
base1 = [a
n | a
n <- forall a. Enum a => [a]
nucs, Int
_ <- [Int
0 .. Int
3 :: Int], Int
_ <- [Int
0 .. Int
3 :: Int]]
-- base1 = "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG" ++ "-."
base2 :: forall a. Enum a => [a]
base2 = [a
n | Int
_ <- [Int
0 .. Int
3 :: Int], a
n <- forall a. Enum a => [a]
nucs, Int
_ <- [Int
0 .. Int
3 :: Int]]
-- base2 = "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG" ++ "-."
base3 :: forall a. Enum a => [a]
base3 = [a
n | Int
_ <- [Int
0 .. Int
3 :: Int], Int
_ <- [Int
0 .. Int
3 :: Int], a
n <- forall a. Enum a => [a]
nucs]

-- base3 = "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" ++ "-."

-- The actual codes.
standard :: [AminoAcidS]
standard :: [AminoAcidS]
standard =
  [ AminoAcidS
F,
    AminoAcidS
F,
    AminoAcidS
L,
    AminoAcidS
L,
    AminoAcidS
S,
    AminoAcidS
S,
    AminoAcidS
S,
    AminoAcidS
S,
    AminoAcidS
Y,
    AminoAcidS
Y,
    AminoAcidS
Stop,
    AminoAcidS
Stop,
    AminoAcidS
C,
    AminoAcidS
C,
    AminoAcidS
Stop,
    AminoAcidS
W,
    AminoAcidS
L,
    AminoAcidS
L,
    AminoAcidS
L,
    AminoAcidS
L,
    AminoAcidS
P,
    AminoAcidS
P,
    AminoAcidS
P,
    AminoAcidS
P,
    AminoAcidS
H,
    AminoAcidS
H,
    AminoAcidS
Q,
    AminoAcidS
Q,
    AminoAcidS
R,
    AminoAcidS
R,
    AminoAcidS
R,
    AminoAcidS
R,
    AminoAcidS
I,
    AminoAcidS
I,
    AminoAcidS
I,
    AminoAcidS
M,
    AminoAcidS
T,
    AminoAcidS
T,
    AminoAcidS
T,
    AminoAcidS
T,
    AminoAcidS
N,
    AminoAcidS
N,
    AminoAcidS
K,
    AminoAcidS
K,
    AminoAcidS
S,
    AminoAcidS
S,
    AminoAcidS
R,
    AminoAcidS
R,
    AminoAcidS
V,
    AminoAcidS
V,
    AminoAcidS
V,
    AminoAcidS
V,
    AminoAcidS
A,
    AminoAcidS
A,
    AminoAcidS
A,
    AminoAcidS
A,
    AminoAcidS
D,
    AminoAcidS
D,
    AminoAcidS
E,
    AminoAcidS
E,
    AminoAcidS
G,
    AminoAcidS
G,
    AminoAcidS
G,
    AminoAcidS
G
  ]

-- "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG" ++ "--"

vertebrateMitochondrial :: [AminoAcidS]
vertebrateMitochondrial :: [AminoAcidS]
vertebrateMitochondrial =
  [ AminoAcidS
F,
    AminoAcidS
F,
    AminoAcidS
L,
    AminoAcidS
L,
    AminoAcidS
S,
    AminoAcidS
S,
    AminoAcidS
S,
    AminoAcidS
S,
    AminoAcidS
Y,
    AminoAcidS
Y,
    AminoAcidS
Stop,
    AminoAcidS
Stop,
    AminoAcidS
C,
    AminoAcidS
C,
    AminoAcidS
W,
    AminoAcidS
W,
    AminoAcidS
L,
    AminoAcidS
L,
    AminoAcidS
L,
    AminoAcidS
L,
    AminoAcidS
P,
    AminoAcidS
P,
    AminoAcidS
P,
    AminoAcidS
P,
    AminoAcidS
H,
    AminoAcidS
H,
    AminoAcidS
Q,
    AminoAcidS
Q,
    AminoAcidS
R,
    AminoAcidS
R,
    AminoAcidS
R,
    AminoAcidS
R,
    AminoAcidS
I,
    AminoAcidS
I,
    AminoAcidS
M,
    AminoAcidS
M,
    AminoAcidS
T,
    AminoAcidS
T,
    AminoAcidS
T,
    AminoAcidS
T,
    AminoAcidS
N,
    AminoAcidS
N,
    AminoAcidS
K,
    AminoAcidS
K,
    AminoAcidS
S,
    AminoAcidS
S,
    AminoAcidS
Stop,
    AminoAcidS
Stop,
    AminoAcidS
V,
    AminoAcidS
V,
    AminoAcidS
V,
    AminoAcidS
V,
    AminoAcidS
A,
    AminoAcidS
A,
    AminoAcidS
A,
    AminoAcidS
A,
    AminoAcidS
D,
    AminoAcidS
D,
    AminoAcidS
E,
    AminoAcidS
E,
    AminoAcidS
G,
    AminoAcidS
G,
    AminoAcidS
G,
    AminoAcidS
G
  ]

-- "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG" ++ "--"

-- | Translate a codon to amino acids including translation stops.
translate :: UniversalCode -> Codon N.Nucleotide -> AminoAcidS
translate :: UniversalCode -> Codon Nucleotide -> AminoAcidS
translate UniversalCode
code = forall k a. Ord k => Map k a -> k -> a
(M.!) (UniversalCode -> Map (Codon Nucleotide) AminoAcidS
universalCode UniversalCode
code)

-- | Translate a codon to amino acids including translation stops. Translate
-- codons including gaps to amino acid gaps. Be careful, single or two character
-- gaps can lead to a reading frame shift and hence, the translated sequence may
-- be bogus.
translateX :: UniversalCode -> Codon NX.NucleotideX -> AminoAcidS
-- translateX _ (Codon (NX.Gap, NX.Gap, NX.Gap)) = Gap
-- translateX code codon                         = C.convert . translate code . convert $ codon
translateX :: UniversalCode -> Codon NucleotideX -> AminoAcidS
translateX UniversalCode
code codon :: Codon NucleotideX
codon@(Codon (NucleotideX
x, NucleotideX
y, NucleotideX
z))
  | forall a. CharacterX a => a -> Bool
C.isGap NucleotideX
x Bool -> Bool -> Bool
|| forall a. CharacterX a => a -> Bool
C.isGap NucleotideX
y Bool -> Bool -> Bool
|| forall a. CharacterX a => a -> Bool
C.isGap NucleotideX
z = AminoAcidS
Gap
  | Bool
otherwise = forall a b. (Character a, Character b) => a -> b
C.convert forall b c a. (b -> c) -> (a -> b) -> a -> c
. UniversalCode -> Codon Nucleotide -> AminoAcidS
translate UniversalCode
code forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Character a, Character b) => Codon a -> Codon b
convert forall a b. (a -> b) -> a -> b
$ Codon NucleotideX
codon

-- | Translate a codon to amino acids including translation stops. Translate gap
-- triplets to amino acid gaps, and triplets including unknowns to amino acid
-- unknowns. Be careful, also translates other IUPAC characters to amino acid Xs!
translateI :: UniversalCode -> Codon NI.NucleotideI -> AI.AminoAcidI
translateI :: UniversalCode -> Codon NucleotideI -> AminoAcidI
translateI UniversalCode
code codon :: Codon NucleotideI
codon@(Codon (NucleotideI
x, NucleotideI
y, NucleotideI
z))
  | forall a. CharacterI a => a -> Bool
C.isIUPAC NucleotideI
x Bool -> Bool -> Bool
|| forall a. CharacterI a => a -> Bool
C.isIUPAC NucleotideI
y Bool -> Bool -> Bool
|| forall a. CharacterI a => a -> Bool
C.isIUPAC NucleotideI
z = AminoAcidI
AI.X
  | Bool
otherwise = forall a b. (Character a, Character b) => a -> b
C.convert forall b c a. (b -> c) -> (a -> b) -> a -> c
. UniversalCode -> Codon NucleotideX -> AminoAcidS
translateX UniversalCode
code forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Character a, Character b) => Codon a -> Codon b
convert forall a b. (a -> b) -> a -> b
$ Codon NucleotideI
codon

-- translateI :: UniversalCode -> Codon NI.NucleotideI -> AI.AminoAcidI
-- translateI _ (Codon (NI.N, _,    _   )) = AI.X
-- translateI _ (Codon (_   , NI.N, _   )) = AI.X
-- translateI _ (Codon (_,    _,    NI.N)) = AI.X
-- translateI code codon                   = C.convert . translateX code . convert $ codon

-- Map from 'Codon' to amino acid character.
universalCode :: UniversalCode -> M.Map (Codon N.Nucleotide) AminoAcidS
universalCode :: UniversalCode -> Map (Codon Nucleotide) AminoAcidS
universalCode UniversalCode
Standard = forall a b. Ord a => [a] -> [a] -> [a] -> [b] -> Map (Codon a) b
mapFromLists forall a. Enum a => [a]
base1 forall a. Enum a => [a]
base2 forall a. Enum a => [a]
base3 [AminoAcidS]
standard
universalCode UniversalCode
VertebrateMitochondrial =
  forall a b. Ord a => [a] -> [a] -> [a] -> [b] -> Map (Codon a) b
mapFromLists forall a. Enum a => [a]
base1 forall a. Enum a => [a]
base2 forall a. Enum a => [a]
base3 [AminoAcidS]
vertebrateMitochondrial