Safe Haskell | None |
---|---|

Language | Haskell98 |

Common data types used everywhere. This module is a collection of very basic "bioinformatics" data types that are simple, but don't make sense to define over and over.

- newtype Nucleotide = N {}
- newtype Nucleotides = Ns {}
- newtype Qual = Q {}
- toQual :: (Floating a, RealFrac a) => a -> Qual
- fromQual :: Qual -> Double
- fromQualRaised :: Double -> Qual -> Double
- probToQual :: Prob -> Qual
- newtype Prob = Pr {}
- toProb :: Double -> Prob
- fromProb :: Prob -> Double
- qualToProb :: Qual -> Prob
- pow :: Prob -> Double -> Prob
- data Word8 :: *
- nucA :: Nucleotide
- nucC :: Nucleotide
- nucG :: Nucleotide
- nucT :: Nucleotide
- nucsA :: Nucleotides
- nucsC :: Nucleotides
- nucsG :: Nucleotides
- nucsT :: Nucleotides
- nucsN :: Nucleotides
- gap :: Nucleotides
- toNucleotide :: Char -> Nucleotide
- toNucleotides :: Char -> Nucleotides
- nucToNucs :: Nucleotide -> Nucleotides
- showNucleotide :: Nucleotide -> Char
- showNucleotides :: Nucleotides -> Char
- isGap :: Nucleotides -> Bool
- isBase :: Nucleotides -> Bool
- isProperBase :: Nucleotides -> Bool
- properBases :: [Nucleotides]
- compl :: Nucleotide -> Nucleotide
- compls :: Nucleotides -> Nucleotides
- everything :: (Bounded a, Ix a) => [a]
- type Seqid = ByteString
- unpackSeqid :: Seqid -> String
- packSeqid :: String -> Seqid
- data Position = Pos {}
- shiftPosition :: Int -> Position -> Position
- p_is_reverse :: Position -> Bool
- data Range = Range {}
- shiftRange :: Int -> Range -> Range
- reverseRange :: Range -> Range
- extendRange :: Int -> Range -> Range
- insideRange :: Range -> Range -> Range
- wrapRange :: Int -> Range -> Range
- w2c :: Word8 -> Char
- c2w :: Char -> Word8
- findAuxFile :: FilePath -> IO FilePath

# Documentation

newtype Nucleotide Source

A nucleotide base. We only represent A,C,G,T.

newtype Nucleotides Source

A nucleotide base in an alignment. Experience says we're dealing with Ns and gaps all the type, so purity be damned, they are included as if they were real bases.

To allow `Nucleotides`

s to be unpacked and incorparated into
containers, we choose to represent them the same way as the BAM file
format: as a 4 bit wide field. Gaps are encoded as 0 where they
make sense, N is 15.

Qualities are stored in deciban, also known as the Phred scale. To
represent a value `p`

, we store `-10 * log_10 p`

. Operations work
directly on the "Phred" value, as the name suggests. The same goes
for the `Ord`

instance: greater quality means higher "Phred"
score, meand lower error probability.

fromQualRaised :: Double -> Qual -> Double Source

probToQual :: Prob -> Qual Source

qualToProb :: Qual -> Prob Source

data Word8 :: *

8-bit unsigned integer type

Bounded Word8 | |

Enum Word8 | |

Eq Word8 | |

Integral Word8 | |

Num Word8 | |

Ord Word8 | |

Read Word8 | |

Real Word8 | |

Show Word8 | |

Ix Word8 | |

Storable Word8 | |

Bits Word8 | |

FiniteBits Word8 | |

Binary Word8 | |

Hashable Word8 | |

Prim Word8 | |

Random Word8 | |

Lift Word8 | |

Unbox Word8 | |

IArray UArray Word8 | |

ReadableChunk ByteString Word8 | |

ReadableChunk ByteString Word8 | |

Vector Vector Word8 | |

MVector MVector Word8 | |

ReadableChunk [Word8] Word8 | |

MArray (STUArray s) Word8 (ST s) | |

VecArrayRW ((:.) Word8 ()) | |

VecArrayRW ((:.) Word8 v) => VecArrayRW ((:.) Word8 ((:.) Word8 v)) | |

data Vector Word8 = V_Word8 (Vector Word8) | |

data MVector s Word8 = MV_Word8 (MVector s Word8) |

nucA :: Nucleotide Source

nucC :: Nucleotide Source

nucG :: Nucleotide Source

nucT :: Nucleotide Source

gap :: Nucleotides Source

toNucleotide :: Char -> Nucleotide Source

Converts a character into a `Nucleotides`

.
The usual codes for A,C,G,T and U are understood, `-`

and `.`

become
gaps and everything else is an N.

toNucleotides :: Char -> Nucleotides Source

Converts a character into a `Nucleotides`

.
The usual codes for A,C,G,T and U are understood, `-`

and `.`

become
gaps and everything else is an N.

nucToNucs :: Nucleotide -> Nucleotides Source

showNucleotide :: Nucleotide -> Char Source

showNucleotides :: Nucleotides -> Char Source

isGap :: Nucleotides -> Bool Source

Tests if a `Nucleotides`

is a gap.
Returns true only for the gap.

isBase :: Nucleotides -> Bool Source

Tests if a `Nucleotides`

is a base.
Returns `True`

for everything but gaps.

isProperBase :: Nucleotides -> Bool Source

Tests if a `Nucleotides`

is a proper base.
Returns `True`

for A,C,G,T only.

properBases :: [Nucleotides] Source

compl :: Nucleotide -> Nucleotide Source

Complements a Nucleotides.

compls :: Nucleotides -> Nucleotides Source

Complements a Nucleotides.

everything :: (Bounded a, Ix a) => [a] Source

type Seqid = ByteString Source

Sequence identifiers are ASCII strings
Since we tend to store them for a while, we use strict byte strings.
If you get a lazy bytestring from somewhere, use `shelve`

to convert
it for storage. Use `unpackSeqid`

and `packSeqid`

to avoid the
import of `Data.ByteString`

.

unpackSeqid :: Seqid -> String Source

Unpacks a `Seqid`

into a `String`

Coordinates in a genome. The position is zero-based, no questions about it. Think of the position as pointing to the crack between two bases: looking forward you see the next base to the right, looking in the reverse direction you see the complement of the first base to the left.

To encode the strand, we (virtually) reverse-complement any sequence and prepend it to the normal one. That way, reversed coordinates have a negative sign and automatically make sense. Position 0 could either be the beginning of the sequence or the end on the reverse strand... that ambiguity shouldn't really matter.

shiftPosition :: Int -> Position -> Position Source

Moves a `Position`

. The position is moved forward according to the
strand, negative indexes move backward accordingly.

p_is_reverse :: Position -> Bool Source

Ranges in genomes
We combine a position with a length. In 'Range pos len', `pos`

is
always the start of a stretch of length `len`

. Positions therefore
move in the opposite direction on the reverse strand. To get the
same stretch on the reverse strand, shift r_pos by r_length, then
reverse direction (or call reverseRange).

shiftRange :: Int -> Range -> Range Source

Moves a `Range`

. This is just `shiftPosition`

lifted.

reverseRange :: Range -> Range Source

Reverses a `Range`

to give the same `Range`

on the opposite strand.

extendRange :: Int -> Range -> Range Source

Extends a range. The length of the range is simply increased.

insideRange :: Range -> Range -> Range Source

Expands a subrange.
`(range1 `

interprets `insideRange`

range2)`range1`

as a subrange of
`range2`

and computes its absolute coordinates. The sequence name of
`range1`

is ignored.

wrapRange :: Int -> Range -> Range Source

Wraps a range to a region. This simply normalizes the start
position to be in the interval '[0,n)', which only makes sense if the
`Range`

is to be mapped onto a circular genome. This works on both
strands and the strand information is retained.

findAuxFile :: FilePath -> IO FilePath Source

Finds a file by searching the environment variable BIOHAZARD like a PATH.