-- clusterLibs -- calculate clusters by library, using the lib table -- read patterns from a table, first line is header import System.Environment (getArgs) import Text.Regex import Data.List (elemIndex,sortBy,intersperse) import Data.Map hiding (map) import Numeric (showFFloat) import Statistics type LibTable = [(Regex,String)] type Cluster = (String,[String]) main :: IO () main = do [ps,cs] <- do [xs,ys] <- getArgs return [xs,ys] `catch` error "Usage: clusterlibs " pat <- readPatternTable ps counts <- totalsByLib pat cs writeTable [("","sum":map snd pat++["significance"])] writeTable . map (decorate counts) . countClusters pat =<< readClusters cs -- counts is decorate :: (String,[Int]) -> (String,[Int]) -> (String,[String]) decorate counts (x,ys) = let fractions = [ fromIntegral x / fromIntegral (sum $ snd counts) | x <- snd counts] clustersize = sum ys significance = unwords $ zipWith isSignificant ys fractions isSignificant obs frac = showFFloat (Just 3) (pvalue (1-frac) clustersize (clustersize-obs)) "" in (x,map show (clustersize:ys) ++ [significance]) {- clus <- classClusters pat cs writeTable clus -} -- -------------------------------------------------- -- | Calculate the p value of observing k sequences -- from a library, given an a priori background distribution. pvalue :: Double -> Int -> Int -> Double pvalue fraction tot obs = cumbin fraction tot obs -- -------------------------------------------------- -- | Read a LibTable from a file of wsp-separated columns, first line is -- a header, and only columns "Pattern" and "Name" are used. readPatternTable :: FilePath -> IO LibTable readPatternTable f = do (h:ls) <- return . map words . lines =<< readFile f let (p,n) = case (elemIndex "Pattern" h, elemIndex "Name" h) of (Just x,Just y) -> (x,y) _ -> error ("Need both 'Pattern' and 'Name' headers in '"++f++"'") z l | length l < max p n = error ("Line in library table too short:\n"++show l) | otherwise = (mkRegex (l!!p),l!!n) return $ map z ls -- -------------------------------------------------- -- | Acquire total counts by library from a clustering totalsByLib :: LibTable -> FilePath -> IO (String,[Int]) totalsByLib lt f = do cs <- readClusters f let unify cs = [("total", concatMap snd cs)] case countClusters lt (unify cs) of [x] -> return x _ -> error "totalsByLib: this is impossible" -- -------------------------------------------------- -- | Parse clusters readClusters :: FilePath -> IO [Cluster] readClusters f = readFile f >>= return . rc . lines where rc [] = [] rc [_] = error "odd number of cluster lines!" rc (c:ss:rest) = case head $ words c of ('>':name) -> (name,words ss) : rc rest _ -> error ("Cluster '"++c++"' didn't start with '>'") classify :: LibTable -> String -> String classify ps str = case concatMap (class1 str) ps of [] -> error ("no match for "++str++" in library table") [x] -> x s@(_:_) -> error ("multiple matches for "++str++": "++show s) where class1 st (r,s) = maybe [] (const [s]) (matchRegex r st) -- -------------------------------------------------- -- will need to match against all, to check for multiple matches -- tag names with library classClusters :: LibTable -> FilePath -> IO [Cluster] classClusters ps f = return . map class1 =<< readClusters f where class1 (l,ss) = (l, map (\s -> classify ps s++":"++s) ss) -- | Output the clusters to stdout writeTable :: [(String,[String])] -> IO () writeTable = putStrLn . unlines . map show1 where show1 (name,stuff) = concat $ intersperse "\t" (name:stuff) -- count by classification countClusters :: LibTable -> [Cluster] -> [(String,[Int])] countClusters lt = map count1 where count1 (c,ss) = (c,map (\k -> findWithDefault 0 k (counts ss)) (map snd lt)) counts ss = fromListWith (+) $ zip (map (classify lt) ss) $ repeat 1