{-# LANGUAGE OverloadedStrings, RecordWildCards, FlexibleContexts, BangPatterns #-} -- | Metadata necessary for a sensible genotyping workflow. module Bio.Genocall.Metadata where import Bio.Genocall.Adna ( DamageParameters(..) ) import Control.Applicative hiding ( empty ) import Control.Concurrent ( threadDelay ) import Control.Exception ( bracket, onException, handleJust ) import Control.Monad ( forM_ ) import Data.Text ( Text, pack ) import Data.HashMap.Strict ( HashMap ) import Data.Aeson import Data.ByteString.Char8 ( readFile ) import Data.ByteString.Lazy ( toChunks ) import Data.ByteString.Unsafe ( unsafeUseAsCStringLen ) import Data.Monoid import Data.Vector.Unboxed ( Vector ) import Foreign.Ptr ( castPtr ) import GHC.IO.Exception ( IOErrorType(..) ) import Prelude hiding ( writeFile, readFile ) import System.IO.Error ( isAlreadyExistsErrorType, ioeGetErrorType ) import System.Posix.Files ( rename, removeLink ) import System.Posix.IO import qualified Data.HashMap.Strict as M data Sample = Sample { sample_libraries :: [Library], sample_avro_files :: HashMap Text Text, -- ^ maps a region to the av file sample_bcf_files :: HashMap Text Text, -- ^ maps a region to the bcf file sample_div_tables :: HashMap Text (Double, Vector Int), -- ^ maps a region to the table needed for div. estimation sample_divergences :: HashMap Text DivEst } deriving Show data Library = Library { library_name :: Text, library_files :: [Text], library_damage :: Maybe (DamageParameters Double) } deriving Show -- | Divergence estimate. Lists contain three or four floats, these are -- divergence, heterozygosity at W sites, heterozygosity at S sites, and -- optionally gappiness in this order. data DivEst = DivEst { point_est :: [Double], conf_region :: [( [Double], [Double] )] } deriving Show type Metadata = HashMap Text Sample instance ToJSON DivEst where toJSON DivEst{..} = object $ [ "estimate" .= point_est , "confidence-region" .= conf_region ] instance FromJSON DivEst where parseJSON (Object o) = DivEst <$> o .: "estimate" <*> o .:? "confidence-region" .!= [] parseJSON (Array a) = flip DivEst [] <$> parseJSON (Array a) parseJSON _ = fail $ "divergence estimate should be an array or an object" instance ToJSON float => ToJSON (DamageParameters float) where toJSON DP{..} = object [ "ss-sigma" .= ssd_sigma , "ss-delta" .= ssd_delta , "ss-lambda" .= ssd_lambda , "ss-kappa" .= ssd_kappa , "ds-sigma" .= dsd_sigma , "ds-delta" .= dsd_delta , "ds-lambda" .= dsd_lambda ] instance FromJSON float => FromJSON (DamageParameters float) where parseJSON = withObject "damage parameters" $ \o -> DP <$> o .: "ss-sigma" <*> o .: "ss-delta" <*> o .: "ss-lambda" <*> o .: "ss-kappa" <*> o .: "ds-sigma" <*> o .: "ds-delta" <*> o .: "ds-lambda" instance ToJSON Library where toJSON (Library name files dp) = object ( maybe id ((:) . ("damage" .=)) dp $ [ "name" .= name, "files" .= files ] ) instance FromJSON Library where parseJSON (String name) = return $ Library name [name <> ".bam"] Nothing parseJSON (Object o) = Library <$> o .: "name" <*> (maybe id (:) <$> o .:? "file" <*> o .:? "files" .!= []) <*> o .:? "damage" parseJSON _ = fail "String or Object expected for library" instance ToJSON Sample where toJSON (Sample ls avfs bcfs dts ds) = object $ hashToJson "divergences" ds $ listToJson "libraries" ls $ hashToJson "avro-files" avfs $ hashToJson "bcf-files" bcfs $ hashToJson "div-tables" dts [] where hashToJson k vs = if M.null vs then id else (:) (k .= vs) listToJson k vs = if null vs then id else (:) (k .= vs) instance FromJSON Sample where parseJSON (String s) = pure $ Sample [Library s [s <> ".bam"] Nothing] M.empty M.empty M.empty M.empty parseJSON (Array ls) = (\ll -> Sample ll M.empty M.empty M.empty M.empty) <$> parseJSON (Array ls) parseJSON (Object o) = Sample <$> o .: "libraries" <*> (M.singleton "" <$> o .: "avro-file" <|> o .:? "avro-files" .!= M.empty) <*> (M.singleton "" <$> o .: "bcf-file" <|> o .:? "bcf-files" .!= M.empty) <*> o .:? "div-tables" .!= M.empty <*> (M.singleton "" <$> o .: "divergence" <|> o.:? "divergences" .!= M.empty) parseJSON _ = fail $ "String, Array or Object expected for Sample" -- | Read the configuration file. Retries, because NFS tends to result -- in 'ResourceVanished' if the file is replaced while we try to read it. readMetadata :: FilePath -> IO Metadata readMetadata fn = either error return . eitherDecodeStrict =<< go (15::Int) where go !n = handleJust -- retry every sec for 15 seconds (\e -> case ioeGetErrorType e of ResourceVanished | n > 0 -> Just () ; _ -> Nothing) (\_ -> threadDelay 1000000 >> go (n-1)) (readFile fn) -- | Update the configuration file. Open a new file (fn++"~new") in -- exclusive mode. Then read the old file, write the update to the new -- file, rename it atomically, then close it. Use of O_EXCL should -- ensure that nobody interferes. This is atomic even on NFS, provided -- NFS and kernel are new enough. For older NFS, I cannot be bothered. -- -- (The first idea was to base this on the supposed fact that link(2) is -- atomic and fails if the new filename exists. This approach does seem -- to contain a race condition, though.) updateMetadata :: (Metadata -> Metadata) -> FilePath -> IO () updateMetadata f fp = go (360::Int) -- retry every 5 secs for 30 minutes where fpn = fp <> "~new" go !n = handleJust (\e -> if isAlreadyExistsErrorType (ioeGetErrorType e) && n > 0 then Just () else Nothing) (\_ -> threadDelay 5000000 >> go (n-1)) $ do bracket (openFd fpn WriteOnly (Just 0o666) defaultFileFlags{ exclusive = True }) (closeFd) $ \fd -> (do mdata <- readMetadata fp forM_ (toChunks . encode . toJSON $ f mdata) $ \ch -> unsafeUseAsCStringLen ch $ \(p,l) -> fdWriteBuf fd (castPtr p) (fromIntegral l) rename fpn fp) `onException` removeLink fpn split_sam_rgns :: Metadata -> [String] -> [( String, [Maybe String] )] split_sam_rgns _meta [ ] = [] split_sam_rgns meta (s:ss) = (s, if null rgns then [Nothing] else map Just rgns) : split_sam_rgns meta rest where (rgns, rest) = break (\x -> pack x `M.member` meta) ss