-- | -- Module : ConClusion.Chemistry.Topology -- Description : Principal Component Analysis -- Copyright : Phillip Seeber, 2021 -- License : AGPL-3 -- Maintainer : phillip.seeber@googlemail.com -- Stability : experimental -- Portability : POSIX, Windows -- -- This module implements routines to work with simple molden style XYZ trajectories. This includes -- parsers as well as functions to obtain structural features in internal coordinates. -- -- For an introduction into PCA see . -- -- Diherdrals require a special metric, see . module ConClusion.Chemistry.Topology ( Molecule, Trajectory, xyz, trajectory, B (..), A (..), D (..), Feature (..), getFeatures, ) where import ConClusion.Numeric.Data hiding (angle) import qualified ConClusion.Numeric.Data as ND import Data.Attoparsec.Text hiding (D) import Data.Foldable import Data.Massiv.Array as Massiv hiding (B, D, forM) import qualified Numeric.LinearAlgebra as LA import RIO import qualified RIO.Seq as Seq import qualified RIO.Text as Text import qualified RIO.Vector.Boxed as VectorB -- | A Molecule in cartesian coordinates. data Molecule = Molecule { -- | The energy of the molecule. energy :: !Double, -- | Chemical symbols or names of the atoms. \(N\) vector. atoms :: !(VectorB.Vector Text), -- | Cartesian coordinates. Atoms as rows, xyz as columns. \(N \times 3\) matrix. coordinates :: !(Massiv.Matrix S Double) } type Trajectory = Seq Molecule -- | Parser for molecules in Molden XYZ format. xyz :: Parser Molecule xyz = do -- Starting line with number of atoms. nAtoms <- do _ <- skipMany . char $ ' ' n <- decimal _ <- skipMany . char $ ' ' endOfLine return n -- Comment line. Must contain the energy. energy <- do _ <- skipMany . char $ ' ' e <- double _ <- skipMany . char $ ' ' endOfLine return e -- Multiple lines of atoms. atoms <- count nAtoms $ do _ <- skipMany . char $ ' ' name <- many1 $ letter <|> digit _ <- skipMany1 . char $ ' ' x <- double _ <- skipMany1 . char $ ' ' y <- double _ <- skipMany1 . char $ ' ' z <- double _ <- skipMany . char $ ' ' endOfLine let row = Massiv.expandOuter (Sz 1) const . Massiv.fromList @S Seq $ [x, y, z] return (Text.pack name, row) return Molecule { energy = energy, atoms = VectorB.fromListN nAtoms . fmap fst $ atoms, coordinates = Massiv.compute . Massiv.concat' (Dim 2) . fmap snd $ atoms } -- | Parser for trajectories in XYZ format as produced by CREST. {-# SCC trajectory #-} trajectory :: Parser Trajectory trajectory = do mols <- Seq.fromList <$> many1 xyz skipSpace endOfInput return mols -- | Selection of a bond between two atoms. data B = B Int Int -- | Selection of an angle between three atoms. data A = A Int Int Int -- | Selection of a dihedral angle between four atoms. Rotation around the central two. data D = D Int Int Int Int -- | Selections data Feature = Energy | Bond B | Angle A | Dihedral D -- | Calculate a bond distance. bond :: MonadThrow m => B -> Molecule -> m Double bond (B a b) Molecule {coordinates} | a == b = throwM . IndexException $ "selected atoms are identical" | otherwise = do coordA <- compute @U <$> (coordinates !?> a) coordB <- compute @U <$> (coordinates !?> b) vecAB <- coordA .-. coordB return . sqrt . Massiv.sum . Massiv.map (^ (2 :: Int)) $ vecAB -- | Calculates the sinus of an angle defined by three atoms. angle :: MonadThrow m => A -> Molecule -> m Double angle (A a b c) Molecule {coordinates} | a == b || b == c || a == c = throwM . IndexException $ "selected atoms are identical" | otherwise = do coordA <- compute @U <$> (coordinates !?> a) coordB <- compute @U <$> (coordinates !?> b) coordC <- compute @U <$> (coordinates !?> c) vecAB <- coordA .-. coordB vecCB <- coordC .-. coordB return $ ND.angle vecAB vecCB -- | Calculates the dihedral angle defined by four atoms. Respects rotation direction. Obtains the -- result in radian. dihedral' :: MonadThrow m => D -> Molecule -> m Double dihedral' (D a b c d) Molecule {coordinates} | a == b || a == c || a == d || b == c || b == d || c == d = throwM . IndexException $ "selected atoms are indentical" | otherwise = do coordA <- compute @U <$> (coordinates !?> a) coordB <- compute @U <$> (coordinates !?> b) coordC <- compute @U <$> (coordinates !?> c) coordD <- compute @U <$> (coordinates !?> d) vecAB <- coordA .-. coordB vecBC <- coordB .-. coordC vecCD <- coordC .-. coordD let planeABC = vecH2M $ LA.cross (vecM2H vecAB) (vecM2H vecBC) :: Massiv.Vector U Double planeBCD = vecH2M $ LA.cross (vecM2H vecBC) (vecM2H vecCD) :: Massiv.Vector U Double normVecRot = LA.cross (vecM2H vecCD) (vecM2H vecBC) :: LA.Vector Double rotDir = if vecH2M normVecRot !.! vecAB < 0 then -1 else 1 return $ rotDir * ND.angle planeABC planeBCD -- | Calculates the a metric value of the dihedral angle defined by four atoms. This must create 2 -- values in the feature matrix, instead of one. -- See dihedral :: MonadThrow m => D -> Molecule -> m (Double, Double) dihedral d mol = do dihedRad <- dihedral' d mol return (sin dihedRad, cos dihedRad) -- | Get all selected features from a molecule. getFeature :: ( Traversable t, MonadThrow m ) => -- | Selection of multiple features. t Feature -> -- | A Molecule where to apply to. Molecule -> m (Massiv.Vector Massiv.DL Double) getFeature sel mol = do features <- for sel $ \s -> case s of Energy -> return . Left . energy $ mol Bond b -> Left <$> bond b mol Angle a -> Left <$> angle a mol Dihedral d -> Right <$> dihedral d mol let featureVec = foldl ( \acc val -> case val of Left s -> acc `Massiv.snoc` s Right (a, b) -> acc `Massiv.snoc` a `Massiv.snoc` b ) mempty features return featureVec -- | Obtains the feature matrix \(\mathbf{X}\) for a principal component analysis. Given \(m\) -- features to analyse in \(n\) measurements, \(\mathbf{X}\) will be a \(m \times n\) matrix. {-# SCC getFeatures #-} getFeatures :: ( MonadThrow m, Traversable f ) => f Feature -> Trajectory -> m (Massiv.Matrix DL Double) getFeatures sel trj = traverse toCols trj >>= concatM (Dim 1) where toCols v = expandInner @Ix2 (Sz 1) const . compute @U <$> getFeature sel v