{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TemplateHaskell #-} {-# OPTIONS_GHC -Wall #-} module Codec.Xlsx.Types.SharedStringTable ( -- * Main types SharedStringTable(..) , StringItem(..) , sstConstruct , sstLookupText , sstLookupRich , sstItem -- * Lenses -- ** SharedStringTable , sharedStringTable ) where import Control.Lens hiding (element) import Control.Monad import Data.Maybe (mapMaybe) import Data.Text (Text) import Data.Vector (Vector) import Numeric.Search.Range (searchFromTo) import Text.XML import Text.XML.Cursor import qualified Data.Map as Map import qualified Data.Set as Set import qualified Data.Vector as V import Codec.Xlsx.Parser.Internal import Codec.Xlsx.Types import Codec.Xlsx.Writer.Internal -- | Shared string table -- -- A workbook can contain thousands of cells containing string (non-numeric) -- data. Furthermore this data is very likely to be repeated across many rows or -- columns. The goal of implementing a single string table that is shared across -- the workbook is to improve performance in opening and saving the file by only -- reading and writing the repetitive information once. -- -- Relevant parts of the EMCA standard (2nd edition, part 1, -- ), -- page numbers refer to the page in the PDF rather than the page number as -- printed on the page): -- -- * Section 18.4, "Shared String Table" (p. 1712) -- in particular subsection 18.4.9, "sst (Shared String Table)" (p. 1726) -- -- TODO: The @extLst@ child element is currently unsupported. data SharedStringTable = SharedStringTable { _sharedStringTable :: Vector StringItem } deriving (Show, Eq, Ord) -- | String Item -- -- This element is the representation of an individual string in the Shared -- String table. -- -- The spec says: "If the string is just a simple string with formatting applied -- at the cell level, then the String Item (si) should contain a single text -- element used to express the string. However, if the string in the cell is -- more complex - i.e., has formatting applied at the character level - then the -- string item shall consist of multiple rich text runs which collectively are -- used to express the string.". So we have either a single "Text" field, or -- else a list of "RichTextRun"s, each of which is some "Text" with layout -- properties. -- -- TODO: Currently we do not support @phoneticPr@ (Phonetic Properties, 18.4.3, -- p. 1723) or @rPh@ (Phonetic Run, 18.4.6, p. 1725). -- -- Section 18.4.8, "si (String Item)" (p. 1725) data StringItem = StringItemText Text | StringItemRich [RichTextRun] deriving (Show, Eq, Ord) {------------------------------------------------------------------------------- Lenses -------------------------------------------------------------------------------} makeLenses ''SharedStringTable {------------------------------------------------------------------------------- Rendering -------------------------------------------------------------------------------} instance ToDocument SharedStringTable where toDocument = documentFromElement "Shared string table generated by xlsx" . toElement "sst" -- | See @CT_Sst@, p. 3902. -- -- TODO: The @count@ and @uniqCount@ attributes are currently unsupported. instance ToElement SharedStringTable where toElement nm SharedStringTable{..} = Element { elementName = nm , elementAttributes = Map.empty , elementNodes = map (NodeElement . toElement "si") $ V.toList _sharedStringTable } -- | See @CT_Rst@, p. 3903 instance ToElement StringItem where toElement nm si = Element { elementName = nm , elementAttributes = Map.empty , elementNodes = map NodeElement $ case si of StringItemText text -> [elementContent "t" text] StringItemRich rich -> map (toElement "r") rich } {------------------------------------------------------------------------------- Parsing -------------------------------------------------------------------------------} -- | See @CT_Sst@, p. 3902 -- -- The optional attributes @count@ and @uniqCount@ are being ignored at least currently instance FromCursor SharedStringTable where fromCursor cur = do let items = cur $/ element (n"si") >=> fromCursor return (SharedStringTable (V.fromList items)) -- | See @CT_Rst@, p. 3903 instance FromCursor StringItem where fromCursor cur = do let ts = cur $/ element (n"t") >=> contentOrEmpty contentOrEmpty c = case c $/ content of [t] -> [t] [] -> [""] _ -> error "invalid item: more than one text nodes under !" rs = cur $/ element (n"r") >=> fromCursor case (ts,rs) of ([t], []) -> return $ StringItemText t ([], _:_) -> return $ StringItemRich rs _ -> fail "invalid item" {------------------------------------------------------------------------------- Extract shared strings -------------------------------------------------------------------------------} -- | Construct the 'SharedStringsTable' from an existing document sstConstruct :: [Worksheet] -> SharedStringTable sstConstruct = SharedStringTable . V.fromList . uniq . concatMap goSheet where goSheet :: Worksheet -> [StringItem] goSheet = mapMaybe (_cellValue >=> sstEntry) . Map.elems . _wsCells sstEntry :: CellValue -> Maybe StringItem sstEntry (CellText text) = Just $ StringItemText text sstEntry (CellRich rich) = Just $ StringItemRich rich sstEntry _ = Nothing uniq :: Ord a => [a] -> [a] uniq = Set.elems . Set.fromList sstLookupText :: SharedStringTable -> Text -> Int sstLookupText sst = sstLookup sst . StringItemText sstLookupRich :: SharedStringTable -> [RichTextRun] -> Int sstLookupRich sst = sstLookup sst . StringItemRich -- | Internal generalization used by 'sstLookupText' and 'sstLookupRich' sstLookup :: SharedStringTable -> StringItem -> Int sstLookup SharedStringTable{_sharedStringTable = shared} si = case searchFromTo (\p -> shared V.! p >= si) 0 (V.length shared - 1) of Just i -> i Nothing -> error $ "SST entry for " ++ show si ++ " not found" sstItem :: SharedStringTable -> Int -> StringItem sstItem SharedStringTable{_sharedStringTable = shared} = (V.!) shared