src/Data/Dwarf.hs

{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
-- | Parses the DWARF 2 and DWARF 3 specifications at http://www.dwarfstd.org given
-- the debug sections in ByteString form.
module Data.Dwarf
  ( Endianess(..), TargetSize(..)
  , Sections(..)
  , parseInfo
  , DieID, dieID, DIE(..), (!?)
  , DIERefs(..), DIEMap
  , Reader(..)
  , parseAranges
  , parsePubnames
  , parsePubtypes
  , Range(..), parseRanges, parseLoc
  , DW_CFA(..)
  , DW_MACINFO(..), parseMacInfo
  , DW_CIEFDE(..), parseFrame
  , DW_OP(..), parseDW_OP
  , DW_TAG(..)
  , DW_AT(..)
  , DW_ATVAL(..)
  , DW_LNE(..), parseLNE
  , DW_ATE(..), dw_ate
  , DW_DS(..), dw_ds
  , DW_END(..), dw_end
  , DW_ACCESS(..), dw_access
  , DW_VIS(..), dw_vis
  , DW_VIRTUALITY(..), dw_virtuality
  , DW_LANG(..), dw_lang
  , DW_ID(..), dw_id
  , DW_INL(..), dw_inl
  , DW_CC(..), dw_cc
  , DW_ORD(..), dw_ord
  , DW_DSC(..), dw_dsc
  ) where

import           Control.Applicative (Applicative(..), (<$>), (<$))
import           Control.Arrow ((&&&), (***))
import           Control.Monad ((<=<))
import           Control.Monad.Trans.Class (lift)
import           Control.Monad.Trans.Writer (WriterT(..))
import qualified Control.Monad.Trans.Writer as Writer
import           Data.Binary (Get)
import           Data.Binary.Get (getWord8, getByteString)
import qualified Data.Binary.Get as Get
import qualified Data.ByteString as B
import           Data.Dwarf.AT
import           Data.Dwarf.ATE
import           Data.Dwarf.CFA
import           Data.Dwarf.Form
import           Data.Dwarf.LNI
import           Data.Dwarf.OP
import           Data.Dwarf.Reader
import           Data.Dwarf.TAG
import           Data.Dwarf.Types
import           Data.Dwarf.Utils
import           Data.Int (Int64)
import qualified Data.Map as M
import           Data.Maybe (listToMaybe)
import           Data.String (IsString(..))
import           Data.Text (Text)
import qualified Data.Text as Text
import           Data.Traversable (traverse)
import           Data.Word (Word64)
import           GHC.Generics (Generic)
import           Numeric (showHex)
import           TextShow (TextShow(..))
import           TextShow.Generic (genericShowbPrec)

newtype CUOffset = CUOffset Word64
  deriving (Eq, Ord, Read, Show, Generic)

instance TextShow CUOffset where showbPrec = genericShowbPrec

-- Don't export a constructor, so users can only read DieID's, not
-- create fake ones, which is slightly safer.
dieID :: DieID -> Word64
dieID (DieID x) = x

inCU :: Integral a => CUOffset -> a -> DieID
inCU (CUOffset base) x = DieID $ base + fromIntegral x

data Sections = Sections
  { dsInfoSection :: B.ByteString
  , dsAbbrevSection :: B.ByteString
  , dsStrSection :: B.ByteString
  }

data CUContext = CUContext
  { cuOffset :: CUOffset
  , cuAbbrevMap :: M.Map AbbrevId DW_ABBREV
  , cuReader :: Reader
  , cuSections :: Sections
  }

---------------------------------------------------------------------------------------------------------------------------------------------------------------
-- Abbreviation and form parsing
---------------------------------------------------------------------------------------------------------------------------------------------------------------
newtype AbbrevId = AbbrevId Word64
  deriving (Eq, Ord, Read, Show, Generic)

instance TextShow AbbrevId where showbPrec = genericShowbPrec

data DW_ABBREV = DW_ABBREV
    { abbrevId        :: AbbrevId
    , abbrevTag       :: DW_TAG
    , abbrevChildren  :: Bool
    , abbrevAttrForms :: [(DW_AT, DW_FORM)]
    }

getMAbbrevId :: Get (Maybe AbbrevId)
getMAbbrevId = do
  i <- getULEB128
  pure $
    if i == 0
    then Nothing
    else Just $ AbbrevId i

getAbbrevList :: Get [DW_ABBREV]
getAbbrevList =
  whileJust $ traverse getAbbrev =<< getMAbbrevId
  where
    getAbbrev abbrev = do
      tag       <- getDW_TAG
      children  <- (== 1) <$> getWord8
      attrForms <- getAttrFormList
      pure $ DW_ABBREV abbrev tag children attrForms
    getAttrFormList =
      (fmap . map) (dw_at *** dw_form) . whileM (/= (0,0)) $
      (,) <$> getULEB128 <*> getULEB128


---------------------------------------------------------------------------------------------------------------------------------------------------------------
-- DWARF information entry and .debug_info section parsing.
---------------------------------------------------------------------------------------------------------------------------------------------------------------

-- | Utility function for retrieving the list of values for a specified attribute from a DWARF information entry.
(!?) :: DIE -> DW_AT -> [DW_ATVAL]
(!?) die at = map snd $ filter ((== at) . fst) $ dieAttributes die

getNonZeroOffset :: Reader -> Get (Maybe Word64)
getNonZeroOffset dr = do
  offset <- drGetOffset dr
  pure $ if offset == 0 then Nothing else Just offset

-- Section 7.19 - Name Lookup Tables
getNameLookupEntries :: Reader -> CUOffset -> Get [(Text, [DieID])]
getNameLookupEntries dr cu_offset =
  whileJust $ traverse getEntry =<< getNonZeroOffset dr
  where
    getEntry die_offset = do
      name <- getUTF8Str0
      pure (name, [inCU cu_offset die_offset])

-- The headers for "Section 7.19 Name Lookup Table", and "Section 7.20
-- Address Range Table" are very similar, this is the common format:
getTableHeader :: TargetSize -> EndianReader -> Get (Reader, CUOffset)
getTableHeader target64 der = do
  (desr, _) <- getUnitLength der
  let dr = reader target64 desr
  _version <- drGetW16 dr
  cu_offset <- drGetOffset dr
  return (dr, CUOffset cu_offset)

getNameLookupTable :: TargetSize -> EndianReader -> Get [M.Map Text [DieID]]
getNameLookupTable target64 der = getWhileNotEmpty $ do
  (dr, cu_offset) <- getTableHeader target64 der
  _debug_info_length <- drGetOffset dr
  M.fromListWith (++) <$> getNameLookupEntries dr cu_offset

parsePubSection :: Endianess -> TargetSize -> B.ByteString -> M.Map Text [DieID]
parsePubSection endianess target64 section =
  M.unionsWith (++) $ strictGet (getNameLookupTable target64 der) section
  where
    der = endianReader endianess

-- | Parses the .debug_pubnames section (as ByteString) into a map from a value name to a DieID
parsePubnames :: Endianess -> TargetSize -> B.ByteString -> M.Map Text [DieID]
parsePubnames = parsePubSection

-- | Parses the .debug_pubtypes section (as ByteString) into a map from a type name to a DieID
parsePubtypes :: Endianess -> TargetSize -> B.ByteString -> M.Map Text [DieID]
parsePubtypes = parsePubSection

align :: Integral a => a -> Get ()
align alignment = do
  pos <- Get.bytesRead
  Get.skip . fromIntegral $ (-pos) `mod` fromIntegral alignment

data Range = Range
  { rangeBegin :: !Word64
  , rangeEnd :: !Word64
  } deriving (Eq, Ord, Read, Show, Generic)

instance TextShow Range where showbPrec = genericShowbPrec

-- Section 7.20 - Address Range Table
-- Returns the ranges that belong to a CU
getAddressRangeTable :: TargetSize -> EndianReader -> Get [([Range], CUOffset)]
getAddressRangeTable target64 der = getWhileNotEmpty $ do
  (dr, cu_offset)   <- getTableHeader target64 der
  address_size      <- getWord8
  let
    readAddress =
      case address_size of
        4 -> fromIntegral <$> drGetW32 dr
        8 -> drGetW64 dr
        n -> fail $ "Unrecognized address size " ++ show n ++ " in .debug_aranges section."
  _segment_size     <- getWord8
  align $ 2 * address_size
  address_ranges <- whileM (/= Range 0 0) $ Range <$> readAddress <*> readAddress
  pure (address_ranges, cu_offset)

-- | Parses the .debug_aranges section (as ByteString) into a map from
-- an address range to a DieID that indexes the Info.
parseAranges ::
  Endianess -> TargetSize -> B.ByteString -> [([Range], CUOffset)]
parseAranges endianess target64 aranges_section =
    let dr = endianReader endianess
    in strictGet (getAddressRangeTable target64 dr) aranges_section

{-# ANN module ("HLint: ignore Use camelCase"::String) #-}

-- Section 7.21 - Macro Information
data DW_MACINFO
    = DW_MACINFO_define Word64 Text       -- ^ Line number and defined symbol with definition
    | DW_MACINFO_undef Word64 Text        -- ^ Line number and undefined symbol
    | DW_MACINFO_start_file Word64 Word64 -- ^ Marks start of file with the line where the file was included from and a source file index
    | DW_MACINFO_end_file                 -- ^ Marks end of file
    | DW_MACINFO_vendor_ext Word64 Text   -- ^ Implementation defined
    deriving (Eq, Ord, Read, Show, Generic)

instance TextShow DW_MACINFO where showbPrec = genericShowbPrec

-- | Retrieves the macro information for a compilation unit from a given substring of the .debug_macinfo section. The offset
-- into the .debug_macinfo section is obtained from the DW_AT_macro_info attribute of a compilation unit DIE.
parseMacInfo :: B.ByteString -> [DW_MACINFO]
parseMacInfo = strictGet getMacInfo

getMacInfo :: Get [DW_MACINFO]
getMacInfo = do
    x <- getWord8
    case x of
        0x00 -> pure []
        0x01 -> pure (:) <*> (pure DW_MACINFO_define     <*> getULEB128 <*> getUTF8Str0) <*> getMacInfo
        0x02 -> pure (:) <*> (pure DW_MACINFO_undef      <*> getULEB128 <*> getUTF8Str0) <*> getMacInfo
        0x03 -> pure (:) <*> (pure DW_MACINFO_start_file <*> getULEB128 <*> getULEB128)  <*> getMacInfo
        0x04 -> pure (:) <*>  pure DW_MACINFO_end_file                                   <*> getMacInfo
        0xff -> pure (:) <*> (pure DW_MACINFO_vendor_ext <*> getULEB128 <*> getUTF8Str0) <*> getMacInfo
        _ -> fail $ "Invalid MACINFO id: " ++ show x

data DW_CIEFDE
    = DW_CIE
        { cieAugmentation          :: Text
        , cieCodeAlignmentFactor   :: Word64
        , cieDataAlignmentFactor   :: Int64
        , cieReturnAddressRegister :: Word64
        , cieInitialInstructions   :: [DW_CFA]
        }
    | DW_FDE
        { fdeCiePointer      :: Word64
        , fdeInitialLocation :: Word64
        , fdeAddressRange    :: Word64
        , fdeInstructions    :: [DW_CFA]
        }
    deriving (Eq, Ord, Read, Show, Generic)

instance TextShow DW_CIEFDE where showbPrec = genericShowbPrec

getCIEFDE :: Endianess -> TargetSize -> Get DW_CIEFDE
getCIEFDE endianess target64 = do
    let der    = endianReader endianess
    (desr, endPos) <- getUnitLength der
    let dr     = reader target64 desr
    cie_id     <- drGetOffset dr
    if cie_id == drLargestOffset dr then do
        version                 <- getWord8
        augmentation            <- getUTF8Str0
        code_alignment_factor   <- getULEB128
        data_alignment_factor   <- getSLEB128
        return_address_register <- case version of
                                    1 -> fromIntegral <$> getWord8
                                    3 -> getULEB128
                                    n -> fail $ "Unrecognized CIE version " ++ show n
        curPos                  <- fromIntegral <$> Get.bytesRead
        raw_instructions        <- getByteString $ fromIntegral (endPos - curPos)
        let initial_instructions = strictGet (getWhileNotEmpty (getDW_CFA dr)) raw_instructions
        pure $ DW_CIE augmentation code_alignment_factor data_alignment_factor return_address_register initial_instructions
     else do
        initial_location        <- drGetTargetAddress dr
        address_range           <- drGetTargetAddress dr
        curPos                  <- fromIntegral <$> Get.bytesRead
        raw_instructions        <- getByteString $ fromIntegral (endPos - curPos)
        let instructions        = strictGet (getWhileNotEmpty (getDW_CFA dr)) raw_instructions
        pure $ DW_FDE cie_id initial_location address_range instructions

-- | Parse the .debug_frame section into a list of DW_CIEFDE records.
parseFrame ::
  Endianess -> TargetSize
  -> B.ByteString -- ^ ByteString for the .debug_frame section.
  -> [DW_CIEFDE]
parseFrame endianess target64 =
  strictGet . getWhileNotEmpty $ getCIEFDE endianess target64

newtype RangeEnd = RangeEnd Word64

-- Section 7.23 - Non-contiguous Address Ranges
-- | Retrieves the non-contiguous address ranges for a compilation unit from a given substring of the .debug_ranges section. The offset
-- into the .debug_ranges section is obtained from the DW_AT_ranges attribute of a compilation unit DIE.
-- Left results are base address entries. Right results are address ranges.
parseRanges :: Reader -> B.ByteString -> [Either RangeEnd Range]
parseRanges = strictGet . getRanges

getMRange :: Reader -> Get (Maybe (Either RangeEnd Range))
getMRange dr = do
  begin <- drGetTargetAddress dr
  end   <- drGetTargetAddress dr
  pure $
    if begin == 0 && end == 0
    then Nothing
    else Just $
      if begin == drLargestTargetAddress dr
      then Left $ RangeEnd end
      else Right $ Range begin end

getRanges :: Reader -> Get [Either RangeEnd Range]
getRanges dr = whileJust $ getMRange dr

-- Section 7.7.3
-- | Retrieves the location list expressions from a given substring of the .debug_loc section. The offset
-- into the .debug_loc section is obtained from an attribute of class loclistptr for a given DIE.
-- Left results are base address entries. Right results are address ranges and a location expression.
parseLoc :: Reader -> B.ByteString -> [Either RangeEnd (Range, B.ByteString)]
parseLoc dr = strictGet (getLoc dr)

getLoc :: Reader -> Get [Either RangeEnd (Range, B.ByteString)]
getLoc dr = whileJust $ traverse mkRange =<< getMRange dr
  where
    mkRange (Left end) = pure $ Left end
    mkRange (Right range) =
      Right . (,) range <$> getByteStringLen (drGetW16 dr)

data DIERefs = DIERefs
  { dieRefsParent       :: Maybe DieID   -- ^ Unique identifier of this entry's parent.
  , dieRefsSiblingLeft  :: Maybe DieID   -- ^ Unique identifier of the left sibling
  , dieRefsSiblingRight :: Maybe DieID   -- ^ Unique identifier of the right sibling
  , dieRefsDIE :: DIE
  } deriving (Show, Generic)

type DIEMap = M.Map DieID DIERefs
type DIECollector = WriterT DIEMap

-- | The dwarf information entries form a graph of nodes tagged with attributes. Please refer to the DWARF specification
-- for semantics. Although it looks like a tree, there can be attributes which have adjacency information which will
-- introduce cross-branch edges.
data DIE = DIE
    { dieId         :: DieID              -- ^ Unique identifier for this entry.
    , dieTag        :: DW_TAG              -- ^ Type tag.
    , dieAttributes :: [(DW_AT, DW_ATVAL)] -- ^ Attribute tag and value pairs.
    , dieChildren   :: [DIE]
    , dieReader     :: Reader         -- ^ Decoder used to decode this entry. May be needed to further parse attribute values.
    }
instance Show DIE where show = Text.unpack . showt
instance TextShow DIE where
    showb (DIE (DieID i) tag attrs children _) =
        mconcat $ mconcat
        [ [ "DIE@", fromString (showHex i ""), "{", showb tag, " (", showb (length children), " children)"]
        , mconcat
          [ [" ", showb attr, "=(", showb val, ")"]
          | (attr, val) <- attrs
          ]
        , [ "}" ]
        ]

addRefs :: Maybe DieID -> [DIE] -> [DIERefs]
addRefs mParent = go Nothing
  where
    go _lSibling [] = []
    go lSibling (die : xs) =
      DIERefs mParent lSibling (dieId <$> listToMaybe xs) die :
      go (Just (dieId die)) xs

withToldRefs :: (Applicative m, Monad m) => Maybe DieID -> [DIE] -> DIECollector m [DIE]
withToldRefs mParent dies =
  dies <$
  (Writer.tell . M.fromList . map (dieId . dieRefsDIE &&& id) . addRefs mParent) dies

-- Decode a non-compilation unit DWARF information entry, its children and its siblings.
getDieAndSiblings :: DieID -> CUContext -> DIECollector Get [DIE]
getDieAndSiblings parent cuContext =
  withToldRefs (Just parent) =<< (whileJust . getDIEAndDescendants) cuContext

getForm :: CUContext -> DW_FORM -> Get DW_ATVAL
getForm
  cuContext@CUContext { cuReader = dr, cuOffset = cu, cuSections = dc }
  form
  = case form of
    DW_FORM_addr         -> DW_ATVAL_UINT <$> drGetTargetAddress dr
    DW_FORM_block1       -> DW_ATVAL_BLOB <$> getByteStringLen getWord8
    DW_FORM_block2       -> DW_ATVAL_BLOB <$> getByteStringLen (drGetW16 dr)
    DW_FORM_block4       -> DW_ATVAL_BLOB <$> getByteStringLen (drGetW32 dr)
    DW_FORM_block        -> DW_ATVAL_BLOB <$> getByteStringLen getULEB128
    DW_FORM_data1        -> DW_ATVAL_UINT . fromIntegral <$> getWord8
    DW_FORM_data2        -> DW_ATVAL_UINT . fromIntegral <$> drGetW16 dr
    DW_FORM_data4        -> DW_ATVAL_UINT . fromIntegral <$> drGetW32 dr
    DW_FORM_data8        -> DW_ATVAL_UINT <$> drGetW64 dr
    DW_FORM_udata        -> DW_ATVAL_UINT <$> getULEB128
    DW_FORM_sdata        -> DW_ATVAL_INT <$> getSLEB128
    DW_FORM_flag         -> DW_ATVAL_BOOL . (/= 0) <$> getWord8
    DW_FORM_string       -> DW_ATVAL_STRING <$> getUTF8Str0
    DW_FORM_ref1         -> DW_ATVAL_REF . inCU cu <$> getWord8
    DW_FORM_ref2         -> DW_ATVAL_REF . inCU cu <$> drGetW16 dr
    DW_FORM_ref4         -> DW_ATVAL_REF . inCU cu <$> drGetW32 dr
    DW_FORM_ref8         -> DW_ATVAL_REF . inCU cu <$> drGetW64 dr
    DW_FORM_ref_udata    -> DW_ATVAL_REF . inCU cu <$> getULEB128
    DW_FORM_ref_addr     -> DW_ATVAL_UINT <$> drGetOffset dr
    DW_FORM_sec_offset   -> DW_ATVAL_UINT <$> drGetOffset dr
    DW_FORM_exprloc      -> DW_ATVAL_BLOB <$> getByteStringLen getULEB128
    DW_FORM_flag_present -> pure $ DW_ATVAL_BOOL True
    DW_FORM_ref_sig8     -> DW_ATVAL_UINT <$> drGetW64 dr
    DW_FORM_indirect     -> getForm cuContext . dw_form =<< getULEB128
    DW_FORM_strp         -> do
      offset <- drGetOffset dr
      pure . DW_ATVAL_STRING .
        getAt getUTF8Str0 offset $ dsStrSection dc

getDIEAndDescendants :: CUContext -> DIECollector Get (Maybe DIE)
getDIEAndDescendants cuContext = do
  offset <- lift $ DieID . fromIntegral <$> Get.bytesRead
  let
    go abbrid = do
      let
        abbrev         = cuAbbrevMap cuContext M.! abbrid
        tag            = abbrevTag abbrev
        (attrs, forms) = unzip $ abbrevAttrForms abbrev
      values          <- lift $ mapM (getForm cuContext) forms
      children <-
        if abbrevChildren abbrev
        then getDieAndSiblings offset cuContext
        else pure []
      pure $ DIE offset tag (zip attrs values) children dr
  traverse go =<< lift getMAbbrevId
  where
    dr = cuReader cuContext

getCUHeader ::
  EndianReader -> Sections ->
  Get (CUOffset, M.Map AbbrevId DW_ABBREV, Reader)
getCUHeader der dwarfSections = do
  cu_offset       <- CUOffset . fromIntegral <$> Get.bytesRead
  (desr, _)       <- getUnitLength der
  _version        <- desrGetW16 desr
  abbrev_offset   <- desrGetOffset desr
  let abbrev_map   = M.fromList . map (abbrevId &&& id) .
                     getAt getAbbrevList abbrev_offset $
                     dsAbbrevSection dwarfSections
  addr_size       <- getWord8
  dr              <- case addr_size of
                      4 -> pure $ reader TargetSize32 desr
                      8 -> pure $ reader TargetSize64 desr
                      _ -> fail $ "Invalid address size: " ++ show addr_size
  return (cu_offset, abbrev_map, dr)

-- TODO: Why not return CUs rather than DIE's?
-- Decode the compilation unit DWARF information entries.
getDieCus :: EndianReader -> Sections -> DIECollector Get [DIE]
getDieCus der dwarfSections =
  withToldRefs Nothing <=<
  whileJust . condAct (lift Get.isEmpty) $ do
    (cu_offset, abbrev_map, dr) <- lift $ getCUHeader der dwarfSections
    maybe (fail "Compilation Unit must have a DIE") return =<<
      getDIEAndDescendants CUContext
        { cuReader = dr
        , cuAbbrevMap = abbrev_map
        , cuOffset = cu_offset
        , cuSections = dwarfSections
        }

-- | Parses the .debug_info section (as ByteString) using the .debug_abbrev and .debug_str sections.
parseInfo :: Endianess -> Sections -> ([DIE], DIEMap)  -- ^ The die list is of compilation unit dies
parseInfo endianess dwarfSections =
  strictGet act $ dsInfoSection dwarfSections
  where
    act = runWriterT $ getDieCus dr dwarfSections
    dr = endianReader endianess