{-# OPTIONS #-}
-----------------------------------------------------------------------------
-- |
-- Module      : Language.Python.Common.StringEscape
-- Copyright   : (c) 2009 Bernie Pope
-- License     : BSD-style
-- Maintainer  : bjpop@csse.unimelb.edu.au
-- Stability   : experimental
-- Portability : ghc
--
-- Conversion to/from escaped characters in strings. Note: currently does not
-- support escaped Unicode character names.
--
-- See:
--
--    * Version 2.6 <http://docs.python.org/2.6/reference/lexical_analysis.html#string-literals>
--
--    * Version 3.1 <http://docs.python.org/3.1/reference/lexical_analysis.html#string-and-bytes-literals>
-----------------------------------------------------------------------------

module Language.Python.Common.StringEscape (
   -- * String conversion.
   unescapeString,
   unescapeRawString,
   -- * Digits allowed in octal and hex representation.
   octalDigits,
   hexDigits) where

import Numeric (readHex, readOct)

-- | Convert escaped sequences of characters into /real/ characters in a normal Python string.
--
-- Each recognised backslash escape is replaced by the character it denotes:
--
--   * the single-character escapes listed in @simpleEscapes@ below;
--
--   * a backslash followed by one to three octal digits;
--
--   * a backslash, an @x@, and one or two hexadecimal digits (either case);
--
--   * a backslash immediately followed by a newline (line continuation),
--     which produces no character at all.
--
-- Any other backslash is copied to the output unchanged and scanning resumes
-- at the character after it.
--
-- XXX does not handle escaped unicode literals
unescapeString :: String -> String
unescapeString [] = []
unescapeString ('\\':'\n':cs) = unescapeString cs -- line continuation
unescapeString ('\\':'x':rest@(h:_))
   | h `elem` hexDigits = unescapeNumeric 2 hexDigits (firstParse readHex) rest
unescapeString ('\\':rest@(c:cs))
   | Just plain <- lookup c simpleEscapes = plain : unescapeString cs
   | c `elem` octalDigits = unescapeNumeric 3 octalDigits (firstParse readOct) rest
-- Ordinary characters, and backslashes that did not start a recognised escape.
unescapeString (c:cs) = c : unescapeString cs

-- Single-character escapes: the character that follows the backslash, paired
-- with the character the whole escape sequence stands for.
simpleEscapes :: [(Char, Char)]
simpleEscapes =
   [ ('\\', '\\') -- Backslash
   , ('\'', '\'') -- Single quote (')
   , ('"', '"')   -- Double quote (")
   , ('a', '\a')  -- ASCII Bell (BEL)
   , ('b', '\b')  -- ASCII Backspace (BS)
   , ('f', '\f')  -- ASCII Formfeed (FF)
   , ('n', '\n')  -- ASCII Linefeed (LF)
   , ('r', '\r')  -- ASCII Carriage Return (CR)
   , ('t', '\t')  -- ASCII Horizontal Tab (TAB)
   , ('v', '\v')  -- ASCII Vertical Tab (VT)
   ]

-- | This function is a placeholder for unescaping characters in raw strings.
-- The Python documentation explicitly says that
-- \"When an @r@ or @R@ prefix is present, a character following a backslash is included
-- in the string without change, and all backslashes are left in the string.\"
-- However it also says that when an @r@ or @R@ prefix is used in conjunction with
-- a @u@ or @U@ prefix, then the \\uXXXX and \\UXXXXXXXX escape sequences are processed
-- while all other backslashes are left in the string. Currently the function is the identity
-- but it ought to process unicode escape sequences.
--
-- XXX does not handle escaped unicode literals
unescapeRawString :: String -> String
unescapeRawString = id

{-
-- Disabled alternative implementation, kept for reference.
-- | Convert escaped sequences of characters into /real/ characters in a raw Python string.
-- Note: despite their name, Python raw strings do allow a small set of character escapings,
-- namely the single and double quote characters and the line continuation marker.
unescapeRawString ('\\':'\'':cs) = '\'' : unescapeRawString cs -- Single quote (')
unescapeRawString ('\\':'"':cs) = '"' : unescapeRawString cs -- Double quote (")
unescapeRawString ('\\':'\n':cs) = unescapeRawString cs -- line continuation
unescapeRawString (c:cs) = c : unescapeRawString cs
unescapeRawString [] = []
-}

-- Decode one numeric escape and carry on unescaping whatever follows it.
--
-- This is a bit complicated because Python allows between 1 and 3 octal
-- characters after the backslash, and 1 and 2 hex characters after a
-- backslash-x.
--
-- Arguments, in order: the maximum number of digits the escape may have, the
-- characters that count as digits, a function turning the digit string into
-- its numeric value, and the input positioned at the first digit.
unescapeNumeric :: Int -> String -> (String -> Int) -> String -> String
unescapeNumeric maxDigits numericDigits readNumeric str =
   toEnum (readNumeric numeral) : unescapeString (leftover ++ beyond)
   where
   -- Never look further than the longest escape allowed ...
   (window, beyond) = splitAt maxDigits str
   -- ... and stop early at the first character that is not a digit.
   (numeral, leftover) = span (`elem` numericDigits) window

-- Total replacement for @fst . head . reader@: the value of the first parse,
-- or 0 if the reader finds nothing to parse.
firstParse :: ReadS Int -> String -> Int
firstParse reader digits =
   case reader digits of
      (value, _) : _ -> value
      [] -> 0

octalDigits, hexDigits :: String
-- | The set of valid octal digits in Python.
octalDigits = "01234567"
-- | The set of valid hex digits in Python. Both lower-case and upper-case
-- letters are accepted.
hexDigits = "0123456789abcdefABCDEF"