{-# LANGUAGE BangPatterns               #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE OverloadedStrings          #-}
{-# LANGUAGE PatternGuards              #-}
{-# LANGUAGE ScopedTypeVariables        #-}
{-# LANGUAGE ViewPatterns               #-}

{-# OPTIONS_GHC -funbox-strict-fields #-}

module FastTags
    ( isHsFile
    , isLiterateFile
    , merge
    , TokenVal(..)
    , TagVal(..)
    , Type(..)
    , Tag(..)
    , Pos(..)
    , SrcPos(..)
    , UnstrippedTokens(..)
    , breakString
    , stripComments
    , processFile
    , processAll
    , process
    , tokenize
    , stripCpp
    , annotate
    , stripNewlines
    , breakBlocks
    , unstrippedTokensOf
    , split
    )
where

import Control.Arrow ((***), (&&&))
import Control.Monad
import Control.DeepSeq (NFData, rnf)
import Data.Function (on)
import Data.Functor ((<$>))
import Data.Monoid (Monoid, (<>), mconcat)
import Data.Text (Text)
import qualified System.Exit as Exit

import qualified Language.Preprocessor.Unlit as Unlit
import Text.Printf (printf)

import qualified Control.Exception as Exception
import qualified Data.Char as Char
import qualified Data.IntSet as IntSet
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified System.IO as IO


-- * types

data TagVal = TagVal
    !Text -- ^ prefix
    !Text -- ^ name
    !Type -- ^ tag type
    deriving (Show)

instance NFData TagVal where
    rnf (TagVal x y z) = rnf x `seq` rnf y `seq` rnf z

instance Eq TagVal where
    TagVal _ name t == TagVal _ name' t' = name == name' && t == t'

instance Ord TagVal where
    compare (TagVal _ name t) (TagVal _ name' t') =
        name `compare` name' <> t `compare` t'

-- don't swap constructors since we rely that Type < Constructor == True holds
data Type =
    Function
    | Type
    | Constructor
    | Class
    | Module
    | Operator
    | Pattern
    deriving (Eq, Ord, Show)

instance NFData Type where
    rnf t = t `seq` ()

data TokenVal =
    Token !Text !Text
    | Newline !Int -- ^ indentation
    deriving (Show)

tokenName :: TokenVal -> Text
tokenName (Token _ name) = name
tokenName _              = error "cannot extract name from non-Token TokenVal"

data Tag =
    Tag !(Pos TagVal)
    | RepeatableTag !(Pos TagVal)
    | Warning !String
    deriving (Show, Eq, Ord)

partitionTags :: [Tag] -> ([Pos TagVal], [Pos TagVal], [String])
partitionTags ts = go ts [] [] []
    where
    go []                     xs ys zs = (xs, ys, reverse zs)
    go (Tag t : ts)           xs ys zs = go ts (t:xs) ys     zs
    go (RepeatableTag t : ts) xs ys zs = go ts xs     (t:ys) zs
    go (Warning warn : ts)    xs ys zs = go ts xs     ys     (warn:zs)

type Token = Pos TokenVal

-- | Newlines have to remain in the tokens because 'breakBlocks' relies on
-- them.  But they make pattern matching on the tokens unreliable because
-- newlines might be anywhere.  A newtype makes sure that the tokens only get
-- stripped once and that I don't do any pattern matching on unstripped tokens.
newtype UnstrippedTokens = UnstrippedTokens [Token]
    deriving (Show, Monoid)

mapTokens :: ([Token] -> [Token]) -> UnstrippedTokens -> UnstrippedTokens
mapTokens f (UnstrippedTokens tokens) = UnstrippedTokens (f tokens)

unstrippedTokensOf :: UnstrippedTokens -> [Token]
unstrippedTokensOf (UnstrippedTokens tokens) = tokens

-- | Drop @n@ non-newline tokens.
dropTokens :: Int -> UnstrippedTokens -> UnstrippedTokens
dropTokens n = mapTokens (f n)
    where
    f :: Int -> [Token] -> [Token]
    f 0 xs                       = xs
    f _ []                       = []
    f n (Pos _ (Newline _) : xs) = f n xs
    f n (Pos _ (Token _ _) : xs) = f (n - 1) xs

type Line = Pos Text

data Pos a = Pos {
    posOf :: !SrcPos
    , valOf :: !a
    } deriving (Eq, Ord)

instance (NFData a) => NFData (Pos a) where
    rnf (Pos x y) = rnf x `seq` rnf y

data SrcPos = SrcPos {
    _posFile :: !FilePath
    , posLine  :: !Int
    } deriving (Eq, Ord)

instance NFData SrcPos where
    rnf (SrcPos x y) = rnf x `seq` rnf y

instance Show a => Show (Pos a) where
    show (Pos pos val) = show pos ++ ":" ++ show val
instance Show SrcPos where
    show (SrcPos fn line) = fn ++ ":" ++ show line

-- * process

-- | Global processing for when all tags are together.
processAll :: [[Pos TagVal]] -> [Pos TagVal]
processAll =
    sortDups . dropDups isDuplicatePair
        . combineBalanced (mergeOn tagSortingKey)
        . map (dropDups isDuplicatePair)
    where
    isDuplicatePair :: Pos TagVal -> Pos TagVal -> Bool
    isDuplicatePair t t' =
        posOf t == posOf t'
        && tagText t == tagText t'
        && tagType t == tagType t'

combineBalanced :: forall a. (a -> a -> a) -> [a] -> a
combineBalanced f xs = go xs
    where
    go :: [a] -> a
    go [] = error "cannot combine empty list"
    go xs@(_:_) = case combine xs of
        []  -> error "unexpected empty list when combining nonempty lists"
        [x] -> x
        xs' -> go xs'
    combine :: [a] -> [a]
    combine []        = []
    combine [x]       = [x]
    combine (x:x':xs) = f x x' : combine xs

-- | Given multiple matches, vim will jump to the first one.  So sort adjacent
-- tags with the same text by their type.
--
-- Mostly this is so that given a type with the same name as its module,
-- the type will come first.
sortDups :: [Pos TagVal] -> [Pos TagVal]
sortDups = concatMap (sortOn tagType) .  M.elems . M.fromAscListWith (++)
    . map (fst . tagSortingKey &&& (:[]))

tagText :: Pos TagVal -> Text
tagText (Pos _ (TagVal _ text _)) = text

tagType :: Pos TagVal -> Type
tagType (Pos _ (TagVal _ _ t)) = t

tagLine :: Pos TagVal -> Int
tagLine (Pos (SrcPos _ line) _) = line

-- | Read tags from one file.
processFile :: Bool -> FilePath -> Bool -> IO ([Pos TagVal], [String])
processFile ignoreEncodingErrors fn trackPrefixes =
    fmap (process fn trackPrefixes) (T.readFile fn)
    `Exception.catch` \(exc :: Exception.SomeException) -> do
        -- readFile will crash on files that are not UTF8.  Unfortunately not
        -- all haskell source file are.
        IO.hPutStrLn IO.stderr $
            "exception reading " ++ show fn ++ ": " ++ show exc
        unless ignoreEncodingErrors $
            void $ Exit.exitFailure
        return ([], [])

tagSortingKey :: Pos TagVal -> (Text, Type)
tagSortingKey (Pos _ (TagVal _ name t)) = (name, t)

-- | Process one file's worth of tags.
process :: FilePath -> Bool -> Text -> ([Pos TagVal], [String])
process fn trackPrefixes =
    splitAndRemoveRepeats . concatMap blockTags . breakBlocks . stripComments
        . mconcat . map (tokenize trackPrefixes) . stripCpp . annotate fn
        . unlit'
    where
    splitAndRemoveRepeats :: [Tag] -> ([Pos TagVal], [String])
    splitAndRemoveRepeats tags =
        (mergeOn tagSortingKey (sortOn tagSortingKey newTags) earliestRepeats,
            warnings)
        where
        (newTags, repeatableTags, warnings) = partitionTags tags
        earliestRepeats :: [Pos TagVal]
        earliestRepeats = M.elems $ M.fromListWith minLine $
            map (tagSortingKey &&& id) repeatableTags
        minLine x y
            | tagLine x < tagLine y = x
            | otherwise             = y
    unlit' :: Text -> Text
    unlit' s
        | isLiterateFile fn = T.pack $ Unlit.unlit fn $ T.unpack s'
        | otherwise = s
        where
        s' :: Text
        s'  | "\\begin{code}" `T.isInfixOf` s && "\\end{code}" `T.isInfixOf` s =
                T.unlines $ filter (not . birdLiterateLine) $ T.lines s
            | otherwise = s
        birdLiterateLine :: Text -> Bool
        birdLiterateLine xs
            | T.null xs = False
            | otherwise = case headt $ T.dropWhile Char.isSpace xs of
                Just '>' -> True
                _ -> False

-- * tokenize

annotate :: FilePath -> Text -> [Line]
annotate fn text =
    [Pos (SrcPos fn num) line | (num, line) <- zip [1..] (T.lines text)]

-- | Also strips out hsc detritus.
stripCpp :: [Line] -> [Line]
stripCpp = filter $ not . ("#" `T.isPrefixOf`) . valOf

tokenize :: Bool -> Line -> UnstrippedTokens
tokenize trackPrefixes (Pos pos line) =
  UnstrippedTokens $ map (Pos pos) (tokenizeLine trackPrefixes line)

spanToken :: Text -> (Text, Text)
spanToken text
    | T.null text = ("", "")
    -- Special case to prevent "--" from consuming too much input so that
    -- closing "-}" becomes unavailable.
    | Just rest <- T.stripPrefix "--}" text = ("-}", rest)
    | Just (sym, rest) <- consume comments text = (sym, rest)
    -- Find symbol that isn't followed by haskellOpChar.
    | Just (sym, rest) <- consume symbols text
            , maybe True (not . haskellOpChar) (headt rest)
        = (sym, rest)
    | c == '\'' = let (token, rest) = breakChar cs in (T.cons c token, rest)
    | c == '"' =
        let (token, rest) = breakString cs in (T.cons c token, rest)
    | state@(token, _) <- spanSymbol (haskellOpChar c) text,
        not (T.null token) = state
    -- This will tokenize differently than haskell should, e.g., 9x will
    -- be "9x" not "9" "x".  But I just need a wordlike chunk, not an
    -- actual token.  Otherwise I'd have to tokenize numbers.
    | otherwise = case T.span (identChar $ Char.isUpper c) text of
        ("", _)       -> (T.singleton c, cs)
        (token, rest) -> (token, rest)
    where
    -- Safe because of null check above.
    Just (c, cs) = T.uncons text
    comments = ["{-", "-}"]
    symbols = ["--", "=>", "->", "::"]

tokenizeLine :: Bool -> Text -> [TokenVal]
tokenizeLine trackPrefixes text = Newline nspaces : go spaces line
    where
    nspaces = fromIntegral $ T.count " " spaces + T.count "\t" spaces * 8
    (spaces, line) = T.break (not . Char.isSpace) text
    go :: Text -> Text -> [TokenVal]
    go oldPrefix unstripped
        | T.null stripped = []
        | otherwise       =
            let (token, rest) = spanToken stripped
                newPrefix     = if trackPrefixes
                                then oldPrefix <> spaces <> token
                                else T.empty
            in Token newPrefix token : go newPrefix rest
        where (spaces, stripped) = T.break (not . Char.isSpace) unstripped

startIdentChar :: Char -> Bool
startIdentChar c = Char.isAlpha c || c == '_'

identChar :: Bool -> Char -> Bool
identChar considerDot c = Char.isAlphaNum c || c == '\'' || c == '_' || c == '#'
    || considerDot && c == '.'

-- unicode operators are not supported yet
haskellOpChar :: Char -> Bool
haskellOpChar c = IntSet.member (Char.ord c) opChars
    where
    opChars :: IntSet.IntSet
    opChars = IntSet.fromList $ map Char.ord "-!#$%&*+./<=>?@^|~:\\"

isTypeVarStart :: Text -> Bool
isTypeVarStart x = case headt x of
    Just c -> Char.isLower c || c == '_'
    _ -> False

-- | Span a symbol, making sure to not eat comments.
spanSymbol :: Bool -> Text -> (Text, Text)
spanSymbol considerColon text
    | Just res <- haskellOp text [] = res
    | any (`T.isPrefixOf` post) [",", "--", "-}", "{-"] = split
    | Just (c, cs) <- T.uncons post, c == '-' || c == '{' =
        let (pre2, post2) = spanSymbol considerColon cs
        in (pre <> T.cons c pre2, post2)
    | otherwise = split
    where
    split@(pre, post) = T.break
        (\c -> T.any (==c) "-{," || not (symbolChar considerColon c)) text

    haskellOp :: Text -> [Char] -> Maybe (Text, Text)
    haskellOp txt op
      | Just (c, cs) <- T.uncons txt
      , haskellOpChar c
      , not $ "-}" `T.isPrefixOf` txt =
          haskellOp cs $ c : op
      | null op   = Nothing
      | otherwise = Just (T.pack $ reverse op, txt)

symbolChar :: Bool -> Char -> Bool
symbolChar considerColon c =
    (Char.isSymbol c || Char.isPunctuation c)
    && (not (c `elem` ("(),;[]`{}_:\"'"::String)) || considerColon && c == ':')

breakChar :: Text -> (Text, Text)
breakChar text = case headt text of
    Nothing -> ("", "")
    Just '\\' -> T.splitAt 3 text
    _ -> T.splitAt 2 text

-- TODO \ continuation isn't supported.  I'd have to tokenize at the file
-- level instead of the line level.
breakString :: Text -> (Text, Text)
breakString = (T.pack . reverse *** T.pack) . go [] . T.unpack
    where
    go :: String -> String -> (String, String)
    go s []             = (s, [])
    go s ('"':xs)       = ('"': s, xs)
    go s ('\\':[])      = (s, "\\")
    -- handle string continuation
    go s ('\\':'\n':xs) = go s $ dropBackslash $
        dropWhile (\c -> c /= '\\' && c /= '"') xs
    go s ('\\':x:xs)    = go (x : '\\': s) xs
    go s (x:xs)         = go (x : s) xs

    dropBackslash :: String -> String
    dropBackslash ('\\':xs) = xs
    dropBackslash xs        = xs

stripComments :: UnstrippedTokens -> UnstrippedTokens
stripComments = mapTokens (go 0)
    where
    go :: Int -> [Token] -> [Token]
    go _ [] = []
    go nest (pos@(Pos _ token) : rest)
        | token `nameStartsWith` "{-"                     = go (nest + 1) rest
        | token `nameEndsWith` "-}"                       = go (nest - 1) rest
        | nest == 0 && tokenNameSatisfies token isComment =
            go nest (dropLine rest)
        | nest > 0                                        = go nest rest
        | otherwise                                       = pos : go nest rest
    dropLine :: [Token] -> [Token]
    dropLine = dropWhile (not . isNewline)
    isComment :: Text -> Bool
    isComment name =
        "--" `T.isPrefixOf` name
        && T.all (\c -> not (haskellOpChar c) || c == '-') (T.drop 2 name)

-- | Break the input up into blocks based on indentation.
breakBlocks :: UnstrippedTokens -> [UnstrippedTokens]
breakBlocks =
    map UnstrippedTokens . filter (not . null)
        . go . filterBlank . unstrippedTokensOf
    where
    go :: [Token] -> [[Token]]
    go []     = []
    go tokens = pre : go post
        where (pre, post) = breakBlock tokens
    -- Blank lines mess up the indentation.
    filterBlank :: [Token] -> [Token]
    filterBlank [] = []
    filterBlank (Pos _ (Newline _) : xs@(Pos _ (Newline _) : _)) =
        filterBlank xs
    filterBlank (x:xs) = x : filterBlank xs

-- | Take until a newline, then take lines until the indent established after
-- that newline decreases. Or, alternatively, if "{" is encountered then count
-- it as a block until closing "}" is found taking nesting into account.
breakBlock :: [Token] -> ([Token], [Token])
breakBlock (t@(Pos _ tok) : ts) = case tok of
    Newline indent -> collectIndented indent ts
    Token _ "{"    -> collectBracedBlock breakBlock ts 1
    _              -> remember t $ breakBlock ts
    where
    collectIndented :: Int -> [Token] -> ([Token], [Token])
    collectIndented indent tsFull@(t@(Pos _ tok) : ts) = case tok of
        Newline n | n <= indent -> ([], tsFull)
        Token _ "{" ->
            remember t $ collectBracedBlock (collectIndented indent) ts 1
        _           -> remember t $ collectIndented indent ts
    collectIndented _ [] = ([], [])

    collectBracedBlock :: ([Token] -> ([Token], [Token])) -> [Token] -> Int
        -> ([Token], [Token])
    collectBracedBlock _    []                           _ = ([], [])
    collectBracedBlock cont ts                           0 = cont ts
    collectBracedBlock cont (t@(Pos _ (Token _ "{")) : ts) n =
      remember t $ collectBracedBlock cont ts $! n + 1
    collectBracedBlock cont (t@(Pos _ (Token _ "}")) : ts) n =
      remember t $ collectBracedBlock cont ts $! n - 1
    collectBracedBlock cont (t:ts)                       n =
      remember t $ collectBracedBlock cont ts n

    remember :: Token -> ([Token], [Token]) -> ([Token], [Token])
    remember t (xs, ys) = (t : xs, ys)
breakBlock [] = ([], [])

-- * extract tags

-- | Get all the tags in one indented block.
-- TODO clean this up to require less nesting, and dropDataContext duplication
blockTags :: UnstrippedTokens -> [Tag]
blockTags unstripped = case stripNewlines unstripped of
    [] -> []
    Pos _ (Token _ "module") : Pos pos (Token prefix name) : _ ->
        [mkTag pos prefix (snd (T.breakOnEnd "." name)) Module]
    Pos _ (Token _ "pattern") : Pos pos (Token prefix name) : _
        | maybe False Char.isUpper (headt name) ->
            [mkTag pos prefix name Pattern]
    Pos _ (Token _ "foreign") : decl -> foreignTags decl
    -- newtype instance * = ...
    Pos _ (Token _ "newtype") : Pos _ (Token _ "instance")
            : (dropDataContext -> Pos pos _: rest) ->
        newtypeTags pos rest
    -- newtype X * = X *
    Pos _ (Token _ "newtype")
            : (dropDataContext -> whole@(tok@(Pos pos (Token _ name)) : rest))
        | isTypeName name -> tokToTag tok Type : newtypeTags pos rest
        | otherwise -> tok' : newtypeTags pos' rest'
        where (pos', tok', rest') = recordInfixName Type whole
    -- type family X ...
    Pos _ (Token _ "type") : Pos _ (Token _ "family")
            : (dropDataContext -> whole@(tok@(Pos _ (Token _ name)) : _))
        | isTypeFamilyName name -> [tokToTag tok Type]
        | otherwise -> [tok']
        where (_, tok', _) = recordInfixName Type whole
    -- type X * = ...
    Pos _ (Token _ "type")
            : (dropDataContext -> whole@(tok@(Pos _ (Token _ name)) : _))
        | isTypeName name -> [tokToTag tok Type]
        | otherwise -> [tok']
        where (_, tok', _) = recordInfixName Type whole
    -- data family X ...
    Pos _ (Token _ "data") : Pos _ (Token _ "family")
            : (dropDataContext -> tok@(Pos _ (Token _ name)) : rest)
        | isTypeFamilyName name -> [tokToTag tok Type]
        | otherwise -> [tok']
        where (_, tok', _) = recordInfixName Type rest
    -- data instance * = ...
    -- data instance * where ...
    Pos _ (Token _ "data") : Pos _ (Token _ "instance")
            : (dropDataContext -> Pos pos _: _) ->
        dataConstructorTags pos (dropTokens 2 unstripped)
    -- data X * = X { X :: *, X :: * }
    -- data X * where ...
    Pos _ (Token _ "data")
            : (dropDataContext -> tok@(Pos pos (Token _ name)) : rest)
        | isTypeName name -> tokToTag tok Type
            : dataConstructorTags pos (dropTokens 2 unstripped)
      -- if token after data is not a type name then it isn't
      -- infix type as well since it may be only '(' or some
      -- lowercase name, either of which is not type constructor
        | otherwise -> let (pos', tok, _) = recordInfixName Type rest
           in tok : dataConstructorTags pos' (dropTokens 1 unstripped)
    -- class * => X where X :: * ...
    Pos pos (Token _ "class") : _ -> classTags pos (dropTokens 1 unstripped)

    Pos _ (Token _ "infix") : _ -> []
    Pos _ (Token _ "infixl") : _ -> []
    Pos _ (Token _ "infixr") : _ -> []
    -- instance * where data * = X :: * ...
    Pos pos (Token _ "instance") : _ ->
        instanceTags pos (dropTokens 1 unstripped)
    -- x, y, z :: *
    stripped -> toplevelFunctionTags stripped

isTypeFamilyName :: Text -> Bool
isTypeFamilyName = maybe False (\c -> Char.isUpper c || haskellOpChar c) . headt

isTypeName  :: Text -> Bool
isTypeName x = case headt x of
    Just c -> Char.isUpper c || c == ':'
    _ -> False

dropDataContext :: [Token] -> [Token]
dropDataContext = stripParensKindsTypeVars . stripOptContext

recordInfixName :: Type -> [Token] -> (SrcPos, Tag, [Token])
recordInfixName tokenType tokens = (pos, tokToTag tok tokenType, rest)
    where (tok@(Pos pos _) : rest) = dropInfixTypeStart tokens

-- same as dropWhile with counting
dropInfixTypeStart :: [Token] -> [Token]
dropInfixTypeStart tokens = dropWhile f tokens
    where
    f (Pos _ (Token _ name)) = isInfixTypePrefix name || name == "`"
    f _                      = False

    isInfixTypePrefix :: Text -> Bool
    isInfixTypePrefix =
        maybe False ((\c -> Char.isLower c || c == '(')) . headt

-- | It's easier to scan for tokens without pesky newlines popping up
-- everywhere.  But I need to keep the newlines in in case I hit a @where@
-- and need to call 'breakBlocks' again.
stripNewlines :: UnstrippedTokens -> [Token]
stripNewlines = filter (not . isNewline) . (\(UnstrippedTokens t) -> t)

-- | Tags from foreign import.
--
-- e.g. @foreign import ccall safe \"name\" c_name :: ...@ will produce a tag
-- for @c_name@.
foreignTags :: [Token] -> [Tag]
foreignTags decl = case decl of
    Pos _ (Token _ "import") : decl'
        | name : _ <- dropBefore ((=="::") . tokenName . valOf) decl' ->
            [tokToTag name Pattern]
    _ -> []

toplevelFunctionTags :: [Token] -> [Tag]
toplevelFunctionTags toks = case tags of
    -- Tags of toplevel functions are all repeatable, even the ones that come
    -- from the type signature because there will definitely be tags from the
    -- body and they should be sorted out if type signature is present.
    [] -> functionTagsNoSig toks
    _  -> map toRepeatableTag $ tags
    where
    -- first try to detect tags from type signature, if it fails then
    -- do the actual work of detecting from body
    (tags, _) = functionTags False toks
    toRepeatableTag :: Tag -> Tag
    toRepeatableTag (Tag t) = RepeatableTag t
    toRepeatableTag t       = t

functionTagsNoSig :: [Token] -> [Tag]
functionTagsNoSig toks = go toks
    where
    go :: [Token] -> [Tag]
    go []                           = []
    go toks@(Pos _ (Token _ "(") : _) = go $ stripBalancedParens toks
    go (Pos pos (Token prefix name) : ts)
        -- this function does not analyze type signatures
        | name == "::" = []
        | name == "!" || name == "~" || name == "@" = go ts
        | name == "=" || name == "|" = case stripParens toks of
            Pos pos (Token prefix name) : _
                | functionName False name  ->
                    [mkRepeatableTag pos prefix name Function]
                | T.all haskellOpChar name ->
                    [mkRepeatableTag pos prefix name Operator]
            _ -> []
        | name == "`" = case ts of
            Pos pos' (Token prefix' name') : _
                | functionName False name' ->
                    [mkRepeatableTag pos' prefix' name' Function]
            _ -> go ts
      | T.all haskellOpChar name   = [mkRepeatableTag pos prefix name Operator]
      | otherwise                  = go ts
    stripParens :: [Token] -> [Token]
    stripParens = dropWhile ((`hasName` "(") . valOf)

-- | Get tags from a function type declaration: token , token , token ::
-- Return the tokens left over.
functionTags :: Bool -- ^ expect constructors, not functions
    -> [Token] -> ([Tag], [Token])
functionTags constructors = go []
    where
    opTag   = if constructors then Constructor else Operator
    funcTag = if constructors then Constructor else Function
    go :: [Tag] -> [Token] -> ([Tag], [Token])
    go tags (Pos _ (Token _ "(") : Pos pos (Token _ name)
            : Pos _ (Token prefix ")") : Pos _ (Token _ "::") : rest) =
        (reverse $ mkTag pos prefix name opTag : tags, rest)
    go tags (Pos pos (Token prefix name) : Pos _ (Token _ "::") : rest)
        | functionName constructors name =
            (reverse $ mkTag pos prefix name funcTag : tags, rest)
    go tags (Pos _ (Token _ "(") : Pos pos (Token _ name)
            : Pos _ (Token prefix ")") : Pos _ (Token _ ",") : rest) =
        go (mkTag pos prefix name opTag : tags) rest
    go tags (Pos pos (Token prefix name) : Pos _ (Token _ ",") : rest)
        | functionName constructors name =
            go (mkTag pos prefix name funcTag : tags) rest
    go tags tokens = (tags, tokens)

functionName :: Bool -> Text -> Bool
functionName constructors text = isFunction text
    where
    isFunction text = case T.uncons text of
        Just (c, cs) ->
            firstChar c && startIdentChar c && T.all (identChar True) cs
        Nothing      -> False
    firstChar = if constructors
                then Char.isUpper
                else \c -> Char.isLower c || c == '_'

-- | * = X *
newtypeTags :: SrcPos -> [Token] -> [Tag]
newtypeTags prevPos tokens = case dropUntil "=" tokens of
    Pos pos (Token prefix name) : rest ->
        let constructor = mkTag pos prefix name Constructor
        in  case rest of
            Pos _ (Token _ "{") : Pos funcPos (Token funcPrefix funcName) : _ ->
                [constructor, mkTag funcPos funcPrefix funcName Function]
            _ ->
                [constructor]
    rest -> unexpected prevPos (UnstrippedTokens tokens) rest "newtype * ="

-- | [] (empty data declaration)
-- * = X { X :: *, X :: * }
-- * where X :: * X :: *
-- * = X | X
dataConstructorTags :: SrcPos -> UnstrippedTokens -> [Tag]
dataConstructorTags prevPos unstripped
    -- GADT
    | any ((`hasName` "where") . valOf) (unstrippedTokensOf unstripped) =
        concatMap gadtTags (whereBlock unstripped)
    -- plain ADT
    | otherwise = case strip unstripped of
        [] -> [] -- empty data declaration
        rest | Just (Pos pos (Token prefix name), rest') <-
                extractInfixConstructor rest ->
            mkTag pos prefix name Constructor : collectRest rest'
        Pos pos (Token prefix name) : rest ->
            mkTag pos prefix name Constructor : collectRest rest
        rest -> unexpected prevPos unstripped rest "data * ="
    where
    strip = stripOptBang . stripOptContext . stripOptForall . dropUntil "="
         . stripNewlines
    collectRest :: [Token] -> [Tag]
    collectRest tokens
        | (tags@(_:_), rest) <- functionTags False tokens =
            tags ++ collectRest (dropUntilNextField rest)
    collectRest (Pos pipePos (Token _ "|") : rest)
        | Just (Pos pos (Token prefix name), rest'') <-
                extractInfixConstructor rest' =
            mkTag pos prefix name Constructor : collectRest rest''
        | Pos pos (Token prefix name) : rest'' <- rest' =
            mkTag pos prefix name Constructor
                : collectRest (dropUntilNextCaseOrRecordStart rest'')
        | otherwise = error $
            printf "syntax error@%d: | not followed by tokens\n"
                (posLine pipePos)
        where
        rest' = stripOptBang $ stripOptContext $ stripOptForall rest
    collectRest (_ : rest) = collectRest rest
    collectRest [] = []

    stripOptBang :: [Token] -> [Token]
    stripOptBang ((Pos _ (Token _ "!")) : rest) = rest
    stripOptBang ts = ts

    extractInfixConstructor :: [Token] -> Maybe (Token, [Token])
    extractInfixConstructor = extract . stripTypeParam
        where
        extract :: [Token] -> Maybe (Token, [Token])
        extract (tok@(Pos _ (Token _ name)) : rest)
            | ":" `T.isPrefixOf` name = Just (tok, stripTypeParam rest)
        extract (Pos _ (Token _ "`") : tok@(Pos _ _) : Pos _ (Token _ "`")
                : rest) =
            Just (tok, stripTypeParam rest)
        extract _ = Nothing
        stripTypeParam :: [Token] -> [Token]
        stripTypeParam input@((Pos _ (Token _ "(")) : _) =
            stripBalancedParens input
        stripTypeParam input@((Pos _ (Token _ "[")) : _) =
            stripBalancedBrackets input
        stripTypeParam ts = drop 1 ts

    dropUntilNextCaseOrRecordStart :: [Token] -> [Token]
    dropUntilNextCaseOrRecordStart =
        dropWithStrippingBalanced (\x -> not $ x == "|" || x == "{")

    dropUntilNextField :: [Token] -> [Token]
    dropUntilNextField =
        dropWithStrippingBalanced (\x -> not $ x == "," || x == "}" || x == "|")

stripOptForall :: [Token] -> [Token]
stripOptForall (Pos _ (Token _ "forall") : rest) = dropUntil "." rest
stripOptForall xs                               = xs

stripParensKindsTypeVars :: [Token] -> [Token]
stripParensKindsTypeVars (Pos _ (Token _ "(") : xs)  =
    stripParensKindsTypeVars xs
stripParensKindsTypeVars (Pos _ (Token _ "::") : xs) =
    stripParensKindsTypeVars $ tail $ dropWithStrippingBalanced (/= ")") xs
stripParensKindsTypeVars (Pos _ (Token _ name) : xs)
    | isTypeVarStart name = stripParensKindsTypeVars xs
stripParensKindsTypeVars xs = xs

stripOptContext :: [Token] -> [Token]
stripOptContext (stripBalancedParens -> (Pos _ (Token _ "=>") : xs)) = xs
stripOptContext (stripSingleClassContext -> Pos _ (Token _ "=>") : xs) = xs
stripOptContext xs = xs

stripSingleClassContext :: [Token] -> [Token]
stripSingleClassContext (Pos _ (Token _ name) : xs)
    | maybe False Char.isUpper (headt name) =
        dropWithStrippingBalanced isTypeVarStart xs
stripSingleClassContext xs = xs

-- | Drop all tokens for which @pred@ returns True, also drop
-- any parenthesized expressions.
dropWithStrippingBalanced :: (Text -> Bool) -> [Token] -> [Token]
dropWithStrippingBalanced pred input@(Pos _ (Token _ name) : xs)
    | name == "(" = dropWithStrippingBalanced pred $ stripBalancedParens input
    | name == "[" = dropWithStrippingBalanced pred $ stripBalancedBrackets input
    | pred name   = dropWithStrippingBalanced pred xs
dropWithStrippingBalanced _ xs = xs

stripBalancedParens :: [Token] -> [Token]
stripBalancedParens = stripBalanced "(" ")"

stripBalancedBrackets :: [Token] -> [Token]
stripBalancedBrackets = stripBalanced "[" "]"

stripBalanced :: Text -> Text -> [Token] -> [Token]
stripBalanced open close (Pos _ (Token _ name) : xs)
    | name == open = go 1 xs
    where
    go :: Int -> [Token] -> [Token]
    go 0 xs = xs
    go !n (Pos _ (Token _ name) : xs)
        | name == open  = go (n + 1) xs
        | name == close = go (n - 1) xs
        | otherwise     = go n       xs
    go n (_: xs)                     = go n xs
    go _ []                          = []
stripBalanced _ _ xs = xs

gadtTags :: UnstrippedTokens -> [Tag]
gadtTags = fst . functionTags True . stripNewlines

-- | * => X where X :: * ...
classTags :: SrcPos -> UnstrippedTokens -> [Tag]
classTags prevPos unstripped =
    case dropDataContext $ stripNewlines unstripped of
        whole@(tok@(Pos _ (Token _ name)) : _)
          -- Drop the where and start expecting functions.
            | isTypeName name -> tokToTag tok Class : cont
            | otherwise ->
                let (_, tok, _) = recordInfixName Class whole
                in tok : concatMap classBodyTags (whereBlock unstripped)
            where cont = concatMap classBodyTags (whereBlock unstripped)
        rest -> unexpected prevPos unstripped rest "class * =>"

classBodyTags :: UnstrippedTokens -> [Tag]
classBodyTags unstripped = case stripNewlines unstripped of
    Pos _ (Token _ typedata) : Pos pos (Token prefix name) : _
        | typedata `elem` ["type", "data"] -> [mkTag pos prefix name Type]
    tokens -> fst $ functionTags False tokens

-- | Skip to the where and split the indented block below it.
whereBlock :: UnstrippedTokens -> [UnstrippedTokens]
whereBlock = breakBlocks . mapTokens (dropUntil "where")

instanceTags :: SrcPos -> UnstrippedTokens -> [Tag]
instanceTags prevPos unstripped =
    -- instances can offer nothing but some fresh data constructors since
    -- the actual datatype is really declared in the class declaration
    concatMap (newtypeTags prevPos . unstrippedTokensOf)
        (filter isNewtypeDecl block)
    ++ concatMap (dataConstructorTags prevPos)
        (filter isDataDecl block)
    where
    block = whereBlock unstripped

    isNewtypeDecl :: UnstrippedTokens -> Bool
    isNewtypeDecl (UnstrippedTokens (Pos _ (Token _ "newtype") : _)) = True
    isNewtypeDecl _ = False

    isDataDecl :: UnstrippedTokens -> Bool
    isDataDecl (UnstrippedTokens (Pos _ (Token _ "data") : _)) = True
    isDataDecl _ = False

-- * util

mkTag :: SrcPos -> Text -> Text -> Type -> Tag
mkTag pos prefix name typ = Tag $ Pos pos (TagVal prefix name typ)

mkRepeatableTag :: SrcPos -> Text -> Text -> Type -> Tag
mkRepeatableTag pos prefix name typ =
    RepeatableTag $ Pos pos (TagVal prefix name typ)

tokToTag :: Token -> Type -> Tag
tokToTag (Pos pos (Token prefix name)) t = mkTag pos prefix name t

warning :: SrcPos -> String -> Tag
warning pos warn = Warning $ show pos ++ ": " ++ warn

unexpected :: SrcPos -> UnstrippedTokens -> [Token] -> String -> [Tag]
unexpected prevPos (UnstrippedTokens tokensBefore) tokensHere declaration =
    [warning pos ("unexpected " ++ thing ++ " after " ++ declaration)]
    where
    thing = maybe "end of block" (show . valOf) (mhead tokensHere)
    pos
        | Just t <- mhead tokensHere = posOf t
        | Just t <- mlast tokensBefore = posOf t
        | otherwise = prevPos

isNewline :: Token -> Bool
isNewline (Pos _ (Newline _)) = True
isNewline _                   = False

hasName :: TokenVal -> Text -> Bool
hasName tok text = tokenNameSatisfies tok (== text)

nameStartsWith :: TokenVal -> Text -> Bool
nameStartsWith tok text = tokenNameSatisfies tok (text `T.isPrefixOf`)

nameEndsWith :: TokenVal -> Text -> Bool
nameEndsWith tok text = tokenNameSatisfies tok (text `T.isSuffixOf`)

tokenNameSatisfies :: TokenVal -> (Text -> Bool) -> Bool
tokenNameSatisfies (Token _ name) pred = pred name
tokenNameSatisfies _              _    = False

-- * generic utils

-- | Try to match one of the given prefixes.
consume :: [Text] -> Text -> Maybe (Text, Text) -- ^ (prefix, remainder)
consume prefixes t =
    msum [(,) prefix <$> T.stripPrefix prefix t | prefix <- prefixes]

-- | Drop until the element before the matching one.  Return [] if the function
-- never matches.
dropBefore :: (a -> Bool) -> [a] -> [a]
dropBefore f = go
    where
    go [] = []
    go [_] = []
    go xs@(_ : rest@(y:_))
        | f y = xs
        | otherwise = go rest

dropDups :: (a -> a -> Bool) -> [a] -> [a]
dropDups cmp (x:xs) = go x xs
    where
    go a [] = [a]
    go a (b:bs)
        | cmp a b   = go a bs
        | otherwise = a : go b bs
dropDups _ [] = []

dropUntil :: Text -> [Token] -> [Token]
dropUntil token = drop 1 . dropWhile (not . (`hasName` token) . valOf)

sortOn :: (Ord k) => (a -> k) -> [a] -> [a]
sortOn key = L.sortBy (compare `on` key)

-- | Split list into chunks delimited by specified element.
split :: (Eq a) => a -> [a] -> [[a]]
split _ [] = []
split x xs = xs': split x (drop 1 xs'')
    where (xs', xs'') = break (==x) xs

-- | Crude predicate for Haskell files
isHsFile :: FilePath -> Bool
isHsFile fn =
  ".hs" `L.isSuffixOf` fn  || ".hsc" `L.isSuffixOf` fn || isLiterateFile fn

isLiterateFile :: FilePath -> Bool
isLiterateFile fn = ".lhs" `L.isSuffixOf` fn

merge :: Ord a => [a] -> [a] -> [a]
merge = mergeBy compare

mergeOn :: (Ord b) => (a -> b) -> [a] -> [a] -> [a]
mergeOn f = mergeBy (compare `on` f)

mergeBy :: (a -> a -> Ordering) -> [a] -> [a] -> [a]
mergeBy f xs ys = go xs ys
    where
    go []     ys     = ys
    go xs     []     = xs
    go (x:xs) (y:ys) = case f x y of
        EQ -> x: y: go xs ys
        LT -> x: go xs (y:ys)
        GT -> y: go (x:xs) ys

headt :: Text -> Maybe Char
headt = fmap fst . T.uncons

mhead :: [a] -> Maybe a
mhead [] = Nothing
mhead (x:_) = Just x

mlast :: [a] -> Maybe a
mlast xs
    | null xs = Nothing
    | otherwise = Just (last xs)