July 31, 2020
After a few iterations, we now have a parser that
takes a Java source file and parses it in a way that distinguishes between comments and
code. The previous parser produces a
[String], where each String is either a comment or code.
But all the parts have the same type, which means that without reparsing somehow –
at a minimum, looking at the first few characters of each String – there is no way
to tell whether a given String is code or comment.
In this version, the Java source is parsed to an AST, although the "AST" is not very Abstract and is certainly not a Tree – about all it does involve is Syntax. The program starts off just as in the previous two cases (with one small change in red):
{-# LANGUAGE LambdaCase #-}
import System.Environment
import System.Directory
import Text.ParserCombinators.Parsec
import Control.Monad
main = do
args <- getArgs
(argsValid args) >>= \case
False -> putStrLn "Give me a single valid file name."
True -> do
contents <- readFile $ head args
let result = parseJava contents
case result of
Left err -> putStrLn (show(err))
Right valid -> mapM_ putStr $ map show valid
argsValid :: [String] -> IO Bool
argsValid names = do
if (null names) || (length names /= 1)
then return False
else doesFileExist $ head $ names
The parser is also nearly identical, but the top level of the parser no longer returns
[String]; instead it returns [ParsedJava] (or
here is the complete version):
data ParsedJava = SLComment String | MLComment String |
JavaCode String | WhiteSpace String
instance Show ParsedJava where
show (SLComment s) = s
show (MLComment s) = s
show (JavaCode s) = s
show (WhiteSpace s) = s
parseJava :: String -> Either ParseError [ParsedJava]
parseJava input = parse parseJavaInput "" input
parseJavaInput :: Parser [ParsedJava]
parseJavaInput = manyTill javaPart eof
javaPart :: Parser ParsedJava
javaPart = parseSLComment <|> parseMLComment <|> javaCode
parseSLComment :: Parser ParsedJava
parseSLComment = do
try $ string "//"
guts <- manyTill anyChar (try $ string "\n")
return $ SLComment ("//" ++ guts ++ "\n")
parseMLComment :: Parser ParsedJava
parseMLComment = do
try $ string "/*"
guts <- manyTill anyChar (try $ string "*/")
return $ MLComment ("/*" ++ guts ++ "*/")
javaCode :: Parser ParsedJava
javaCode = do
x <- many1 javaBite
return $ JavaCode $ concat x
javaBite :: Parser String
javaBite = stringCode <|> quoteCode <|> nonStringCode
stringCode :: Parser String
stringCode = do
try $ char '"'
x <- manyTill stringChar (string "\"")
return ("\"" ++ concat x ++ "\"")
stringChar :: Parser String
stringChar = stringNonEscape <|> stringEscape
stringNonEscape :: Parser String
stringNonEscape = do
x <- noneOf "\\\""
return [x]
stringEscape :: Parser String
stringEscape = do
d <- char '\\'
c <- oneOf "\\\"0nrvtbf'u"
return [d,c]
quoteCode :: Parser String
quoteCode = do
try $ char '\''
x <- manyTill quoteChar (string "'")
return ("'" ++ concat x ++ "'")
quoteChar :: Parser String
quoteChar = quoteNonEscape <|> quoteEscape
quoteNonEscape :: Parser String
quoteNonEscape = do
x <- noneOf "\\'"
return [x]
quoteEscape :: Parser String
quoteEscape = do
d <- char '\\'
c <- oneOf "\\\"0nrvtbf'u"
return [d,c]
nonStringCode :: Parser String
nonStringCode = (many1 $ noneOf "/'\"") <|> try goodSlash
goodSlash :: Parser String
goodSlash = do
x <- char '/'
y <- noneOf "/*"
return [x,y]