编译原理词法分析与LL1文法python简单实现与感想
从北京交流回来,原本似乎很有志气的我:“取中华尊为器,中南海为酒,饮与东西之城区,卧枕房山而眠”,(就是描写出河北转一转之后回房山某理工学校睡觉)。回来发现自己已经变成憨憨,科研竞赛似乎没我啥事,人也不大熟几个,保研也是吹了,一下子变成:“泪灌秋草难再荣,落叶凄惨满枝头。寒意必罪秋风盛,何干与我赤子心。” 消沉完了,日子还得过,继续生活吧,博客停了好久,得更新更新。
词法分析
课程中老师只让对一个简单的C语言子集合,在这里我参考了一片文章中的状态机思想,改写了一下,感觉效果emmm还说的过去,这里先给出代码,不过这个菜代码大家看看有没有可取之处就取一下,肯定bug残缺一堆,但是对我测试的例子是对的:
# -*- coding: utf-8 -*-
import string
keyword = ("auto", "break", "case", "char", "const", "continue", "default",
"do", "double", "else", "enum", "extern", "float", "for",
"goto", "if", "int", "long", "register", "return", "short",
"signed", "static", "sizeof", "struct", "switch", "typedef", "union",
"unsigned", "void", "volatile", "while") # c语言的32个关键字
user_dict = {} # 用户标识符字典
uint_dict = {} # 无符号常数字典
idlist = [] # 用户标识符列表
uintlist = [] # 无符号整数列表
'''
SS: 起始状态
FS: 接受状态
I: 接受字符
T: 转换表 //如"8": {"3": 10, "5": 9}表示状态8输入I[3]可以转换到状态10, 状态8输入I[3]可以转换到状态9
S: 状态表 //下标对应的状态的含义
'''
state_machine = {
"SS": 0,
"T": {
0: {
"2": 1,
"0": 3,
"4": 5,
"5": 6,
"6": 7,
"7": 8,
"8": 11,
"3": 14,
"1": 15,
"default": 0,
},
1: {
"0": 1,
"2": 1,
"default": 2,
},
3: {
"0": 3,
"default": 4,
},
8: {
"8": 9,
"default": 10,
},
11: {
"8": 12,
"default": 13,
}
},
"I": {
"0": ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0'],
"1": ['@', '#', '$', "^", "&", "~"],
"2": ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'],
"3": ['{', '}', '[', ']', ';', '(', ')', ',', ' ', '.', '\n', '"', "'"],
"4": ['+'],
"5": ['-'],
"6": ['*'],
"7": ['<'],
"8": ['='],
},
"FS": (2, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15),
"S": ["start", 'id_s', 'id', 'uint_s', 'uint', 'op+', 'op-', 'op*', 'op<_s', 'op<=', 'op<', 'op=_s', 'op_==', 'op=', 'div', 'error']
}
def delComment(content):
# 去除代码中的注释
state = 0
index = -1
for c in content:
index = index + 1
if state == 0:
if c == '/':
state = 1
startIndex = index
elif state == 1:
if c == '*':
state = 2
else:
state = 0
elif state == 2:
if c == '*':
state = 3
else:
pass
elif state == 3:
if c == '/':
endIndex = index + 1
comment = content[startIndex:endIndex]
content = content.replace(comment, '') # 将注释替换为空,并且将下标移动
index = startIndex - 1
state = 0
elif c == '*':
pass
else:
state = 2
return content
def getConnent(filename):
content = ''
f = open(filename, 'r')
file = f.readlines()
for line in file:
if line != '\n':
content = "%s%s" % (content, line.lstrip())
# else:
# content = "%s%s" % (content, line)
f.close()
return content
def analysis(content):
o = open("output.txt", 'w+')
curr_status = 0
word = ""
l = len(content)
x = 0
while x < l:
c = content[x]
flag = 'default'
for i in state_machine["I"]:
alpha = state_machine["I"][i]
if c in alpha:
flag = i
break
if flag in state_machine['T'][curr_status]:
curr_status = state_machine['T'][curr_status][flag]
else:
curr_status = state_machine['T'][curr_status]['default']
if curr_status in state_machine["FS"]:
state = state_machine["S"][curr_status]
if curr_status == 2:
if word in keyword:
res = "<res," + word + ">"
elif word not in user_dict:
res = "<"+state+","+word+">"
idlist.append(word)
user_dict[word] = 0
else:
res = "<"+state+","+word+">"
elif curr_status == 4:
if word not in uint_dict:
res = "<"+state+","+word+">"
uintlist.append(word)
uint_dict[word] = 0
else:
res = "<" + state + "," + word + ">"
elif curr_status in (5, 6, 7, 14, 15): # 针对这几个立即接受的做判断
if c != '\n' and c != " ":
res = "<" + state + "," + c + ">"
else:
res = None
x += 1
else:
res = "<" + state + "," + word + ">"
# x -= 1
if res is not None:
o.writelines(res + '\n')
print(res)
curr_status = 0
word = ""
c = ""
x -= 1
word += c
x += 1
o.close()
if __name__ == "__main__":
content = getConnent('./test.c')
content = delComment(content)
i = open('idlist.txt', 'w+')
u = open('uintlist.txt', 'w+')
print('=======================')
print("output")
print("Enter means \\n, Space means space")
analysis(content)
print('=======================')
print("idlist")
print(idlist)
print('=======================')
print("uintlist")
print(uintlist)
for x in idlist:
i.writelines(x+'\t' + '\n')
for x in uintlist:
u.writelines(x+'\t' + '\n')
i.close()
u.close()
基本思想就是读一个转化一个状态,到接受状态就做一下处理,这么长的原因是那个状态机写的有点高,写扁了就好看了应该,这个是老师那个ppt上一个只有15个状态的非常简单的状态机,应付检查,效率之类的怕是很低,我写完之后用一个“设计”(结果符合预期)的c文件进去,还是对的,对了,把ppt的图放上去,万一用的是一个ppt😂
但是程序的思想应该是对的,剥离出来词法分析的状态转移部分,这样可以让控制程序获得一定的独立性,改了语言也能很快的修改。
LL(1) 语法分析器
First 集合
有机会会在这里给出具体的解释,这个东西还是精妙,这篇文章只是给出程序,如果要进行LL(1)那肯定就要有LL(1)分析表,想要LL(1)分析表就需要First集和Follow集,而First又是求First集的前提,所以先给出First集的求解程序。
首先还是要定义文法,比如下面这个(简单)文法: \(\begin{align*} G[E]: & E \rightarrow TA \\ & A \rightarrow +TA | \varepsilon \\ & T \rightarrow FB \\ & B \rightarrow *FB | \varepsilon \\ & F \rightarrow (E) | i \end{align*}\) 我中了字典的毒,就这么定义这个文法:
'''
语法定义
"SS"为起始符
"T"为转换条件,如
"A": {
["+", "T", "A"],
["e"]
}
代表 A->+TA|e
(e代表空)
"VT"为终止符
"VN"为非终止符
'''
grammer = {
"SS": 'E',
"T": {
"E": {
0: ["T", "A"]
},
"A": {
0: ["+", "T", "A"],
1: ["∑"]
},
"T": {
0: ["F", "B"]
},
"B": {
0: ["*", "F", "B"],
1: ["∑"]
},
"F": {
0: ["(", "E", ")"],
1: ["i"]
}
},
"VN": ["E", "A", "T", "B", "F"],
"VT": ["i", "+", "*", "(", ")", "#"],
"NN": ["∑"]
}
这样定义其实还是清晰的,这次分离的更干净,那么计算First集合的函数呢?
def first(grammer, state): # 计算一个 state 的 first 集合
ans = set()
if state in grammer["VT"]:
return {state}
next_dict = grammer["T"][state]
for key in next_dict:
next_list = next_dict[key]
for x in next_list:
if x in grammer["VT"]:
ans.add(x)
break
elif x in grammer["NN"]:
ans.add(x)
break
else:
res = first(grammer, x)
if grammer["NN"][0] not in res:
# 这代表没有办法自动匹配,
# 如X->Y1Y2..如果First(Y1)没有 NN
# 那么没有办法匹配 Y2 的First集,break
ans = ans.union(res)
break
else:
ans = ans.union(res.remove(grammer["NN"][0]))
# 注意这里就没有break了
return ans
意外的简单,python还是不错,使用这个函数就可以计算出来First集,set真好用
Follow集
计算Flollow集合我个人感觉难多了,可能我选用的方法不太对,但是也是建立在有First这个函数的前提下,这里不涉及计算的算法,还是只给出程序:
def follow(grammer, fllow_set):
nums = [len(fllow_set[x]) for x in fllow_set]
for key in grammer["T"]:
nex_dict = grammer["T"][key]
for seq in nex_dict:
nex_list = nex_dict[seq]
length = len(nex_list)
for x in range(length - 1):
char = nex_list[x]
if char in grammer["VN"]:
loc = x + 1
res = first(grammer, nex_list[loc])
if grammer["NN"][0] in res:
fllow_set[char] = fllow_set[char].union(fllow_set[key])
res.remove(grammer["NN"][0])
fllow_set[char] = fllow_set[char].union(res)
x = length - 1
if nex_list[x] in grammer["VN"]:
fllow_set[nex_list[x]] = fllow_set[nex_list[x]].union(
fllow_set[key])
n_nums = [len(fllow_set[x]) for x in fllow_set]
if n_nums == nums:
return fllow_set
else:
return follow(grammer, fllow_set)
我是用的算法是需要多次循环递归,直到Follow集合不变就输出,跟First集计算不一样,跑完了,所有的非终止符的Follow集合就出来了,原理等下一篇文章写。
LL(1)表计算
有了前面两个的铺垫,这个就好计算了,不过还是需要了解具体怎么计算的,这里给出程序:
def ll1(grammer, first_set, fllow_set, ll1_table):
vt_list = grammer["VT"]
tran = grammer["T"]
for vn in tran:
nex_dict = tran[vn]
res = first_set[vn]
if grammer["NN"][0] in res:
fow = fllow_set[vn]
for a in fow:
if a in grammer["VT"]:
ll1_table[vn][a] = vn + "->" + "∑"
res.remove(grammer["NN"][0])
for a in res:
express = ""
for key in nex_dict:
nex_list = nex_dict[key]
for x in nex_list:
if x in grammer["VN"]:
res = first_set[x]
if a in res:
express = "".join(nex_list)
break
else:
if x == a:
express = "".join(nex_list)
break
ll1_table[vn][a] = vn + "->"+express
return ll1_table
通过这个程序最后综合就可以得到一个完整的计算LL(1)分析表的程序,计算应该是对的,下面给出完整的程序。
完整程序
# -*- coding: utf-8 -*-
import string
'''
语法定义
"SS"为起始符
"T"为转换条件,如
"A": {
["+", "T", "A"],
["e"]
}
代表 A->+TA|e
(e代表空)
"VT"为终止符
"VN"为非终止符
'''
grammer = {
"SS": 'E',
"T": {
"E": {
0: ["T", "A"]
},
"A": {
0: ["+", "T", "A"],
1: ["∑"]
},
"T": {
0: ["F", "B"]
},
"B": {
0: ["*", "F", "B"],
1: ["∑"]
},
"F": {
0: ["(", "E", ")"],
1: ["i"]
}
},
"VN": ["E", "A", "T", "B", "F"],
"VT": ["i", "+", "*", "(", ")", "#"],
"NN": ["∑"]
}
# 书中习题2的文法,在这里测试程序的通用性
grammer_t = {
"SS": 'E',
"T": {
"E": {
0: ["T", "A"]
},
"A": {
0: ["+", "E"],
1: ["∑"]
},
"T": {
0: ["F", "B"]
},
"B": {
0: ["T"],
1: ["∑"]
},
"F": {
0: ["P", "C"],
},
"C": {
0: ["*", "C"],
1: ["∑"]
},
"P": {
0: ["(", "E", ")"],
1: ["a"],
2: ["b"],
3: ["^"]
}
},
"VN": ["E", "A", "T", "B", "F", "C", "P"],
"VT": ["+", "*", "(", ")", "^", "a", "b", "#"],
"NN": ["∑"]
}
fllow_set = {}
first_set = {}
ll1_table = {}
def first(grammer, state): # 计算一个 state 的 first 集合
ans = set()
if state in grammer["VT"]:
return {state}
next_dict = grammer["T"][state]
for key in next_dict:
next_list = next_dict[key]
for x in next_list:
if x in grammer["VT"]:
ans.add(x)
break
elif x in grammer["NN"]:
ans.add(x)
break
else:
res = first(grammer, x)
if grammer["NN"][0] not in res:
# 这代表没有办法自动匹配,
# 如X->Y1Y2..如果First(Y1)没有 NN
# 那么没有办法匹配 Y2 的First集,break
ans = ans.union(res)
break
else:
ans = ans.union(res.remove(grammer["NN"][0]))
# 注意这里就没有break了
return ans
def follow(grammer, fllow_set):
nums = [len(fllow_set[x]) for x in fllow_set]
for key in grammer["T"]:
nex_dict = grammer["T"][key]
for seq in nex_dict:
nex_list = nex_dict[seq]
length = len(nex_list)
for x in range(length - 1):
char = nex_list[x]
if char in grammer["VN"]:
loc = x + 1
res = first(grammer, nex_list[loc])
if grammer["NN"][0] in res:
fllow_set[char] = fllow_set[char].union(fllow_set[key])
res.remove(grammer["NN"][0])
fllow_set[char] = fllow_set[char].union(res)
x = length - 1
if nex_list[x] in grammer["VN"]:
fllow_set[nex_list[x]] = fllow_set[nex_list[x]].union(
fllow_set[key])
n_nums = [len(fllow_set[x]) for x in fllow_set]
if n_nums == nums:
return fllow_set
else:
return follow(grammer, fllow_set)
def ll1(grammer, first_set, fllow_set, ll1_table):
vt_list = grammer["VT"]
tran = grammer["T"]
for vn in tran:
nex_dict = tran[vn]
res = first_set[vn]
if grammer["NN"][0] in res:
fow = fllow_set[vn]
for a in fow:
if a in grammer["VT"]:
ll1_table[vn][a] = vn + "->" + "∑"
res.remove(grammer["NN"][0])
for a in res:
express = ""
for key in nex_dict:
nex_list = nex_dict[key]
for x in nex_list:
if x in grammer["VN"]:
res = first_set[x]
if a in res:
express = "".join(nex_list)
break
else:
if x == a:
express = "".join(nex_list)
break
ll1_table[vn][a] = vn + "->"+express
return ll1_table
if __name__ == "__main__":
grammer = grammer_t # 记得修改呦!自己用的时候~
for x in grammer["VN"]:
fllow_set[x] = set()
fllow_set[grammer["SS"]].add("#")
fllow_set = follow(grammer, fllow_set)
for x in grammer["VN"]:
first_set[x] = first(grammer, x)
ll1_table[x] = {}
for y in grammer["VT"]:
ll1_table[x][y] = "error"
ll1_table = ll1(grammer, first_set, fllow_set, ll1_table)
print('=======================')
print("first set")
print('=======================')
for key in grammer["VN"]:
print(key, first(grammer, key))
print('=======================')
print("fllow set")
print('=======================')
for key in fllow_set:
print(key, fllow_set[key])
print('=======================')
print("ll1 table")
print("VN\VT", end="\t")
for x in grammer["VT"]:
print(x, end="\t")
print()
for vn in ll1_table:
nex_dict = ll1_table[vn]
print(vn, end="\t")
for key in nex_dict:
print(nex_dict[key], end="\t")
print()
print('=======================')
在这里grammer是那个简单的文法,而grammer_t如程序中写的是一个习题的文法,话说我自己写出来这个程序能求对这些玩意,我当时手算就不会了,结果靠自己的程序来跑出来结果 😂
结尾
这样就完成了这么一个作业,写的本菜鸡还是废了点时间,至于说开头发的牢骚,诶,只能继续生存下去了,似乎还想继续上学,所以应该会去考研,真是鸡儿难,愿上帝庇佑我。