编译原理词法分析与LL1文法python简单程序与感想

 

编译原理词法分析与LL1文法python简单实现与感想从北京交流回来,原本似乎很有志气的我:“取中华尊为器,中南海为酒,饮与东西之城区,卧枕房山而眠”,(就是描写出河北转一转之后回房山某理工学校睡觉)。回来发现自己已经变成憨憨,科研竞赛似乎没我啥事,人也不大熟几个,保研也是吹了,一下子变成:“泪灌秋草难再荣,落叶凄惨满枝头。寒意必罪秋风盛,何干与我赤子心。” 消沉完了,日子还得过,继续生活吧...

编译原理词法分析与LL1文法python简单实现与感想

从北京交流回来,原本似乎很有志气的我:“取中华尊为器,中南海为酒,饮与东西之城区,卧枕房山而眠”,(就是描写出河北转一转之后回房山某理工学校睡觉)。回来发现自己已经变成憨憨,科研竞赛似乎没我啥事,人也不大熟几个,保研也是吹了,一下子变成:“泪灌秋草难再荣,落叶凄惨满枝头。寒意必罪秋风盛,何干与我赤子心。” 消沉完了,日子还得过,继续生活吧,博客停了好久,得更新更新。

词法分析

课程中老师只让对一个简单的C语言子集合,在这里我参考了一片文章中的状态机思想,改写了一下,感觉效果emmm还说的过去,这里先给出代码,不过这个菜代码大家看看有没有可取之处就取一下,肯定bug残缺一堆,但是对我测试的例子是对的:

# -*- coding: utf-8 -*-

import string

keyword = ("auto", "break", "case", "char", "const", "continue", "default",
           "do", "double", "else", "enum", "extern", "float", "for",
           "goto", "if", "int", "long", "register", "return", "short",
           "signed", "static", "sizeof", "struct", "switch", "typedef", "union",
           "unsigned", "void", "volatile", "while")  # c语言的32个关键字

user_dict = {}  # 用户标识符字典
uint_dict = {}  # 无符号常数字典
idlist = []  # 用户标识符列表
uintlist = []  # 无符号整数列表

'''
SS: 起始状态
FS: 接受状态
I: 接受字符
T: 转换表   //如"8": {"3": 10, "5": 9}表示状态8输入I[3]可以转换到状态10, 状态8输入I[3]可以转换到状态9
S: 状态表    //下标对应的状态的含义
'''
state_machine = {
    "SS": 0,
    "T": {
        0: {
            "2": 1,
            "0": 3,
            "4": 5,
            "5": 6,
            "6": 7,
            "7": 8,
            "8": 11,
            "3": 14,
            "1": 15,
            "default": 0,
        },
        1: {
            "0": 1,
            "2": 1,
            "default": 2,
        },
        3: {
            "0": 3,
            "default": 4,
        },
        8: {
            "8": 9,
            "default": 10,
        },
        11: {
            "8": 12,
            "default": 13,
        }
    },
    "I": {
        "0": ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0'],
        "1": ['@', '#', '$', "^", "&", "~"],
        "2": ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
              'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'],
        "3": ['{', '}', '[', ']', ';', '(', ')', ',', ' ', '.', '\n', '"', "'"],
        "4": ['+'],
        "5": ['-'],
        "6": ['*'],
        "7": ['<'],
        "8": ['='],
    },
    "FS": (2, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15),
    "S": ["start", 'id_s', 'id', 'uint_s', 'uint', 'op+', 'op-', 'op*', 'op<_s', 'op<=', 'op<', 'op=_s', 'op_==', 'op=', 'div', 'error']
}


def delComment(content):
    # 去除代码中的注释
    state = 0
    index = -1

    for c in content:
        index = index + 1

        if state == 0:
            if c == '/':
                state = 1
                startIndex = index

        elif state == 1:
            if c == '*':
                state = 2
            else:
                state = 0

        elif state == 2:
            if c == '*':
                state = 3
            else:
                pass

        elif state == 3:
            if c == '/':
                endIndex = index + 1
                comment = content[startIndex:endIndex]
                content = content.replace(comment, '')  # 将注释替换为空,并且将下标移动
                index = startIndex - 1
                state = 0

            elif c == '*':
                pass
            else:
                state = 2
    return content


def getConnent(filename):
    content = ''
    f = open(filename, 'r')
    file = f.readlines()
    for line in file:
        if line != '\n':
            content = "%s%s" % (content, line.lstrip())
        # else:
        #     content = "%s%s" % (content, line)
    f.close()
    return content


def analysis(content):
    o = open("output.txt", 'w+')
    curr_status = 0
    word = ""
    l = len(content)
    x = 0
    while x < l:
        c = content[x]
        flag = 'default'
        for i in state_machine["I"]:
            alpha = state_machine["I"][i]
            if c in alpha:
                flag = i
                break
        if flag in state_machine['T'][curr_status]:
            curr_status = state_machine['T'][curr_status][flag]
        else:
            curr_status = state_machine['T'][curr_status]['default']
        if curr_status in state_machine["FS"]:
            state = state_machine["S"][curr_status]
            if curr_status == 2:
                if word in keyword:
                    res = "<res," + word + ">"
                elif word not in user_dict:
                    res = "<"+state+","+word+">"
                    idlist.append(word)
                    user_dict[word] = 0
                else:
                    res = "<"+state+","+word+">"
            elif curr_status == 4:
                if word not in uint_dict:
                    res = "<"+state+","+word+">"
                    uintlist.append(word)
                    uint_dict[word] = 0
                else:
                    res = "<" + state + "," + word + ">"
            elif curr_status in (5, 6, 7, 14, 15):  # 针对这几个立即接受的做判断
                if c != '\n' and c != " ":
                    res = "<" + state + "," + c + ">"
                else:
                    res = None
                x += 1
            else:
                res = "<" + state + "," + word + ">"
                # x -= 1
            if res is not None:
                o.writelines(res + '\n')
                print(res)
            curr_status = 0
            word = ""
            c = ""
            x -= 1
        word += c
        x += 1
    o.close()


if __name__ == "__main__":
    content = getConnent('./test.c')
    content = delComment(content)
    i = open('idlist.txt', 'w+')
    u = open('uintlist.txt', 'w+')
    print('=======================')
    print("output")
    print("Enter means \\n, Space means space")
    analysis(content)
    print('=======================')
    print("idlist")
    print(idlist)
    print('=======================')
    print("uintlist")
    print(uintlist)
    for x in idlist:
        i.writelines(x+'\t' + '\n')
    for x in uintlist:
        u.writelines(x+'\t' + '\n')
    i.close()
    u.close()

基本思想就是读一个转化一个状态,到接受状态就做一下处理,这么长的原因是那个状态机写的有点高,写扁了就好看了应该,这个是老师那个ppt上一个只有15个状态的非常简单的状态机,应付检查,效率之类的怕是很低,我写完之后用一个“设计”(结果符合预期)的c文件进去,还是对的,对了,把ppt的图放上去,万一用的是一个ppt😂

词法分析状态C语言子集

但是程序的思想应该是对的,剥离出来词法分析的状态转移部分,这样可以让控制程序获得一定的独立性,改了语言也能很快的修改。

LL(1) 语法分析器

First 集合

有机会会在这里给出具体的解释,这个东西还是精妙,这篇文章只是给出程序,如果要进行LL(1)那肯定就要有LL(1)分析表,想要LL(1)分析表就需要First集和Follow集,而First又是求First集的前提,所以先给出First集的求解程序。

首先还是要定义文法,比如下面这个(简单)文法: 我中了字典的毒,就这么定义这个文法:

'''
语法定义
"SS"为起始符
"T"为转换条件,如
"A": {
    ["+", "T", "A"],
    ["e"]
}
代表 A->+TA|e
(e代表空)
"VT"为终止符
"VN"为非终止符
'''
grammer = {
    "SS": 'E',
    "T": {
        "E": {
            0: ["T", "A"]
        },
        "A": {
            0: ["+", "T", "A"],
            1: ["∑"]
        },
        "T": {
            0: ["F", "B"]
        },
        "B": {
            0: ["*", "F", "B"],
            1: ["∑"]
        },
        "F": {
            0: ["(", "E", ")"],
            1: ["i"]
        }
    },
    "VN": ["E", "A", "T", "B", "F"],
    "VT": ["i", "+", "*", "(", ")", "#"],
    "NN": ["∑"]
}

这样定义其实还是清晰的,这次分离的更干净,那么计算First集合的函数呢?

def first(grammer, state):  # 计算一个 state 的 first 集合
    ans = set()
    if state in grammer["VT"]:
        return {state}
    next_dict = grammer["T"][state]
    for key in next_dict:
        next_list = next_dict[key]
        for x in next_list:
            if x in grammer["VT"]:
                ans.add(x)
                break
            elif x in grammer["NN"]:
                ans.add(x)
                break
            else:
                res = first(grammer, x)
                if grammer["NN"][0] not in res:
                    # 这代表没有办法自动匹配,
                    # 如X->Y1Y2..如果First(Y1)没有 NN
                    # 那么没有办法匹配 Y2 的First集,break
                    ans = ans.union(res)
                    break
                else:
                    ans = ans.union(res.remove(grammer["NN"][0]))
                    # 注意这里就没有break了
    return ans

意外的简单,python还是不错,使用这个函数就可以计算出来First集,set真好用

Follow集

计算Flollow集合我个人感觉难多了,可能我选用的方法不太对,但是也是建立在有First这个函数的前提下,这里不涉及计算的算法,还是只给出程序:

def follow(grammer, fllow_set):
    nums = [len(fllow_set[x]) for x in fllow_set]
    for key in grammer["T"]:
        nex_dict = grammer["T"][key]
        for seq in nex_dict:
            nex_list = nex_dict[seq]
            length = len(nex_list)
            for x in range(length - 1):
                char = nex_list[x]
                if char in grammer["VN"]:
                    loc = x + 1
                    res = first(grammer, nex_list[loc])
                    if grammer["NN"][0] in res:
                        fllow_set[char] = fllow_set[char].union(fllow_set[key])
                        res.remove(grammer["NN"][0])
                    fllow_set[char] = fllow_set[char].union(res)
            x = length - 1
            if nex_list[x] in grammer["VN"]:
                fllow_set[nex_list[x]] = fllow_set[nex_list[x]].union(
                    fllow_set[key])
    n_nums = [len(fllow_set[x]) for x in fllow_set]
    if n_nums == nums:
        return fllow_set
    else:
        return follow(grammer, fllow_set)

我是用的算法是需要多次循环递归,直到Follow集合不变就输出,跟First集计算不一样,跑完了,所有的非终止符的Follow集合就出来了,原理等下一篇文章写。

LL(1)表计算

有了前面两个的铺垫,这个就好计算了,不过还是需要了解具体怎么计算的,这里给出程序:

def ll1(grammer, first_set, fllow_set, ll1_table):
    vt_list = grammer["VT"]
    tran = grammer["T"]
    for vn in tran:
        nex_dict = tran[vn]
        res = first_set[vn]
        if grammer["NN"][0] in res:
            fow = fllow_set[vn]
            for a in fow:
                if a in grammer["VT"]:
                    ll1_table[vn][a] = vn + "->" + "∑"
            res.remove(grammer["NN"][0])
        for a in res:
            express = ""
            for key in nex_dict:
                nex_list = nex_dict[key]
                for x in nex_list:
                    if x in grammer["VN"]:
                        res = first_set[x]
                        if a in res:
                            express = "".join(nex_list)
                            break
                    else:
                        if x == a:
                            express = "".join(nex_list)
                            break
            ll1_table[vn][a] = vn + "->"+express
    return ll1_table

通过这个程序最后综合就可以得到一个完整的计算LL(1)分析表的程序,计算应该是对的,下面给出完整的程序。

完整程序

# -*- coding: utf-8 -*-

import string

'''
语法定义
"SS"为起始符
"T"为转换条件,如
"A": {
    ["+", "T", "A"],
    ["e"]
}
代表 A->+TA|e
(e代表空)
"VT"为终止符
"VN"为非终止符
'''
grammer = {
    "SS": 'E',
    "T": {
        "E": {
            0: ["T", "A"]
        },
        "A": {
            0: ["+", "T", "A"],
            1: ["∑"]
        },
        "T": {
            0: ["F", "B"]
        },
        "B": {
            0: ["*", "F", "B"],
            1: ["∑"]
        },
        "F": {
            0: ["(", "E", ")"],
            1: ["i"]
        }
    },
    "VN": ["E", "A", "T", "B", "F"],
    "VT": ["i", "+", "*", "(", ")", "#"],
    "NN": ["∑"]
}

# 书中习题2的文法,在这里测试程序的通用性
grammer_t = {
    "SS": 'E',
    "T": {
        "E": {
            0: ["T", "A"]
        },
        "A": {
            0: ["+", "E"],
            1: ["∑"]
        },
        "T": {
            0: ["F", "B"]
        },
        "B": {
            0: ["T"],
            1: ["∑"]
        },
        "F": {
            0: ["P", "C"],
        },
        "C": {
            0: ["*", "C"],
            1: ["∑"]
        },
        "P": {
            0: ["(", "E", ")"],
            1: ["a"],
            2: ["b"],
            3: ["^"]

        }
    },
    "VN": ["E", "A", "T", "B", "F", "C", "P"],
    "VT": ["+", "*", "(", ")", "^", "a", "b", "#"],
    "NN": ["∑"]
}

fllow_set = {}
first_set = {}
ll1_table = {}


def first(grammer, state):  # 计算一个 state 的 first 集合
    ans = set()
    if state in grammer["VT"]:
        return {state}
    next_dict = grammer["T"][state]
    for key in next_dict:
        next_list = next_dict[key]
        for x in next_list:
            if x in grammer["VT"]:
                ans.add(x)
                break
            elif x in grammer["NN"]:
                ans.add(x)
                break
            else:
                res = first(grammer, x)
                if grammer["NN"][0] not in res:
                    # 这代表没有办法自动匹配,
                    # 如X->Y1Y2..如果First(Y1)没有 NN
                    # 那么没有办法匹配 Y2 的First集,break
                    ans = ans.union(res)
                    break
                else:
                    ans = ans.union(res.remove(grammer["NN"][0]))
                    # 注意这里就没有break了
    return ans


def follow(grammer, fllow_set):
    nums = [len(fllow_set[x]) for x in fllow_set]
    for key in grammer["T"]:
        nex_dict = grammer["T"][key]
        for seq in nex_dict:
            nex_list = nex_dict[seq]
            length = len(nex_list)
            for x in range(length - 1):
                char = nex_list[x]
                if char in grammer["VN"]:
                    loc = x + 1
                    res = first(grammer, nex_list[loc])
                    if grammer["NN"][0] in res:
                        fllow_set[char] = fllow_set[char].union(fllow_set[key])
                        res.remove(grammer["NN"][0])
                    fllow_set[char] = fllow_set[char].union(res)
            x = length - 1
            if nex_list[x] in grammer["VN"]:
                fllow_set[nex_list[x]] = fllow_set[nex_list[x]].union(
                    fllow_set[key])
    n_nums = [len(fllow_set[x]) for x in fllow_set]
    if n_nums == nums:
        return fllow_set
    else:
        return follow(grammer, fllow_set)


def ll1(grammer, first_set, fllow_set, ll1_table):
    vt_list = grammer["VT"]
    tran = grammer["T"]
    for vn in tran:
        nex_dict = tran[vn]
        res = first_set[vn]
        if grammer["NN"][0] in res:
            fow = fllow_set[vn]
            for a in fow:
                if a in grammer["VT"]:
                    ll1_table[vn][a] = vn + "->" + "∑"
            res.remove(grammer["NN"][0])
        for a in res:
            express = ""
            for key in nex_dict:
                nex_list = nex_dict[key]
                for x in nex_list:
                    if x in grammer["VN"]:
                        res = first_set[x]
                        if a in res:
                            express = "".join(nex_list)
                            break
                    else:
                        if x == a:
                            express = "".join(nex_list)
                            break
            ll1_table[vn][a] = vn + "->"+express

    return ll1_table


if __name__ == "__main__":
    grammer = grammer_t # 记得修改呦!自己用的时候~
    for x in grammer["VN"]:
        fllow_set[x] = set()
    fllow_set[grammer["SS"]].add("#")
    fllow_set = follow(grammer, fllow_set)
    for x in grammer["VN"]:
        first_set[x] = first(grammer, x)
        ll1_table[x] = {}
        for y in grammer["VT"]:
            ll1_table[x][y] = "error"
    ll1_table = ll1(grammer, first_set, fllow_set, ll1_table)
    print('=======================')
    print("first set")
    print('=======================')
    for key in grammer["VN"]:
        print(key, first(grammer, key))
    print('=======================')
    print("fllow set")
    print('=======================')
    for key in fllow_set:
        print(key, fllow_set[key])
    print('=======================')
    print("ll1 table")
    print("VN\VT", end="\t")
    for x in grammer["VT"]:
        print(x, end="\t")
    print()
    for vn in ll1_table:
        nex_dict = ll1_table[vn]
        print(vn, end="\t")
        for key in nex_dict:
            print(nex_dict[key], end="\t")
        print()
    print('=======================')

在这里grammer是那个简单的文法,而grammer_t如程序中写的是一个习题的文法,话说我自己写出来这个程序能求对这些玩意,我当时手算就不会了,结果靠自己的程序来跑出来结果 😂

结尾

这样就完成了这么一个作业,写的本菜鸡还是废了点时间,至于说开头发的牢骚,诶,只能继续生存下去了,似乎还想继续上学,所以应该会去考研,真是鸡儿难,愿上帝庇佑我。