词法解析re2c实例

3 minute read

编译阶段,PHP 分别使用 re2c 和 bison 来完成词法分析和语法分析,并生成抽象语法树(AST)。re2c:词法分析器,将输入分割为一个个有意义的词块,称为 token。bison:语法分析器,确定词法分析器分割出的 token 是如何彼此关联的。

re2c官网地址:http://re2c.org

实例1 创建文件re2c_numbers.l

#include <stdio.h>

/* Kind of numeric literal reported by lex(). */
typedef enum {
    ERR, /* input is not a recognized number */
    BIN, /* binary */
    OCT, /* octal */
    DEC, /* decimal */
    HEX  /* hexadecimal */
} NUM_T;

/*
 * Classify the NUL-terminated string starting at YYCURSOR as a binary,
 * octal, decimal or hexadecimal integer literal.
 *
 * The body is a re2c specification rather than plain C; running re2c on
 * this file replaces the special comment with generated matching code.
 *
 * Rules, as written in the specification:
 *   - bin: "0b" prefix (single-quoted, so "0B" is accepted too) followed
 *          by one or more of 0/1;
 *   - oct: a "0" followed by zero or more digits 0-7, so a lone "0" is
 *          octal;
 *   - dec: a digit 1-9 followed by zero or more digits 0-9;
 *   - hex: "0x" prefix (also "0X") followed by one or more hex digits.
 * Every rule must be followed by `end` (the NUL byte), so the whole string
 * has to match. Anything else falls through to the `*` rule.
 *
 * Returns BIN, OCT, DEC or HEX for a match, ERR otherwise.
 */
static NUM_T lex(const char *YYCURSOR)
{
    /* Backtrack position; written and read only by the generated code. */
    const char *YYMARKER;
    /*!re2c
        re2c:define:YYCTYPE = char;
        re2c:yyfill:enable = 0;

        end = "\x00";
        bin = '0b' [01]+;
        oct = "0" [0-7]*;
        dec = [1-9][0-9]*;
        hex = '0x' [0-9a-fA-F]+;

        *       { return ERR; }
        bin end { return BIN; }
        oct end { return OCT; }
        dec end { return DEC; }
        hex end { return HEX; }
    */
}

/*
 * Run lex() on every command-line argument and print the name of the
 * detected number format, one line per argument.
 */
int main(int argc, char **argv)
{
    int idx = 1;

    while (idx < argc) {
        const int kind = lex(argv[idx]);

        if (kind == ERR) {
            printf("error\n");
        } else if (kind == BIN) {
            printf("binary\n");
        } else if (kind == OCT) {
            printf("octal\n");
        } else if (kind == DEC) {
            printf("decimal\n");
        } else if (kind == HEX) {
            printf("hexadecimal\n");
        }
        ++idx;
    }
    return 0;
}

执行命令 re2c re2c_numbers.l -o re2c_numbers.c 会生成文件re2c_numbers.c,查看内容会发现原文件中注释里的re2c规则已经被替换成相应的c代码。(注意:下面的输出生成时使用的文件名是 re2c_test.l / re2c_test.c,因此 #line 指令中的文件名与上述命令不同。)re2c_numbers.c内容如下

/* Generated by re2c 0.16 on Tue Mar  5 04:32:31 2019 */
#line 1 "re2c_test.l"
#include <stdio.h>

/* Kind of numeric literal reported by lex(). */
typedef enum {
    ERR, /* input is not a recognized number */
    BIN, /* binary */
    OCT, /* octal */
    DEC, /* decimal */
    HEX  /* hexadecimal */
} NUM_T;

/*
 * re2c-generated matcher: classify the NUL-terminated string at YYCURSOR
 * as BIN, OCT, DEC or HEX, or return ERR when it matches none of them.
 *
 * The code is a goto-based state machine. Each yyN label is one state,
 * yych holds the character being examined, and YYMARKER remembers the
 * position to fall back to when a longer match attempt fails.
 */
static NUM_T lex(const char *YYCURSOR)
{
    const char *YYMARKER;

#line 12 "re2c_test.c"
{
	char yych;
	/* Initial state: dispatch on the first character. */
	yych = *YYCURSOR;
	switch (yych) {
	case '0':	goto yy4;
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':	goto yy5;
	default:	goto yy2;
	}
/* yy2: first character cannot start a number; consume it and fail. */
yy2:
	++YYCURSOR;
/* yy3: shared action of the default `*` rule. */
yy3:
#line 18 "re2c_test.l"
	{ return ERR; }
#line 34 "re2c_test.c"
/*
 * yy4: a leading '0' was seen. Save the position after it in YYMARKER,
 * then decide between a lone "0" (octal), more octal digits, or a
 * "0b"/"0x" prefix.
 */
yy4:
	yych = *(YYMARKER = ++YYCURSOR);
	switch (yych) {
	case 0x00:	goto yy6;
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':	goto yy8;
	case 'B':
	case 'b':	goto yy11;
	case 'X':
	case 'x':	goto yy12;
	default:	goto yy3;
	}
/*
 * yy5: a leading digit 1-9 was seen. Save the position after it in
 * YYMARKER and continue as a decimal number.
 */
yy5:
	yych = *(YYMARKER = ++YYCURSOR);
	switch (yych) {
	case 0x00:	goto yy13;
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':	goto yy15;
	default:	goto yy3;
	}
/* yy6: octal digits followed by the terminating NUL; consume the NUL. */
yy6:
	++YYCURSOR;
#line 20 "re2c_test.l"
	{ return OCT; }
#line 73 "re2c_test.c"
/* yy8: loop over further octal digits until NUL or an invalid character. */
yy8:
	++YYCURSOR;
	yych = *YYCURSOR;
	switch (yych) {
	case 0x00:	goto yy6;
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':	goto yy8;
	default:	goto yy10;
	}
/* yy10: match attempt failed; rewind to the saved position and report ERR. */
yy10:
	YYCURSOR = YYMARKER;
	goto yy3;
/*
 * yy11: "0b"/"0B" prefix seen. At least one digit must follow, so a NUL
 * (or any value <= 0) right after the prefix is rejected.
 */
yy11:
	yych = *++YYCURSOR;
	if (yych <= 0x00) goto yy10;
	goto yy18;
/* yy12: "0x"/"0X" prefix seen; same at-least-one-digit check as yy11. */
yy12:
	yych = *++YYCURSOR;
	if (yych <= 0x00) goto yy10;
	goto yy20;
/* yy13: decimal digits followed by the terminating NUL; consume the NUL. */
yy13:
	++YYCURSOR;
#line 21 "re2c_test.l"
	{ return DEC; }
#line 104 "re2c_test.c"
/* yy15: loop over further decimal digits until NUL or an invalid character. */
yy15:
	++YYCURSOR;
	yych = *YYCURSOR;
	switch (yych) {
	case 0x00:	goto yy13;
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':	goto yy15;
	default:	goto yy10;
	}
/* yy17: advance to the next character of a binary literal. */
yy17:
	++YYCURSOR;
	yych = *YYCURSOR;
/* yy18: check the current character of a binary literal. */
yy18:
	switch (yych) {
	case 0x00:	goto yy21;
	case '0':
	case '1':	goto yy17;
	default:	goto yy10;
	}
/* yy19: advance to the next character of a hexadecimal literal. */
yy19:
	++YYCURSOR;
	yych = *YYCURSOR;
/* yy20: check the current character of a hexadecimal literal. */
yy20:
	switch (yych) {
	case 0x00:	goto yy23;
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
	case 'A':
	case 'B':
	case 'C':
	case 'D':
	case 'E':
	case 'F':
	case 'a':
	case 'b':
	case 'c':
	case 'd':
	case 'e':
	case 'f':	goto yy19;
	default:	goto yy10;
	}
/* yy21: binary digits followed by the terminating NUL; consume the NUL. */
yy21:
	++YYCURSOR;
#line 19 "re2c_test.l"
	{ return BIN; }
#line 166 "re2c_test.c"
/* yy23: hex digits followed by the terminating NUL; consume the NUL. */
yy23:
	++YYCURSOR;
#line 22 "re2c_test.l"
	{ return HEX; }
#line 171 "re2c_test.c"
}
#line 23 "re2c_test.l"

}

/*
 * Run lex() on every command-line argument and print the name of the
 * detected number format, one line per argument.
 */
int main(int argc, char **argv)
{
    int idx = 1;

    while (idx < argc) {
        const int kind = lex(argv[idx]);

        if (kind == ERR) {
            printf("error\n");
        } else if (kind == BIN) {
            printf("binary\n");
        } else if (kind == OCT) {
            printf("octal\n");
        } else if (kind == DEC) {
            printf("decimal\n");
        } else if (kind == HEX) {
            printf("hexadecimal\n");
        }
        ++idx;
    }
    return 0;
}

编译re2c生成的c代码文件,生成可执行文件(a.out)并执行,可以看到返回结果,判定输入内容为十进制

gcc re2c_numbers.c
./a.out 2323
decimal

Updated: