2020-05-15

Toy-Browser-...

写在前面

  • implementation of a toy-browser 🙆
  • DAY1,我们已经完成 HTTP相关解析,现在我们可以写 HTML 的解析啦,开不开心!😝
  • HTML 的解析

实践过程

第一步:拆分文件

  • 为了方便文件管理,我们把parse单独拆到文件中
  • parser 接受 HTML 文本作为参数,返回一棵 DOM 树
  • server.js

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    const server = http.createServer((req, res) => {

    ... some code

    res.end(
    `<html maaa=a >
    <head>
    <style>
    body div #myid{
    width:100px;
    background-color: #ff5000;
    }
    body div img{
    width:30px;
    background-color: #ff1111;
    }
    </style>
    </head>
    <body>
    <div>
    <img id="myid"/>
    <img />
    </div>
    </body>
    </html>`);
    });

    server.listen(8088);
  • client.js

    1
    2
    3
    4
    5
    6
    7
    8
    9
    ... some code

    void async function () {

    ... some code

    let dom = parser.parseHTML(response.body)

    }()
  • parseHTML1.js

1
2
3
4
5
6
7
// 拆分文件

// Step 1 placeholder: the parser entry point only echoes the HTML text it
// receives to the console; it returns nothing yet.
module.exports.parseHTML = function parseHTML(html) {
  console.log(html)
}
  • 运行结果
    • 运行结果

第二步:创建状态机

  • 我们用 FSM 来实现 HTML 的分析
  • 在 HTML 标准中,已经规定了 HTML 的状态
  • Toy-Browser 只挑选其中的一部分状态,完成一个最简版本
  • parseHTML2.js

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    // 初始化 FSM - Finite State Machine

    const EOF = Symbol("EOF")

    // Data state (skeleton). No transitions exist yet in this step, so the
    // machine simply stays in this state for every input, EOF included.
    // Returning a state function keeps the driver loop, which calls the
    // returned value on the next character, from crashing.
    function data(char) {
      return data
    }

    // Drives the state machine: feeds every character of `html` to the current
    // state function, then signals end of input with the EOF symbol.
    // Each state function returns the state to use for the next character.
    module.exports.parseHTML = function parseHTML(html){

      let state = data

      for (const char of html) {
        // The loop variable is `char`; the original passed an undefined `c`.
        state = state(char)
      }

      state = state(EOF)

    }

第三步:解析标签

  • 主要的标签有:开始标签,结束标签和自封闭标签

  • 在这一步我们暂时忽略属性

  • 【未完:状态图分析】

  • parseHTML3.js

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    // 解析标签

    const EOF = Symbol("EOF")

    // Data state: "<" opens a tag, EOF ends the machine (returns undefined),
    // and any other character keeps the machine in the data state.
    function data(char) {
      switch (char) {
        case "<":
          return tagOpen
        case EOF:
          return undefined
        default:
          return data
      }
    }


    // 1. 开始标签
    // 2. 结束标签
    // 3. 自封闭标签
    // Tag open state: the previous character was "<".
    //   "/"    -> end tag
    //   letter -> start tag; the letter is reconsumed by tagName
    //   other  -> not a tag after all; the character (or EOF) is reconsumed by data
    // The typeof guard is needed because EOF is a Symbol and cannot be matched
    // against a regular expression.
    function tagOpen(char) {
      if (char === "/") { // end tag
        return endTagOpen
      }
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) { // start tag
        return tagName(char)
      }
      return data(char)
    }


    // End tag open state: the input so far is "</".
    //   letter -> the letter is reconsumed by tagName
    //   EOF    -> reconsumed by data so the machine terminates there
    //   other  -> (including ">") no tag name follows; resume in data.
    //             Simplification: a real tokenizer would enter a bogus-comment state.
    function endTagOpen(char) {
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) {
        return tagName(char)
      }
      if (char === EOF) {
        return data(char)
      }
      return data
    }


    // Tag name state (this step does not record the name yet).
    //   whitespace -> beforeAttributeName
    //   "/"        -> selfClosingStartTag
    //   ">"        -> data
    //   EOF        -> stop (returns undefined); checked first because the EOF
    //                 Symbol cannot be tested against a regular expression
    //   other      -> stay in tagName
    function tagName(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === ">") {
        return data
      }
      return tagName
    }


    // Before attribute name state. Attributes are ignored in this step, so
    // everything up to the closing ">" is skipped.
    //   ">"   -> data
    //   EOF   -> stop (returns undefined)
    //   other -> stay in beforeAttributeName
    function beforeAttributeName(char) {
      if (char === EOF) {
        return
      }
      return char === ">" ? data : beforeAttributeName
    }


    // Self-closing start tag state: the previous character was "/" inside a tag.
    //   ">"   -> tag finished, back to data
    //   EOF   -> stop (returns undefined). EOF is a Symbol, so it is compared
    //            with the EOF constant, not with the string "EOF".
    //   other -> the "/" was stray; reconsume the character in beforeAttributeName
    function selfClosingStartTag(char) {
      if (char === EOF) {
        return
      }
      if (char === ">") {
        return data
      }
      return beforeAttributeName(char)
    }

    // Runs the tokenizer over `html`: starting from the data state, each
    // character is passed to the current state function, whose return value is
    // the next state. The EOF symbol is fed last.
    module.exports.parseHTML = function parseHTML(html){
      const lastState = [...html].reduce((current, ch) => current(ch), data)
      lastState(EOF)
    }

第四步:创建元素

  • 在状态中,除了状态迁移,我们还会要加入业务逻辑

  • 我们在标签结束状态提交标签 token

  • parseHTML4.js

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    // emitToken 创建元素

    let currentToken = null

    // Token sink for this step: prints every token except text tokens.
    function emit(token) {
      if (token.type === "text") {
        return
      }
      console.log(token)
    }

    const EOF = Symbol("EOF")

    // Data state for the "ignore attributes" step.
    //   "<"  -> tagOpen
    //   "/"  -> selfClosingStartTag
    //   ">"  -> emit the tag currently being built, stay in data
    //   EOF  -> emit the EOF token and stop (returns undefined)
    // NOTE(review): the "/" and ">" branches look like a workaround for tags
    // that carry attributes, because beforeAttributeName in this step can fall
    // back to data before the tag is closed. Confirm before removing them.
    function data(char) {
      if (char === "<") {
        return tagOpen
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === ">") {
        // A ">" seen before any tag was opened has no token to emit;
        // emitting null would throw inside emit().
        if (currentToken !== null) {
          emit(currentToken)
        }
        return data
      }
      if (char === EOF) {
        emit({
          type: "EOF"
        })
        return
      }
      return data
    }


    // 1. 开始标签
    // 2. 结束标签
    // 3. 自封闭标签
    // Tag open state: the previous character was "<".
    //   "/"    -> end tag
    //   letter -> begin a startTag token; the letter is reconsumed by tagName
    //   other  -> not a tag after all; the character (or EOF) is reconsumed by data
    // The typeof guard is needed because EOF is a Symbol and cannot be matched
    // against a regular expression.
    function tagOpen(char) {
      if (char === "/") { // end tag
        return endTagOpen
      }
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) { // start tag
        currentToken = {
          type: "startTag",
          tagName: ""
        }
        return tagName(char)
      }
      return data(char)
    }


    // End tag open state: the input so far is "</".
    //   letter -> begin an endTag token; the letter is reconsumed by tagName
    //   EOF    -> reconsumed by data, which handles end of input
    //   other  -> (including ">") no tag name follows; resume in data.
    //             Simplification: a real tokenizer would enter a bogus-comment state.
    function endTagOpen(char) {
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) {
        currentToken = {
          type: "endTag",
          tagName: ""
        }
        return tagName(char)
      }
      if (char === EOF) {
        return data(char)
      }
      return data
    }


    // Tag name state: accumulates the lower-cased tag name into currentToken.
    //   whitespace -> reconsumed by beforeAttributeName
    //   "/"        -> selfClosingStartTag
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined); checked first because the EOF
    //                 Symbol cannot be tested against a regular expression
    // Every other character belongs to the name, so digits and dashes are kept
    // (e.g. "h1", "my-element") instead of being silently dropped.
    function tagName(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName(char)
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      currentToken.tagName += char.toLowerCase()
      return tagName
    }


    // Before attribute name state. Attributes are ignored in this step, so
    // everything between the tag name and the end of the tag is skipped.
    //   ">"   -> the tag is complete: emit currentToken, back to data
    //   "/"   -> selfClosingStartTag
    //   EOF   -> stop (returns undefined)
    //   other -> (whitespace, "=", attribute text) stay in this state
    // Limitation of this step: a ">" or "/" inside an attribute value is
    // indistinguishable from the end of the tag.
    function beforeAttributeName(char) {
      if (char === EOF) {
        return
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      return beforeAttributeName
    }


    // Self-closing start tag state: the previous character was "/" inside a tag.
    //   ">"   -> mark currentToken as self-closing, emit it, back to data
    //   EOF   -> stop (returns undefined). EOF is a Symbol, so it is compared
    //            with the EOF constant, not with the string "EOF".
    //   other -> the "/" was stray; reconsume the character in beforeAttributeName
    function selfClosingStartTag(char) {
      if (char === EOF) {
        return
      }
      if (char === ">") {
        currentToken.isSelfClosing = true
        currentToken.type = "selfClosingTag"
        emit(currentToken)
        return data
      }
      return beforeAttributeName(char)
    }

    // Runs the tokenizer over `html`: starting from the data state, each
    // character is passed to the current state function, whose return value is
    // the next state. The EOF symbol is fed last.
    module.exports.parseHTML = function parseHTML(html){
      const lastState = [...html].reduce((current, ch) => current(ch), data)
      lastState(EOF)
    }
  • 运行结果

    • 第四步运行结果

第五步:处理属性

  • 属性分为单引号、双引号、无引号三种写法,因此需要较多状态处理

  • 处理属性的方式跟标签类似

  • 属性结束时,我们把属性加到标签 token 上

  • parseHTML5.js

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    // 处理属性 attribute

    let currentToken = null
    let currentAttribute = null

    // Token sink for this step: prints every token except text tokens.
    function emit(token) {
      if (token.type === "text") {
        return
      }
      console.log(token)
    }

    const EOF = Symbol("EOF")

    // Data state: plain text between tags.
    //   "<"   -> tagOpen
    //   EOF   -> emit the EOF token and stop (returns undefined)
    //   other -> emit the character as a one-character text token, stay in data
    function data(char) {
      switch (char) {
        case "<":
          return tagOpen
        case EOF:
          emit({ type: "EOF" })
          return undefined
        default:
          emit({ type: "text", content: char })
          return data
      }
    }


    // 1. 开始标签
    // 2. 结束标签
    // 3. 自封闭标签
    // Tag open state: the previous character was "<".
    //   "/"    -> end tag
    //   letter -> begin a startTag token; the letter is reconsumed by tagName
    //   other  -> the "<" was literal text: emit it as a text token and
    //             reconsume the character (or EOF) in the data state
    // The typeof guard is needed because EOF is a Symbol and cannot be matched
    // against a regular expression.
    function tagOpen(char) {
      if (char === "/") { // end tag
        return endTagOpen
      }
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) { // start tag
        currentToken = {
          type: "startTag",
          tagName: ""
        }
        return tagName(char)
      }
      emit({
        type: "text",
        content: "<"
      })
      return data(char)
    }


    // End tag open state: the input so far is "</".
    //   letter -> begin an endTag token; the letter is reconsumed by tagName
    //   EOF    -> reconsumed by data, which handles end of input
    //   other  -> (including ">") no tag name follows; resume in data.
    //             Simplification: a real tokenizer would enter a bogus-comment state.
    function endTagOpen(char) {
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) {
        currentToken = {
          type: "endTag",
          tagName: ""
        }
        return tagName(char)
      }
      if (char === EOF) {
        return data(char)
      }
      return data
    }


    // Tag name state: accumulates the lower-cased tag name into currentToken.
    //   whitespace -> reconsumed by beforeAttributeName
    //   "/"        -> selfClosingStartTag
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined); checked first because the EOF
    //                 Symbol cannot be tested against a regular expression
    // Every other character belongs to the name, so digits and dashes are kept
    // (e.g. "h1", "my-element") instead of being silently dropped.
    function tagName(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName(char)
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      currentToken.tagName += char.toLowerCase()
      return tagName
    }


    // Before attribute name state: skips whitespace in front of an attribute.
    //   ">", "/", EOF -> no attribute follows; reconsumed by afterAttributeName
    //   whitespace    -> stay
    //   "="           -> stray equals sign: start an attribute whose name begins with "="
    //   other         -> start a fresh attribute; the character is reconsumed by attributeName
    // EOF is tested before the regular expression because a Symbol cannot be
    // matched against one.
    function beforeAttributeName(char) {
      if (char === EOF || char === ">" || char === "/") {
        return afterAttributeName(char)
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName
      }
      currentAttribute = {
        name: "",
        value: ""
      }
      if (char === "=") {
        currentAttribute.name = char
        return attributeName
      }
      return attributeName(char)
    }

    // After attribute name state: an attribute name has ended (or no attribute
    // was present at all).
    //   whitespace -> stay; more attributes or "=" may still follow
    //   "/"        -> selfClosingStartTag
    //   "="        -> beforeAttributeValue
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined)
    //   other      -> a new attribute starts; the character is reconsumed by attributeName
    // Previously whitespace fell into the "emit" branch, which closed the tag
    // early and turned the remaining attributes into text.
    function afterAttributeName(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return afterAttributeName
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === "=") {
        return beforeAttributeValue
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      currentAttribute = {
        name: "",
        value: ""
      }
      return attributeName(char)
    }

    // Attribute name state: accumulates currentAttribute.name.
    //   whitespace, "/", ">" -> the name is finished; reconsumed by afterAttributeName
    //   EOF                  -> reconsumed by afterAttributeName
    //   "="                  -> beforeAttributeValue
    //   NUL                  -> recorded as U+FFFD
    //   '"', "'", "<"        -> ignored (not added to the name)
    //   other                -> appended to the name
    function attributeName(char) {
      if (char === EOF) {
        return afterAttributeName(char)
      }
      if (/^[\t\n\f ]$/.test(char) || char === "/" || char === ">") {
        // Record the attribute with its (still empty) value so that valueless
        // attributes such as `disabled` are not lost; a value parsed later
        // for the same name simply overwrites this entry.
        if (currentAttribute.name !== "") {
          currentToken[currentAttribute.name] = currentAttribute.value
        }
        return afterAttributeName(char)
      }
      if (char === "=") {
        return beforeAttributeValue
      }
      if (char === "\u0000") {
        currentAttribute.name += "\uFFFD"
        return attributeName
      }
      if (char === "\"" || char === "'" || char === "<") {
        return attributeName
      }
      currentAttribute.name += char
      return attributeName
    }

    // Before attribute value state: the previous significant character was "=".
    //   whitespace -> stay
    //   '"' / "'"  -> quoted value states
    //   ">"        -> value is missing: store the empty value, emit the tag, back to data
    //   "/"        -> first character of an unquoted value (e.g. href=/path)
    //   EOF        -> stop (returns undefined)
    //   other      -> unquoted value; the character is reconsumed there
    // Previously ">" and "/" were swallowed by the whitespace branch, so
    // `<a href=>` never closed and a leading "/" of a value was lost.
    function beforeAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeValue
      }
      if (char === "\"") {
        return doubleQuotedAttributeValue
      }
      if (char === "'") {
        return singleQuotedAttributeValue
      }
      if (char === ">") {
        currentToken[currentAttribute.name] = currentAttribute.value
        emit(currentToken)
        return data
      }
      if (char === "/") {
        currentAttribute.value += char
        return UnquotedAttributeValue
      }
      return UnquotedAttributeValue(char)
    }

    // Double-quoted attribute value state.
    //   '"'   -> value finished: store it on currentToken, go to afterQuotedAttributeValue
    //   EOF   -> stop (returns undefined)
    //   NUL   -> recorded as U+FFFD so the machine keeps a valid next state
    //   other -> appended to the value
    function doubleQuotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (char === "\"") {
        currentToken[currentAttribute.name] = currentAttribute.value
        return afterQuotedAttributeValue
      }
      currentAttribute.value += char === "\u0000" ? "\uFFFD" : char
      return doubleQuotedAttributeValue
    }

    // Single-quoted attribute value state.
    //   "'"   -> value finished: store it on currentToken, go to afterQuotedAttributeValue
    //   EOF   -> stop (returns undefined)
    //   NUL   -> recorded as U+FFFD so the machine keeps a valid next state
    //   other -> appended to the value
    function singleQuotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (char === "'") {
        currentToken[currentAttribute.name] = currentAttribute.value
        return afterQuotedAttributeValue
      }
      currentAttribute.value += char === "\u0000" ? "\uFFFD" : char
      return singleQuotedAttributeValue
    }

    // After quoted attribute value state: a closing quote was just consumed
    // (the quoted-value states have already stored the attribute on the token).
    //   whitespace -> beforeAttributeName
    //   "/"        -> selfClosingStartTag
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined)
    //   other      -> whitespace between attributes is missing (`a="1"b="2"`);
    //                 reconsume the character in beforeAttributeName
    function afterQuotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      return beforeAttributeName(char)
    }

    // Unquoted attribute value state.
    //   whitespace -> store the attribute, go to beforeAttributeName
    //   "/"        -> store the attribute, go to selfClosingStartTag
    //   ">"        -> store the attribute, emit currentToken, back to data
    //   EOF        -> stop (returns undefined)
    //   NUL        -> recorded as U+FFFD
    //   other      -> appended to the value; this includes '"', "'", "<", "="
    //                 and "`", which are unusual here but still part of the value
    function UnquotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        currentToken[currentAttribute.name] = currentAttribute.value
        return beforeAttributeName
      }
      if (char === "/") {
        currentToken[currentAttribute.name] = currentAttribute.value
        return selfClosingStartTag
      }
      if (char === ">") {
        currentToken[currentAttribute.name] = currentAttribute.value
        emit(currentToken)
        return data
      }
      currentAttribute.value += char === "\u0000" ? "\uFFFD" : char
      return UnquotedAttributeValue
    }


    // Self-closing start tag state: the previous character was "/" inside a tag.
    //   ">"   -> mark currentToken as self-closing, emit it, back to data
    //   EOF   -> stop (returns undefined). EOF is a Symbol, so it is compared
    //            with the EOF constant, not with the string "EOF".
    //   other -> the "/" was stray; reconsume the character in beforeAttributeName
    function selfClosingStartTag(char) {
      if (char === EOF) {
        return
      }
      if (char === ">") {
        currentToken.isSelfClosing = true
        currentToken.type = "selfClosingTag"
        emit(currentToken)
        return data
      }
      return beforeAttributeName(char)
    }

    // Runs the tokenizer over `html`: starting from the data state, each
    // character is passed to the current state function, whose return value is
    // the next state. The EOF symbol is fed last.
    module.exports.parseHTML = function parseHTML(html){
      const lastState = [...html].reduce((current, ch) => current(ch), data)
      lastState(EOF)
    }
  • 运行结果

    • 第五步运行结果

第六步:构建 DOM 树

  • 从标签创建 DOM 树的基本技巧是使用栈

  • 遇到开始标签时创建元素并入栈,遇到结束标签时出栈

  • 自封闭节点可视为入栈后立刻出栈

  • 任何元素的父元素是它入栈前的栈顶

  • parseHTML6.js

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    233
    234
    235
    236
    237
    238
    239
    240
    241
    242
    243
    244
    245
    246
    247
    248
    249
    250
    251
    252
    253
    254
    255
    256
    257
    258
    259
    260
    261
    262
    263
    264
    265
    266
    267
    268
    269
    270
    271
    // Build the DOM tree (constructTree)

    let currentToken = null
    let currentAttribute = null

    let stack = [{type: "document", children: []}]

    // Tree construction: consumes tokens and maintains the element stack.
    //   startTag -> create an element, attach it to the current stack top and,
    //               unless the token is self-closing, push it on the stack
    //   endTag   -> must match the stack top, which is then popped
    //   text     -> ignored in this step; any other token type is ignored too
    // Throws Error("Tag start end doesn't match") when an end tag does not
    // match the element on top of the stack.
    // NOTE(review): attributes are stored directly on the token object, so an
    // attribute literally named "type" or "tagName" collides with the token's
    // own fields. Fixing that needs a coordinated change in the tokenizer states.
    function emit(token) {
      if (token.type === "text") {
        return
      }

      const top = stack[stack.length - 1]

      if (token.type === "startTag") {
        const element = {
          type: "element",
          children: [],
          attributes: []
        }

        element.tagName = token.tagName

        for (const [name, value] of Object.entries(token)) {
          // isSelfClosing is tokenizer bookkeeping, not an attribute of the element.
          if (name !== "type" && name !== "tagName" && name !== "isSelfClosing") {
            element.attributes.push({ name, value })
          }
        }

        top.children.push(element)
        element.parent = top

        if (!token.isSelfClosing) {
          stack.push(element)
        }

        console.log('push', element)
      } else if (token.type === "endTag") {
        if (top.tagName !== token.tagName) {
          throw new Error("Tag start end doesn't match")
        }
        console.log('pop', stack.pop())
      }
    }

    const EOF = Symbol("EOF")

    // Data state: plain text between tags.
    //   "<"   -> tagOpen
    //   EOF   -> emit the EOF token and stop (returns undefined)
    //   other -> emit the character as a one-character text token, stay in data
    function data(char) {
      switch (char) {
        case "<":
          return tagOpen
        case EOF:
          emit({ type: "EOF" })
          return undefined
        default:
          emit({ type: "text", content: char })
          return data
      }
    }


    // 1. 开始标签
    // 2. 结束标签
    // 3. 自封闭标签
    // Tag open state: the previous character was "<".
    //   "/"    -> end tag
    //   letter -> begin a startTag token; the letter is reconsumed by tagName
    //   other  -> the "<" was literal text: emit it as a text token and
    //             reconsume the character (or EOF) in the data state
    // The typeof guard is needed because EOF is a Symbol and cannot be matched
    // against a regular expression.
    function tagOpen(char) {
      if (char === "/") { // end tag
        return endTagOpen
      }
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) { // start tag
        currentToken = {
          type: "startTag",
          tagName: ""
        }
        return tagName(char)
      }
      emit({
        type: "text",
        content: "<"
      })
      return data(char)
    }


    // End tag open state: the input so far is "</".
    //   letter -> begin an endTag token; the letter is reconsumed by tagName
    //   EOF    -> reconsumed by data, which handles end of input
    //   other  -> (including ">") no tag name follows; resume in data.
    //             Simplification: a real tokenizer would enter a bogus-comment state.
    function endTagOpen(char) {
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) {
        currentToken = {
          type: "endTag",
          tagName: ""
        }
        return tagName(char)
      }
      if (char === EOF) {
        return data(char)
      }
      return data
    }


    // Tag name state: accumulates the lower-cased tag name into currentToken.
    //   whitespace -> reconsumed by beforeAttributeName
    //   "/"        -> selfClosingStartTag
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined); checked first because the EOF
    //                 Symbol cannot be tested against a regular expression
    // Every other character belongs to the name, so digits and dashes are kept
    // (e.g. "h1", "my-element") instead of being silently dropped.
    function tagName(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName(char)
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      currentToken.tagName += char.toLowerCase()
      return tagName
    }


    // Before attribute name state: skips whitespace in front of an attribute.
    //   ">", "/", EOF -> no attribute follows; reconsumed by afterAttributeName
    //   whitespace    -> stay
    //   "="           -> stray equals sign: start an attribute whose name begins with "="
    //   other         -> start a fresh attribute; the character is reconsumed by attributeName
    // EOF is tested before the regular expression because a Symbol cannot be
    // matched against one.
    function beforeAttributeName(char) {
      if (char === EOF || char === ">" || char === "/") {
        return afterAttributeName(char)
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName
      }
      currentAttribute = {
        name: "",
        value: ""
      }
      if (char === "=") {
        currentAttribute.name = char
        return attributeName
      }
      return attributeName(char)
    }

    // After attribute name state: an attribute name has ended (or no attribute
    // was present at all).
    //   whitespace -> stay; more attributes or "=" may still follow
    //   "/"        -> selfClosingStartTag
    //   "="        -> beforeAttributeValue
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined)
    //   other      -> a new attribute starts; the character is reconsumed by attributeName
    // Previously whitespace fell into the "emit" branch, which closed the tag
    // early and turned the remaining attributes into text.
    function afterAttributeName(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return afterAttributeName
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === "=") {
        return beforeAttributeValue
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      currentAttribute = {
        name: "",
        value: ""
      }
      return attributeName(char)
    }

    // Attribute name state: accumulates currentAttribute.name.
    //   whitespace, "/", ">" -> the name is finished; reconsumed by afterAttributeName
    //   EOF                  -> reconsumed by afterAttributeName
    //   "="                  -> beforeAttributeValue
    //   NUL                  -> recorded as U+FFFD
    //   '"', "'", "<"        -> ignored (not added to the name)
    //   other                -> appended to the name
    function attributeName(char) {
      if (char === EOF) {
        return afterAttributeName(char)
      }
      if (/^[\t\n\f ]$/.test(char) || char === "/" || char === ">") {
        // Record the attribute with its (still empty) value so that valueless
        // attributes such as `disabled` are not lost; a value parsed later
        // for the same name simply overwrites this entry.
        if (currentAttribute.name !== "") {
          currentToken[currentAttribute.name] = currentAttribute.value
        }
        return afterAttributeName(char)
      }
      if (char === "=") {
        return beforeAttributeValue
      }
      if (char === "\u0000") {
        currentAttribute.name += "\uFFFD"
        return attributeName
      }
      if (char === "\"" || char === "'" || char === "<") {
        return attributeName
      }
      currentAttribute.name += char
      return attributeName
    }

    // Before attribute value state: the previous significant character was "=".
    //   whitespace -> stay
    //   '"' / "'"  -> quoted value states
    //   ">"        -> value is missing: store the empty value, emit the tag, back to data
    //   "/"        -> first character of an unquoted value (e.g. href=/path)
    //   EOF        -> stop (returns undefined)
    //   other      -> unquoted value; the character is reconsumed there
    // Previously ">" and "/" were swallowed by the whitespace branch, so
    // `<a href=>` never closed and a leading "/" of a value was lost.
    function beforeAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeValue
      }
      if (char === "\"") {
        return doubleQuotedAttributeValue
      }
      if (char === "'") {
        return singleQuotedAttributeValue
      }
      if (char === ">") {
        currentToken[currentAttribute.name] = currentAttribute.value
        emit(currentToken)
        return data
      }
      if (char === "/") {
        currentAttribute.value += char
        return UnquotedAttributeValue
      }
      return UnquotedAttributeValue(char)
    }

    // Double-quoted attribute value state.
    //   '"'   -> value finished: store it on currentToken, go to afterQuotedAttributeValue
    //   EOF   -> stop (returns undefined)
    //   NUL   -> recorded as U+FFFD so the machine keeps a valid next state
    //   other -> appended to the value
    function doubleQuotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (char === "\"") {
        currentToken[currentAttribute.name] = currentAttribute.value
        return afterQuotedAttributeValue
      }
      currentAttribute.value += char === "\u0000" ? "\uFFFD" : char
      return doubleQuotedAttributeValue
    }

    // Single-quoted attribute value state.
    //   "'"   -> value finished: store it on currentToken, go to afterQuotedAttributeValue
    //   EOF   -> stop (returns undefined)
    //   NUL   -> recorded as U+FFFD so the machine keeps a valid next state
    //   other -> appended to the value
    function singleQuotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (char === "'") {
        currentToken[currentAttribute.name] = currentAttribute.value
        return afterQuotedAttributeValue
      }
      currentAttribute.value += char === "\u0000" ? "\uFFFD" : char
      return singleQuotedAttributeValue
    }

    // After quoted attribute value state: a closing quote was just consumed
    // (the quoted-value states have already stored the attribute on the token).
    //   whitespace -> beforeAttributeName
    //   "/"        -> selfClosingStartTag
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined)
    //   other      -> whitespace between attributes is missing (`a="1"b="2"`);
    //                 reconsume the character in beforeAttributeName
    function afterQuotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      return beforeAttributeName(char)
    }

    // Unquoted attribute value state.
    //   whitespace -> store the attribute, go to beforeAttributeName
    //   "/"        -> store the attribute, go to selfClosingStartTag
    //   ">"        -> store the attribute, emit currentToken, back to data
    //   EOF        -> stop (returns undefined)
    //   NUL        -> recorded as U+FFFD
    //   other      -> appended to the value; this includes '"', "'", "<", "="
    //                 and "`", which are unusual here but still part of the value
    function UnquotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        currentToken[currentAttribute.name] = currentAttribute.value
        return beforeAttributeName
      }
      if (char === "/") {
        currentToken[currentAttribute.name] = currentAttribute.value
        return selfClosingStartTag
      }
      if (char === ">") {
        currentToken[currentAttribute.name] = currentAttribute.value
        emit(currentToken)
        return data
      }
      currentAttribute.value += char === "\u0000" ? "\uFFFD" : char
      return UnquotedAttributeValue
    }


    // Self-closing start tag state: the previous character was "/" inside a tag.
    //   ">"   -> mark currentToken as self-closing, emit it, back to data
    //   EOF   -> stop (returns undefined). EOF is a Symbol, so it is compared
    //            with the EOF constant, not with the string "EOF".
    //   other -> the "/" was stray; reconsume the character in beforeAttributeName
    function selfClosingStartTag(char) {
      if (char === EOF) {
        return
      }
      if (char === ">") {
        currentToken.isSelfClosing = true
        emit(currentToken)
        return data
      }
      return beforeAttributeName(char)
    }

    // Parses `html` and returns the root node of the resulting tree
    // ({type: "document", children: [...]}).
    // The tokenizer and tree-builder state lives in module-level variables, so
    // it is reset on every call; otherwise a second call would append to the
    // tree built by the first one.
    module.exports.parseHTML = function parseHTML(html){

      currentToken = null
      currentAttribute = null
      stack = [{type: "document", children: []}]

      let state = data

      for (const char of html) {
        state = state(char)
      }

      state(EOF)

      console.log(stack)

      // The document node always stays at the bottom of the stack.
      return stack[0]
    }
  • 运行结果:这边我在每次对栈操作时(push/pop)时,添加了 console

    • 第六步运行结果
  • 这边推荐使用 vs-code debugger 使用方法

    • 第六步运行结果 vscode debugger

第七步:文本节点

  • 文本节点与自封闭标签处理类似

  • 多个文本节点需要合并

  • parseHTML7.js

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    233
    234
    235
    236
    237
    238
    239
    240
    241
    242
    243
    244
    245
    246
    247
    248
    249
    250
    251
    252
    253
    254
    255
    256
    257
    258
    259
    260
    261
    262
    263
    264
    265
    266
    267
    268
    269
    270
    271
    272
    273
    274
    275
    276
    277
    278
    279
    // 处理文本节点 combineText

    let currentToken = null
    let currentAttribute = null
    let currentTextNode = null

    let stack = [{type: "document", children: []}]

    // Tree construction: consumes tokens and maintains the element stack.
    //   startTag -> create an element, attach it to the current stack top and,
    //               unless the token is self-closing, push it on the stack
    //   endTag   -> must match the stack top, which is then popped
    //   text     -> appended to the current text node; consecutive text tokens
    //               are merged into one node, and any tag ends the current run
    // Other token types (e.g. EOF) are ignored.
    // Throws Error("Tag start end doesn't match") when an end tag does not
    // match the element on top of the stack.
    // NOTE(review): attributes are stored directly on the token object, so an
    // attribute literally named "type" or "tagName" collides with the token's
    // own fields. Fixing that needs a coordinated change in the tokenizer states.
    function emit(token) {

      const top = stack[stack.length - 1]

      if (token.type === "startTag") {
        const element = {
          type: "element",
          children: [],
          attributes: []
        }

        element.tagName = token.tagName

        for (const [name, value] of Object.entries(token)) {
          // isSelfClosing is tokenizer bookkeeping, not an attribute of the element.
          if (name !== "type" && name !== "tagName" && name !== "isSelfClosing") {
            element.attributes.push({ name, value })
          }
        }

        top.children.push(element)
        element.parent = top

        if (!token.isSelfClosing) {
          stack.push(element)
        }

        currentTextNode = null
      } else if (token.type === "endTag") {
        if (top.tagName !== token.tagName) {
          throw new Error("Tag start end doesn't match")
        }
        stack.pop()
        currentTextNode = null
      } else if (token.type === "text") {
        if (currentTextNode == null) {
          currentTextNode = {
            type: "text",
            content: ""
          }
          top.children.push(currentTextNode)
        }
        currentTextNode.content += token.content
      }
    }

    const EOF = Symbol("EOF")

    // Data state: plain text between tags.
    //   "<"   -> tagOpen
    //   EOF   -> emit the EOF token and stop (returns undefined)
    //   other -> emit the character as a one-character text token, stay in data
    function data(char) {
      switch (char) {
        case "<":
          return tagOpen
        case EOF:
          emit({ type: "EOF" })
          return undefined
        default:
          emit({ type: "text", content: char })
          return data
      }
    }


    // 1. 开始标签
    // 2. 结束标签
    // 3. 自封闭标签
    // Tag open state: the previous character was "<".
    //   "/"    -> end tag
    //   letter -> begin a startTag token; the letter is reconsumed by tagName
    //   other  -> the "<" was literal text: emit it as a text token and
    //             reconsume the character (or EOF) in the data state
    // The typeof guard is needed because EOF is a Symbol and cannot be matched
    // against a regular expression.
    function tagOpen(char) {
      if (char === "/") { // end tag
        return endTagOpen
      }
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) { // start tag
        currentToken = {
          type: "startTag",
          tagName: ""
        }
        return tagName(char)
      }
      emit({
        type: "text",
        content: "<"
      })
      return data(char)
    }


    // End tag open state: the input so far is "</".
    //   letter -> begin an endTag token; the letter is reconsumed by tagName
    //   EOF    -> reconsumed by data, which handles end of input
    //   other  -> (including ">") no tag name follows; resume in data.
    //             Simplification: a real tokenizer would enter a bogus-comment state.
    function endTagOpen(char) {
      if (typeof char === "string" && /^[a-zA-Z]$/.test(char)) {
        currentToken = {
          type: "endTag",
          tagName: ""
        }
        return tagName(char)
      }
      if (char === EOF) {
        return data(char)
      }
      return data
    }


    // Tag name state: accumulates the lower-cased tag name into currentToken.
    //   whitespace -> reconsumed by beforeAttributeName
    //   "/"        -> selfClosingStartTag
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined); checked first because the EOF
    //                 Symbol cannot be tested against a regular expression
    // Every other character belongs to the name, so digits and dashes are kept
    // (e.g. "h1", "my-element") instead of being silently dropped.
    function tagName(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName(char)
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      currentToken.tagName += char.toLowerCase()
      return tagName
    }


    // Before attribute name state: skips whitespace in front of an attribute.
    //   ">", "/", EOF -> no attribute follows; reconsumed by afterAttributeName
    //   whitespace    -> stay
    //   "="           -> stray equals sign: start an attribute whose name begins with "="
    //   other         -> start a fresh attribute; the character is reconsumed by attributeName
    // EOF is tested before the regular expression because a Symbol cannot be
    // matched against one.
    function beforeAttributeName(char) {
      if (char === EOF || char === ">" || char === "/") {
        return afterAttributeName(char)
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName
      }
      currentAttribute = {
        name: "",
        value: ""
      }
      if (char === "=") {
        currentAttribute.name = char
        return attributeName
      }
      return attributeName(char)
    }

    // After attribute name state: an attribute name has ended (or no attribute
    // was present at all).
    //   whitespace -> stay; more attributes or "=" may still follow
    //   "/"        -> selfClosingStartTag
    //   "="        -> beforeAttributeValue
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined)
    //   other      -> a new attribute starts; the character is reconsumed by attributeName
    // Previously whitespace fell into the "emit" branch, which closed the tag
    // early and turned the remaining attributes into text.
    function afterAttributeName(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return afterAttributeName
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === "=") {
        return beforeAttributeValue
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      currentAttribute = {
        name: "",
        value: ""
      }
      return attributeName(char)
    }

    // Attribute name state: accumulates currentAttribute.name.
    //   whitespace, "/", ">" -> the name is finished; reconsumed by afterAttributeName
    //   EOF                  -> reconsumed by afterAttributeName
    //   "="                  -> beforeAttributeValue
    //   NUL                  -> recorded as U+FFFD
    //   '"', "'", "<"        -> ignored (not added to the name)
    //   other                -> appended to the name
    function attributeName(char) {
      if (char === EOF) {
        return afterAttributeName(char)
      }
      if (/^[\t\n\f ]$/.test(char) || char === "/" || char === ">") {
        // Record the attribute with its (still empty) value so that valueless
        // attributes such as `disabled` are not lost; a value parsed later
        // for the same name simply overwrites this entry.
        if (currentAttribute.name !== "") {
          currentToken[currentAttribute.name] = currentAttribute.value
        }
        return afterAttributeName(char)
      }
      if (char === "=") {
        return beforeAttributeValue
      }
      if (char === "\u0000") {
        currentAttribute.name += "\uFFFD"
        return attributeName
      }
      if (char === "\"" || char === "'" || char === "<") {
        return attributeName
      }
      currentAttribute.name += char
      return attributeName
    }

    // Before attribute value state: the previous significant character was "=".
    //   whitespace -> stay
    //   '"' / "'"  -> quoted value states
    //   ">"        -> value is missing: store the empty value, emit the tag, back to data
    //   "/"        -> first character of an unquoted value (e.g. href=/path)
    //   EOF        -> stop (returns undefined)
    //   other      -> unquoted value; the character is reconsumed there
    // Previously ">" and "/" were swallowed by the whitespace branch, so
    // `<a href=>` never closed and a leading "/" of a value was lost.
    function beforeAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeValue
      }
      if (char === "\"") {
        return doubleQuotedAttributeValue
      }
      if (char === "'") {
        return singleQuotedAttributeValue
      }
      if (char === ">") {
        currentToken[currentAttribute.name] = currentAttribute.value
        emit(currentToken)
        return data
      }
      if (char === "/") {
        currentAttribute.value += char
        return UnquotedAttributeValue
      }
      return UnquotedAttributeValue(char)
    }

    // Double-quoted attribute value state.
    //   '"'   -> value finished: store it on currentToken, go to afterQuotedAttributeValue
    //   EOF   -> stop (returns undefined)
    //   NUL   -> recorded as U+FFFD so the machine keeps a valid next state
    //   other -> appended to the value
    function doubleQuotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (char === "\"") {
        currentToken[currentAttribute.name] = currentAttribute.value
        return afterQuotedAttributeValue
      }
      currentAttribute.value += char === "\u0000" ? "\uFFFD" : char
      return doubleQuotedAttributeValue
    }

    // Single-quoted attribute value state.
    //   "'"   -> value finished: store it on currentToken, go to afterQuotedAttributeValue
    //   EOF   -> stop (returns undefined)
    //   NUL   -> recorded as U+FFFD so the machine keeps a valid next state
    //   other -> appended to the value
    function singleQuotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (char === "'") {
        currentToken[currentAttribute.name] = currentAttribute.value
        return afterQuotedAttributeValue
      }
      currentAttribute.value += char === "\u0000" ? "\uFFFD" : char
      return singleQuotedAttributeValue
    }

    // After quoted attribute value state: a closing quote was just consumed
    // (the quoted-value states have already stored the attribute on the token).
    //   whitespace -> beforeAttributeName
    //   "/"        -> selfClosingStartTag
    //   ">"        -> emit currentToken, back to data
    //   EOF        -> stop (returns undefined)
    //   other      -> whitespace between attributes is missing (`a="1"b="2"`);
    //                 reconsume the character in beforeAttributeName
    function afterQuotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        return beforeAttributeName
      }
      if (char === "/") {
        return selfClosingStartTag
      }
      if (char === ">") {
        emit(currentToken)
        return data
      }
      return beforeAttributeName(char)
    }

    // Unquoted attribute value state.
    //   whitespace -> store the attribute, go to beforeAttributeName
    //   "/"        -> store the attribute, go to selfClosingStartTag
    //   ">"        -> store the attribute, emit currentToken, back to data
    //   EOF        -> stop (returns undefined)
    //   NUL        -> recorded as U+FFFD
    //   other      -> appended to the value; this includes '"', "'", "<", "="
    //                 and "`", which are unusual here but still part of the value
    function UnquotedAttributeValue(char) {
      if (char === EOF) {
        return
      }
      if (/^[\t\n\f ]$/.test(char)) {
        currentToken[currentAttribute.name] = currentAttribute.value
        return beforeAttributeName
      }
      if (char === "/") {
        currentToken[currentAttribute.name] = currentAttribute.value
        return selfClosingStartTag
      }
      if (char === ">") {
        currentToken[currentAttribute.name] = currentAttribute.value
        emit(currentToken)
        return data
      }
      currentAttribute.value += char === "\u0000" ? "\uFFFD" : char
      return UnquotedAttributeValue
    }


    // Self-closing start tag state: the previous character was "/" inside a tag.
    //   ">"   -> mark currentToken as self-closing, emit it, back to data
    //   EOF   -> stop (returns undefined). EOF is a Symbol, so it is compared
    //            with the EOF constant, not with the string "EOF".
    //   other -> the "/" was stray; reconsume the character in beforeAttributeName
    function selfClosingStartTag(char) {
      if (char === EOF) {
        return
      }
      if (char === ">") {
        currentToken.isSelfClosing = true
        emit(currentToken)
        return data
      }
      return beforeAttributeName(char)
    }

    // Parses `html` and returns the root node of the resulting tree
    // ({type: "document", children: [...]}).
    // The tokenizer and tree-builder state lives in module-level variables, so
    // it is reset on every call; otherwise a second call would append to the
    // tree built by the first one.
    module.exports.parseHTML = function parseHTML(html){

      currentToken = null
      currentAttribute = null
      currentTextNode = null
      stack = [{type: "document", children: []}]

      let state = data

      for (const char of html) {
        state = state(char)
      }

      state(EOF)

      // The document node always stays at the bottom of the stack.
      return stack[0]
    }
  • 运行结果

    • 第七步运行结果

参考文献

写在后面