#!/usr/bin/env python

"""
An absurdly minimal HTML parser.

Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.utils.htmlparse.token import Tokeniser
from moinformat.utils.htmlparse.tree import Attribute, AttributeValue, \
                                            Comment, Directive, Element, \
                                            Fragment, Inclusion, Node, Text



# Token processing employing the tokens from tokenisation.

class Visitor:

    """
    A builder of a document tree from a stream of tokens.

    The tree is rooted at a Fragment. A stack records the chain of open nodes,
    with 'node' always referring to the innermost open node (the top of the
    stack) to which new children are appended.
    """

    def __init__(self):

        "Initialise the visitor with an empty root fragment."

        self.node = Fragment()
        self.stack = [self.node]

    def append(self, node):

        "Add 'node' as the last child of the current node."

        self.node.nodes.append(node)

    def push(self, node):

        """
        Add 'node' as a child of the current node and then make it the current
        node, so that subsequent content is placed inside it.
        """

        self.stack.append(node)

        # Append before switching the current node: the new node must become a
        # child of its parent, not of itself.

        self.append(node)
        self.node = node

    def pop(self):

        """
        Leave the current node, making its parent the current node again.

        NOTE: No guard exists against popping the root fragment. Doing so
        empties the stack and raises IndexError.
        """

        self.stack.pop()
        self.node = self.stack[-1]

    def visit(self, token):

        """
        Process 'token' by letting it dispatch to this visitor.

        The token's own visit method is called with this visitor; presumably it
        calls back the handler method below matching the token type (defined
        in the tokeniser module - confirm there).
        """

        token.visit(self)

    # Specific handler methods.

    def attribute(self, token):

        """
        Add an attribute named by the value of 'token' to the current element.

        Raise ValueError, carrying the token, if the current node is not an
        Element.
        """

        if isinstance(self.node, Element):
            self.node.attributes.append(Attribute(token.value))
        else:
            raise ValueError(token)

    def attribute_value(self, token):

        """
        Set the value of the most recently added attribute of the current
        element using the value and quote details of 'token'.

        Raise ValueError, carrying the token, if the current node is not an
        Element. An element with no attributes yet is not checked for here.
        """

        if isinstance(self.node, Element):
            self.node.attributes[-1].value = AttributeValue(token.value, token.quote)
        else:
            raise ValueError(token)

    def comment(self, token):

        "Add a Comment holding the value of 'token' to the current node."

        self.append(Comment(token.value))

    def directive(self, token):

        "Add a Directive holding the value of 'token' to the current node."

        self.append(Directive(token.value))

    def inclusion(self, token):

        "Add an Inclusion holding the value of 'token' to the current node."

        self.append(Inclusion(token.value))

    def tag(self, token):

        """
        Handle a tag 'token'.

        A tag that is not an end tag opens a new Element which becomes the
        current node. An end tag closes the current node if the names match.
        Otherwise, ValueError is raised carrying the token.
        """

        if not token.is_end():
            self.push(Element(token.tag_name()))
        elif self.node.name == token.tag_name():
            self.pop()
        else:
            raise ValueError(token)

    def tag_close(self, token):

        """
        Close the current node unconditionally.

        NOTE(review): presumably issued for the end of a self-closing tag;
        confirm against the tokeniser.
        """

        self.pop()

    def text(self, token):

        "Add a Text node holding the value of 'token' to the current node."

        self.append(Text(token.value))



# Parsing and document construction.

class Parser:

    """
    A parser feeding tokens obtained from the given text to a Visitor.

    The parser is its own iterator: each iteration step consumes one token.
    """

    def __init__(self, text):

        "Initialise the parser for the given 'text'."

        self.tokeniser = Tokeniser(text)
        self.visitor = Visitor()

    def __iter__(self):

        "Return the parser itself as the iterator."

        return self

    def next(self):

        """
        Obtain the next token and have the visitor process it. Nothing is
        returned.

        Any exception raised by the tokeniser when obtaining a token, which
        presumably includes StopIteration at the end of input, propagates to
        the caller, this being what terminates iteration in 'parse'.
        """

        # The built-in function uses whichever iterator method name is
        # appropriate for the running Python version.

        token = next(self.tokeniser)
        self.visitor.visit(token)

    # Support the Python 3 iterator protocol as well as the Python 2 one.

    __next__ = next

    def parse(self):

        """
        Consume all tokens and return the visitor's current node.

        For balanced input, this is the root Fragment. If elements remain
        open at the end of input, the innermost open node is returned instead.
        """

        for _none in self:
            pass

        return self.visitor.node

# vim: tabstop=4 expandtab shiftwidth=4