#!/usr/bin/env python

"""
An absurdly minimal HTML tokeniser.

Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from collections import deque
from moinformat.utils.htmlparse.lex import Lexer



# Document token classes.

class Token:

    "A document token wrapping a single value."

    def __init__(self, value):
        self.value = value

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.value)

class Attribute(Token):

    "An attribute appearing within a tag."

    def visit(self, visitor):
        return visitor.attribute(self)

class AttributeValue(Token):

    "An attribute value together with the quote character enclosing it."

    def __init__(self, value, quote):

        # quote is the quoting character employed: '"', "'" or "" (unquoted).

        self.value = value
        self.quote = quote

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self.value, self.quote)

    def visit(self, visitor):
        return visitor.attribute_value(self)

class Comment(Token):

    "A comment region."

    def visit(self, visitor):
        return visitor.comment(self)

class Directive(Token):

    "A directive region."

    def visit(self, visitor):
        return visitor.directive(self)

class Inclusion(Token):

    "An inclusion region."

    def visit(self, visitor):
        return visitor.inclusion(self)

class Tag(Token):

    "A start or end tag, its value holding the tag name (end tags as /name)."

    def visit(self, visitor):
        return visitor.tag(self)

    def is_end(self):

        "Return whether this token represents an end tag."

        return self.value.startswith("/")

    def tag_name(self):

        "Return the tag name without any leading end tag slash."

        # NOTE: a conditional expression is used instead of the and/or idiom,
        # which would wrongly return "/" for an end tag with an empty name
        # (since an empty string is falsy).

        return self.value[1:] if self.is_end() else self.value

class TagClose:

    "The closing delimiter of a tag; carries no value."

    def visit(self, visitor):
        return visitor.tag_close(self)

    def __repr__(self):
        return "%s()" % self.__class__.__name__

class Text(Token):

    "A plain text region between tags."

    def visit(self, visitor):
        return visitor.text(self)



# Tidying visitor employing the spans from lexical partitioning.

class Visitor:

    """
    A visitor translating lexer spans into document tokens, queueing extra
    attribute tokens produced by a single span for later retrieval.
    """

    def __init__(self):
        self.queued = deque()

    def visit(self, span):

        "Dispatch to the handler method appropriate for the given span."

        return span.visit(self)

    # Specific handler methods.

    def between_tags(self, span):
        return Text(span.text)

    def in_comment(self, span):
        return Comment(span.text)

    def in_directive(self, span):
        return Directive(span.text)

    def in_inclusion(self, span):
        return Inclusion(span.text)

    def _queue_attributes(self, tokens):

        "Queue each of the given tokens as an attribute."

        for token in tokens:
            self.queued.append(Attribute(token))

    def in_tag(self, span):

        # The first word is the tag name; any remaining words are attributes
        # queued for later emission.

        tokens = span.text.split()
        self._queue_attributes(tokens[1:])
        return Tag(tokens[0])

    def at_end_of_tag(self, span):
        return TagClose()

    def after_attribute_value(self, span):

        # All words following a value are attributes. At least one word is
        # assumed to be present; the first queued token is emitted here.

        tokens = span.text.split()
        self._queue_attributes(tokens)
        return self.queued.popleft()

    def at_attribute_value(self, span):

        # The first word is an unquoted value; remaining words are attributes.

        tokens = span.text.split()
        self._queue_attributes(tokens[1:])
        return AttributeValue(tokens[0], "")

    def in_dq_attribute_value(self, span):
        return AttributeValue(span.text, '"')

    def in_sq_attribute_value(self, span):
        return AttributeValue(span.text, "'")



# Tokenising.

class Tokeniser:

    "An iterator over the tokens of a document."

    def __init__(self, text):

        "Initialise the tokeniser with the given document text."

        self.lexer = Lexer(text)
        self.visitor = Visitor()

    def __iter__(self):
        return self

    def next(self):

        """
        Return the next token, first draining any tokens queued by the
        visitor, then visiting the next non-empty span from the lexer.
        StopIteration is presumably raised by the lexer at the end of the
        input.
        """

        if self.visitor.queued:
            return self.visitor.queued.popleft()

        # Skip empty spans.

        while True:
            span = self.lexer.next()
            if not span.empty():
                break

        return self.visitor.visit(span)

    # Support the Python 3 iterator protocol: without this alias, iterating
    # over a Tokeniser with a for loop fails under Python 3, since __iter__
    # returns an object lacking __next__.

    __next__ = next

# vim: tabstop=4 expandtab shiftwidth=4