#!/usr/bin/env python

"""
Lexical partitioning of HTML document content.

Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

# Lexical analysis state transition handler functions.
#
# Each handler is called with the complete text and the position at which the
# token causing the transition starts. It returns the state (span class) that
# applies to the text following that token.

def tag_or_similar(text, pos):

    """
    Return the state for the construct introduced by the "<" found at 'pos' in
    'text': IN_INCLUSION for "<![", IN_COMMENT for "<!--", IN_DIRECTIVE for any
    other "<!" construct, and IN_TAG for everything else.
    """

    # Consult the text positions following the position indicated.

    if text[pos:pos+2] == "<!":

        # The "<!" prefix occupies two characters, so the characters that
        # distinguish inclusions and comments from directives start at pos + 2.

        following = pos + 2

        if text.startswith("[", following):
            return IN_INCLUSION
        elif text.startswith("--", following):
            return IN_COMMENT
        else:
            return IN_DIRECTIVE
    else:
        return IN_TAG

def at_attribute_value(text, pos):

    "Return the state applying after the token at 'pos': AT_ATTRIBUTE_VALUE."

    return AT_ATTRIBUTE_VALUE

def in_dq_attribute_value(text, pos):

    "Return the state applying after the token at 'pos': IN_DQ_ATTRIBUTE_VALUE."

    return IN_DQ_ATTRIBUTE_VALUE

def in_sq_attribute_value(text, pos):

    "Return the state applying after the token at 'pos': IN_SQ_ATTRIBUTE_VALUE."

    return IN_SQ_ATTRIBUTE_VALUE

def after_attribute_value(text, pos):

    "Return the state applying after the token at 'pos': AFTER_ATTRIBUTE_VALUE."

    return AFTER_ATTRIBUTE_VALUE

def end_of_standalone_tag(text, pos):

    "Return the state applying after the token at 'pos': AT_END_OF_TAG."

    return AT_END_OF_TAG

def end_of_tag(text, pos):

    "Return the state applying after the token at 'pos': BETWEEN_TAGS."

    return BETWEEN_TAGS



# Lexical analysis states/spans.
class Span:

    """
    A span of text associated with a lexical state. Each subclass represents a
    state and provides a 'transitions' list of (token string, extra string,
    handler function) tuples in the form consumed by find_one.
    """

    def __init__(self, text):
        self.text = text

    def empty(self):

        "Return whether this span has no text."

        return not self.text

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.text)

class AT_END_OF_TAG(Span):

    "A marker span following the end of a standalone tag."

    # A token of None makes the transition unconditional.

    transitions = [(None, "", end_of_tag)]

    def empty(self):

        "Return False: the marker is never regarded as empty."

        return False

    def visit(self, visitor):
        return visitor.at_end_of_tag(self)

class BETWEEN_TAGS(Span):

    "Text appearing outside tags and similar constructs."

    transitions = [("<", "", tag_or_similar)]

    def visit(self, visitor):
        return visitor.between_tags(self)

class IN_TAG(Span):

    "Text appearing inside a tag before any attribute value."

    transitions = [
        ("=", "", at_attribute_value),
        ("/>", "", end_of_standalone_tag),
        (">", "", end_of_tag),
        ]

    def visit(self, visitor):
        return visitor.in_tag(self)

class IN_COMMENT(Span):

    "Text appearing inside a comment."

    # The "--" of the terminating token is retained at the end of the span.

    transitions = [("-->", "--", end_of_tag)]

    def visit(self, visitor):
        return visitor.in_comment(self)

class IN_DIRECTIVE(Span):

    "Text appearing inside a directive."

    transitions = [(">", "", end_of_tag)]

    def visit(self, visitor):
        return visitor.in_directive(self)

class IN_INCLUSION(Span):

    "Text appearing inside an inclusion."

    # The "]]" of the terminating token is retained at the end of the span.

    transitions = [("]]>", "]]", end_of_tag)]

    def visit(self, visitor):
        return visitor.in_inclusion(self)

class AFTER_ATTRIBUTE_VALUE(Span):

    "Text appearing inside a tag after a quoted attribute value."

    transitions = [
        ("=", "", at_attribute_value),
        ("/>", "", end_of_standalone_tag),
        (">", "", end_of_tag),
        ]

    def empty(self):

        "Return whether this span has no text other than whitespace."

        return not self.text.strip()

    def visit(self, visitor):
        return visitor.after_attribute_value(self)

class AT_ATTRIBUTE_VALUE(Span):

    "Text appearing inside a tag after the '=' preceding an attribute value."

    transitions = [
        ("=", "", at_attribute_value),
        ('"', "", in_dq_attribute_value),
        ("'", "", in_sq_attribute_value),
        ("/>", "", end_of_standalone_tag),
        (">", "", end_of_tag),
        ]

    def empty(self):

        "Return whether this span has no text other than whitespace."

        return not self.text.strip()

    def visit(self, visitor):
        return visitor.at_attribute_value(self)

class IN_DQ_ATTRIBUTE_VALUE(Span):

    "Text appearing inside a double-quoted attribute value."

    transitions = [('"', "", after_attribute_value)]

    def visit(self, visitor):
        return visitor.in_dq_attribute_value(self)

class IN_SQ_ATTRIBUTE_VALUE(Span):

    "Text appearing inside a single-quoted attribute value."

    transitions = [("'", "", after_attribute_value)]

    def visit(self, visitor):
        return visitor.in_sq_attribute_value(self)



# Utility functions.

def find_one(text, pos, choices):

    """
    Find in 'text' from 'pos' the earliest occurring instance of one of the
    given 'choices', these being a list of (token string, extra string,
    handler function) tuples.

    The token string is a token marking the start of the next span, the extra
    string is the portion of the token to be added to the end of the current
    span upon matching, and the handler function is used by the caller to
    determine the state applying to the next span.

    A token of None matches unconditionally at 'pos' without consuming any
    text, and its choice is returned as soon as it is encountered.

    A tuple is returned of the form (handler, position of the occurrence,
    extra string, position of the text following the occurrence). If no token
    is found, every element of the tuple is None.
    """

    next_state = None
    first_pos = None
    first_extra = None
    next_pos = None

    for token, extra, state in choices:
        if token is None:
            return state, pos, extra, pos

        found_pos = text.find(token, pos)

        # Retain only the candidate occurring earliest in the text; of
        # candidates at the same position, the one listed first is kept.

        if found_pos != -1 and (next_state is None or found_pos < first_pos):
            next_state = state
            first_pos = found_pos
            first_extra = extra
            next_pos = found_pos + len(token)

    return next_state, first_pos, first_extra, next_pos



# Lexical partitioning.

class Lexer:

    """
    An iterator partitioning text into spans, each an instance of the state
    class applying to its portion of the text.
    """

    def __init__(self, text):
        self.text = text
        self.state = BETWEEN_TAGS

        # The position is set to None once the input is exhausted.

        self.pos = 0

    def _end_of_input(self):

        "Return the remaining text as a final span, marking the input as used."

        start = self.pos
        self.pos = None
        return self._span(self.text[start:])

    def _span(self, text):

        "Return a span of the current state holding 'text'."

        return self.state(text)

    def __iter__(self):
        return self

    def next(self):

        """
        Return the next span from the text, raising StopIteration once the
        final span has been returned.
        """

        if self.pos is None:
            raise StopIteration

        # Obtain details of a state transition: a handler function to determine
        # the next state, and the start and end positions of the token causing
        # the transition.

        handler, pos, extra, next_pos = find_one(self.text, self.pos, self.state.transitions)

        if handler is None:
            return self._end_of_input()

        # Obtain the lexical span and update the state and position.

        span = self._span(self.text[self.pos:pos] + extra)

        self.state = handler(self.text, pos)
        self.pos = next_pos

        return span

    # Support the Python 3 iterator protocol alongside the Python 2 one.

    __next__ = next

# vim: tabstop=4 expandtab shiftwidth=4