MoinLight (file moinformat/utils/htmlparse/lex.py at f19281465a63)

     1 #!/usr/bin/env python     2      3 """     4 Lexical partitioning of HTML document content.     5      6 Copyright (C) 2023 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 # Lexical analysis state transition handler functions.    23     24 def tag_or_similar(text, pos):    25     26     # Consult the text positions following the position indicated.    27     28     if text[pos:pos+2] == "<!":    29         if text[pos+3:pos+4] == "[":    30             return IN_INCLUSION    31         elif text[pos+3:pos+5] == "--":    32             return IN_COMMENT    33         else:    34             return IN_DIRECTIVE    35     else:    36         return IN_TAG    37     38 def at_attribute_value(text, pos):    39     return AT_ATTRIBUTE_VALUE    40     41 def in_dq_attribute_value(text, pos):    42     return IN_DQ_ATTRIBUTE_VALUE    43     44 def in_sq_attribute_value(text, pos):    45     return IN_SQ_ATTRIBUTE_VALUE    46     47 def after_attribute_value(text, pos):    48     return AFTER_ATTRIBUTE_VALUE    49     50 def end_of_standalone_tag(text, pos):    51     return AT_END_OF_TAG    52     53 def end_of_tag(text, pos):    54     return BETWEEN_TAGS    55     56     57     58 # Lexical analysis states/spans.    59     60 class Span:    61     def __init__(self, text):    62         self.text = text    63     64     def empty(self):    65         return not self.text    66     67     def __repr__(self):    68         return "%s(%r)" % (self.__class__.__name__, self.text)    69     70 class AT_END_OF_TAG(Span):    71     transitions = [(None, "", end_of_tag)]    72     73     def empty(self):    74         return False    75     76     def visit(self, visitor):    77         return visitor.at_end_of_tag(self)    78     79 class BETWEEN_TAGS(Span):    80     transitions = [("<", "", tag_or_similar)]    81     82     def visit(self, visitor):    83         return visitor.between_tags(self)    84     85 class IN_TAG(Span):    86     transitions = [    87         ("=", "", at_attribute_value),    88         ("/>", "", end_of_standalone_tag),    89         (">", "", end_of_tag),    90         ]    91     92     def visit(self, visitor):    93         return visitor.in_tag(self)    94     95 class IN_COMMENT(Span):    96     transitions = [("-->", "--", end_of_tag)]    97     98     def visit(self, visitor):    99         return visitor.in_comment(self)   100    101 class IN_DIRECTIVE(Span):   102     transitions = [(">", "", end_of_tag)]   103    104     def visit(self, visitor):   105         return visitor.in_directive(self)   106    107 class IN_INCLUSION(Span):   108     transitions = [("]]>", "]]", end_of_tag)]   109    110     def visit(self, visitor):   111         return visitor.in_inclusion(self)   112    113 class AFTER_ATTRIBUTE_VALUE(Span):   114     transitions = [   115         ("=", "", at_attribute_value),   116         ("/>", "", end_of_standalone_tag),   117         (">", "", end_of_tag),   118         ]   119    120     def empty(self):   121         return not self.text.strip()   122    123     def visit(self, visitor):   124         return visitor.after_attribute_value(self)   125    126 class AT_ATTRIBUTE_VALUE(Span):   127     transitions = [   128         ("=", "", at_attribute_value),   129         ('"', "", in_dq_attribute_value),   130         ("'", "", in_sq_attribute_value),   131         ("/>", "", end_of_standalone_tag),   132         (">", "", end_of_tag),   133         ]   134    135     def empty(self):   136         return not self.text.strip()   137    138     def visit(self, visitor):   139         return visitor.at_attribute_value(self)   140    141 class IN_DQ_ATTRIBUTE_VALUE(Span):   142     transitions = [('"', "", after_attribute_value)]   143    144     def visit(self, visitor):   145         return visitor.in_dq_attribute_value(self)   146    147 class IN_SQ_ATTRIBUTE_VALUE(Span):   148     transitions = [("'", "", after_attribute_value)]   149    150     def visit(self, visitor):   151         return visitor.in_sq_attribute_value(self)   152    153    154    155 # Utility functions.   156    157 def find_one(text, pos, choices):   158    159     """   160     Find in 'text' from 'pos' the earliest occurring instance of one of the   161     given 'choices', these being a list of (token string, extra string, state)   162     tuples.   163    164     The token string is a token marking the start of the next span, the extra   165     string is the portion of the token to be added to the end of the current   166     span upon matching, and the state applies to the next span.   167    168     The associated state, the position of the occurrence, and the position of   169     the text following the occurrence are returned as a tuple.   170     """   171    172     next_state = None   173     first_pos = None   174     first_extra = None   175     next_pos = None   176    177     for token, extra, state in choices:   178         if token is None:   179             return state, pos, extra, pos   180    181         found_pos = text.find(token, pos)   182    183         if found_pos != -1 and (next_state is None or found_pos < first_pos):   184             next_state = state   185             first_pos = found_pos   186             first_extra = extra   187             next_pos = found_pos + len(token)   188    189     return next_state, first_pos, first_extra, next_pos   190    191    192    193 # Lexical partitioning.   194    195 class Lexer:   196     def __init__(self, text):   197         self.text = text   198         self.state = BETWEEN_TAGS   199         self.pos = 0   200    201     def _end_of_input(self):   202         start = self.pos   203         self.pos = None   204         return self._span(self.text[start:])   205    206     def _span(self, text):   207         return self.state(text)   208    209     def __iter__(self):   210         return self   211    212     def next(self):   213         if self.pos is None:   214             raise StopIteration   215    216         # Obtain details of a state transition: a handler function to determine   217         # the next state, and the start and end positions of the token causing   218         # the transition.   219    220         handler, pos, extra, next_pos = find_one(self.text, self.pos, self.state.transitions)   221    222         if handler is None:   223             return self._end_of_input()   224    225         # Obtain the lexical span and update the state and position.   226    227         span = self._span(self.text[self.pos:pos] + extra)   228    229         self.state = handler(self.text, pos)   230         self.pos = next_pos   231    232         return span   233    234 # vim: tabstop=4 expandtab shiftwidth=4