#
# Secret Labs' Regular Expression Engine
#
# various symbols used by the regular expression engine.
#
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
paul@6 | 10 | |
"""Internal support module for sre"""

# update when constants are added or removed

MAGIC = 20031017

# max code word in this release

MAXREPEAT = 65535
paul@6 | 20 | |
# SRE standard exception (access as sre.error)
# should this really be here?

class error(Exception):
    """SRE standard exception.

    A plain Exception subclass: it adds no attributes or behaviour of
    its own.
    """
paul@6 | 26 | |
# operators
#
# Every symbol below is a string equal to its own name in lower case.

FAILURE = "failure"
SUCCESS = "success"

ANY = "any"
ANY_ALL = "any_all"
ASSERT = "assert"
ASSERT_NOT = "assert_not"
AT = "at"
BIGCHARSET = "bigcharset"
BRANCH = "branch"
CALL = "call"
CATEGORY = "category"
CHARSET = "charset"
GROUPREF = "groupref"
GROUPREF_IGNORE = "groupref_ignore"
GROUPREF_EXISTS = "groupref_exists"
IN = "in"
IN_IGNORE = "in_ignore"
INFO = "info"
JUMP = "jump"
LITERAL = "literal"
LITERAL_IGNORE = "literal_ignore"
MARK = "mark"
MAX_REPEAT = "max_repeat"
MAX_UNTIL = "max_until"
MIN_REPEAT = "min_repeat"
MIN_UNTIL = "min_until"
NEGATE = "negate"
NOT_LITERAL = "not_literal"
NOT_LITERAL_IGNORE = "not_literal_ignore"
RANGE = "range"
REPEAT = "repeat"
REPEAT_ONE = "repeat_one"
SUBPATTERN = "subpattern"
MIN_REPEAT_ONE = "min_repeat_one"

# positions
AT_BEGINNING = "at_beginning"
AT_BEGINNING_LINE = "at_beginning_line"
AT_BEGINNING_STRING = "at_beginning_string"
AT_BOUNDARY = "at_boundary"
AT_NON_BOUNDARY = "at_non_boundary"
AT_END = "at_end"
AT_END_LINE = "at_end_line"
AT_END_STRING = "at_end_string"
AT_LOC_BOUNDARY = "at_loc_boundary"
AT_LOC_NON_BOUNDARY = "at_loc_non_boundary"
AT_UNI_BOUNDARY = "at_uni_boundary"
AT_UNI_NON_BOUNDARY = "at_uni_non_boundary"

# categories
CATEGORY_DIGIT = "category_digit"
CATEGORY_NOT_DIGIT = "category_not_digit"
CATEGORY_SPACE = "category_space"
CATEGORY_NOT_SPACE = "category_not_space"
CATEGORY_WORD = "category_word"
CATEGORY_NOT_WORD = "category_not_word"
CATEGORY_LINEBREAK = "category_linebreak"
CATEGORY_NOT_LINEBREAK = "category_not_linebreak"
CATEGORY_LOC_WORD = "category_loc_word"
CATEGORY_LOC_NOT_WORD = "category_loc_not_word"
CATEGORY_UNI_DIGIT = "category_uni_digit"
CATEGORY_UNI_NOT_DIGIT = "category_uni_not_digit"
CATEGORY_UNI_SPACE = "category_uni_space"
CATEGORY_UNI_NOT_SPACE = "category_uni_not_space"
CATEGORY_UNI_WORD = "category_uni_word"
CATEGORY_UNI_NOT_WORD = "category_uni_not_word"
CATEGORY_UNI_LINEBREAK = "category_uni_linebreak"
CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak"
# Ordered code lists.  The position of a symbol in its list matters:
# these lists are later passed to makedict(), which maps each symbol
# to its index.  Do not reorder.
#
# NOTE: MAX_REPEAT and MIN_REPEAT are defined as symbols but are not
# listed in OPCODES.

OPCODES = [

    # failure=0 success=1 (just because it looks better that way :-)
    FAILURE, SUCCESS,

    ANY, ANY_ALL,
    ASSERT, ASSERT_NOT,
    AT,
    BRANCH,
    CALL,
    CATEGORY,
    CHARSET, BIGCHARSET,
    GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
    IN, IN_IGNORE,
    INFO,
    JUMP,
    LITERAL, LITERAL_IGNORE,
    MARK,
    MAX_UNTIL,
    MIN_UNTIL,
    NOT_LITERAL, NOT_LITERAL_IGNORE,
    NEGATE,
    RANGE,
    REPEAT,
    REPEAT_ONE,
    SUBPATTERN,
    MIN_REPEAT_ONE

]

ATCODES = [
    AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
    AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING,
    AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY,
    AT_UNI_NON_BOUNDARY
]

CHCODES = [
    CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE,
    CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD,
    CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_WORD,
    CATEGORY_LOC_NOT_WORD, CATEGORY_UNI_DIGIT, CATEGORY_UNI_NOT_DIGIT,
    CATEGORY_UNI_SPACE, CATEGORY_UNI_NOT_SPACE, CATEGORY_UNI_WORD,
    CATEGORY_UNI_NOT_WORD, CATEGORY_UNI_LINEBREAK,
    CATEGORY_UNI_NOT_LINEBREAK
]
paul@6 | 145 | |
def makedict(list):
    """Map each item of *list* to its zero-based position.

    list: an iterable of hashable items.  The parameter name shadows
        the builtin; it is kept unchanged so that keyword callers
        (``makedict(list=...)``) keep working.

    Returns a new dict ``{item: index}``.  If an item occurs more than
    once, the index of its last occurrence wins.
    """
    d = {}
    for i, item in enumerate(list):
        d[item] = i
    return d
paul@6 | 153 | |
# Rebind each ordered code list to a {symbol: index} lookup table
# built by makedict(); the list form is not kept.
OPCODES = makedict(OPCODES)
ATCODES = makedict(ATCODES)
CHCODES = makedict(CHCODES)
paul@6 | 157 | |
# replacement operations for "ignore case" mode
OP_IGNORE = {
    GROUPREF: GROUPREF_IGNORE,
    IN: IN_IGNORE,
    LITERAL: LITERAL_IGNORE,
    NOT_LITERAL: NOT_LITERAL_IGNORE
}

# AT_BEGINNING / AT_END -> their *_LINE variants
# (presumably applied under SRE_FLAG_MULTILINE -- confirm at the use site)
AT_MULTILINE = {
    AT_BEGINNING: AT_BEGINNING_LINE,
    AT_END: AT_END_LINE
}

# boundary positions -> their AT_LOC_* variants
AT_LOCALE = {
    AT_BOUNDARY: AT_LOC_BOUNDARY,
    AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
}

# boundary positions -> their AT_UNI_* variants
AT_UNICODE = {
    AT_BOUNDARY: AT_UNI_BOUNDARY,
    AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
}

# Only the WORD categories map to CATEGORY_LOC_* variants here; the
# digit, space and linebreak categories map to themselves.
CH_LOCALE = {
    CATEGORY_DIGIT: CATEGORY_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_LOC_WORD,
    CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK
}

# Every plain category maps to its CATEGORY_UNI_* counterpart.
CH_UNICODE = {
    CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_UNI_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_UNI_WORD,
    CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
}
paul@6 | 202 | |
# flags
SRE_FLAG_TEMPLATE = 1  # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 2  # case insensitive
SRE_FLAG_LOCALE = 4  # honour system locale
SRE_FLAG_MULTILINE = 8  # treat target as multiline string
SRE_FLAG_DOTALL = 16  # treat target as a single string
SRE_FLAG_UNICODE = 32  # use unicode locale
SRE_FLAG_VERBOSE = 64  # ignore whitespace and comments
SRE_FLAG_DEBUG = 128  # debugging

# flags for INFO primitive
SRE_INFO_PREFIX = 1  # has prefix
SRE_INFO_LITERAL = 2  # entire pattern is literal (given by prefix)
SRE_INFO_CHARSET = 4  # pattern starts with character from given set