--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyparser/pytokenizer.py Sun Jan 08 20:20:39 2017 +0100
@@ -0,0 +1,273 @@
+from pyparser import automata
+from pyparser.pygram import tokens
+from pyparser.pytoken import python_opmap
+from pyparser.error import TokenError, TokenIndentationError
+from pyparser.pytokenize import tabsize, whiteSpaceDFA, \
+    triple_quoted, endDFAs, single_quoted, pseudoDFA
+from pyparser import consts
+
+NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
+NUMCHARS = '0123456789'
+ALNUMCHARS = NAMECHARS + NUMCHARS
+EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.'
+WHITESPACES = ' \t\n\r\v\f'
+
+def match_encoding_declaration(comment):
+    """returns the declared encoding or None
+
+    This function is a replacement for:
+    >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
+    >>> py_encoding.search(comment)
+    """
+    index = comment.find('coding')
+    if index < 0:
+        return None
+    if index + 6 >= len(comment):
+        # 'coding' sits at the very end of the comment, so no ':' or '='
+        # can follow; without this guard the indexing below raises IndexError
+        return None
+    next_char = comment[index + 6]
+    if next_char not in ':=':
+        return None
+    end_of_decl = comment[index + 7:]
+    index = 0
+    for char in end_of_decl:
+        if char not in WHITESPACES:
+            break
+        index += 1
+    else:
+        return None
+    encoding = ''
+    for char in end_of_decl[index:]:
+        if char in EXTENDED_ALNUMCHARS:
+            encoding += char
+        else:
+            break
+    if encoding != '':
+        return encoding
+    return None
+
+
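+# Illustrative usage (hypothetical examples, not part of the original
+# changeset):
+#     match_encoding_declaration("# -*- coding: utf-8 -*-")  # -> 'utf-8'
+#     match_encoding_declaration("# just a comment")         # -> None
+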
+DUMMY_DFA = automata.DFA([], [])
+
+def generate_tokens(lines, flags):
+    """
+    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
+    the original function is not RPython (it uses yield).
+    It was also slightly modified: instead of yielding, it appends 5-tuples
+    to a list that is returned.  Each 5-tuple consists of
+
+    * the token type
+    * the token string
+    * the line number (the real one, counting continuation lines)
+    * the position on the line of the start of the token
+    * the whole line as a string.
+
+    Original docstring ::
+
+        The generate_tokens() generator requires one argument, readline, which
+        must be a callable object which provides the same interface as the
+        readline() method of built-in file objects. Each call to the function
+        should return one line of input as a string.
+
+        The generator produces 5-tuples with these members: the token type; the
+        token string; a 2-tuple (srow, scol) of ints specifying the row and
+        column where the token begins in the source; a 2-tuple (erow, ecol) of
+        ints specifying the row and column where the token ends in the source;
+        and the line on which the token was found. The line passed is the
+        logical line; continuation lines are included.
+    """
+    token_list = []
+    lnum = parenlev = continued = 0
+    namechars = NAMECHARS
+    numchars = NUMCHARS
+    contstr, needcont = '', 0
+    contline = None
+    indents = [0]
+    last_comment = ''
+    parenlevstart = (0, 0, "")
+
+    # make the annotator happy
+    endDFA = DUMMY_DFA
+    # make the annotator happy
+    line = ''
+    pos = 0
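+    # sentinel: an empty last line guarantees that the loop below always
+    # reaches its end-of-file branches (EOF errors or the final break)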
+    lines.append("")
+    strstart = (0, 0, "")
+    for line in lines:
+        lnum = lnum + 1
+        line = universal_newline(line)
+        pos, max = 0, len(line)
+
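+        # a string literal is continuing from a previous line: either an
+        # unfinished triple-quoted string, or a single-quoted string whose
+        # line ended with a backslash (needcont)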
+        if contstr:
+            if not line:
+                raise TokenError(
+                    "EOF while scanning triple-quoted string literal",
+                    strstart[2], strstart[0], strstart[1]+1,
+                    token_list, lnum-1)
+            endmatch = endDFA.recognize(line)
+            if endmatch >= 0:
+                pos = end = endmatch
+                tok = (tokens.STRING, contstr + line[:end], strstart[0],
+                       strstart[1], line)
+                token_list.append(tok)
+                last_comment = ''
+                contstr, needcont = '', 0
+                contline = None
+            elif (needcont and not line.endswith('\\\n') and
+                  not line.endswith('\\\r\n')):
+                tok = (tokens.ERRORTOKEN, contstr + line, strstart[0],
+                       strstart[1], line)
+                token_list.append(tok)
+                last_comment = ''
+                contstr = ''
+                contline = None
+                continue
+            else:
+                contstr = contstr + line
+                contline = contline + line
+                continue
+
+        elif parenlev == 0 and not continued:  # new statement
+            if not line: break
+            column = 0
+            while pos < max:                   # measure leading whitespace
+                if line[pos] == ' ': column = column + 1
+                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
+                elif line[pos] == '\f': column = 0
+                else: break
+                pos = pos + 1
+            if pos == max: break
+
+            if line[pos] in '#\r\n':
+                # skip comments or blank lines
+                continue
+
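+            # compare the measured column against the indents stack: a deeper
+            # column opens a new level (INDENT), a shallower one pops levels
+            # (DEDENT) until an exactly matching outer level is found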
+            if column > indents[-1]:           # count indents or dedents
+                indents.append(column)
+                token_list.append((tokens.INDENT, line[:pos], lnum, 0, line))
+                last_comment = ''
+            while column < indents[-1]:
+                indents = indents[:-1]
+                token_list.append((tokens.DEDENT, '', lnum, pos, line))
+                last_comment = ''
+            if column != indents[-1]:
+                err = "unindent does not match any outer indentation level"
+                raise TokenIndentationError(err, line, lnum, 0, token_list)
+
+        else:                                  # continued statement
+            if not line:
+                if parenlev > 0:
+                    lnum1, start1, line1 = parenlevstart
+                    raise TokenError("parenthesis is never closed", line1,
+                                     lnum1, start1 + 1, token_list, lnum)
+                raise TokenError("EOF in multi-line statement", line,
+                                 lnum, 0, token_list)
+            continued = 0
+
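+        # scan the rest of the line one pseudo-token at a time; the pseudoDFA
+        # matches optional leading whitespace plus a single token, and the
+        # whiteSpaceDFA below recovers where the token itself starts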
+        while pos < max:
+            pseudomatch = pseudoDFA.recognize(line, pos)
+            if pseudomatch >= 0:                            # scan for tokens
+                # JDR: Modified
+                start = whiteSpaceDFA.recognize(line, pos)
+                if start < 0:
+                    start = pos
+                end = pseudomatch
+
+                if start == end:
+                    raise TokenError("Unknown character", line,
+                                     lnum, start + 1, token_list)
+
+                pos = end
+                token, initial = line[start:end], line[start]
+                if initial in numchars or \
+                   (initial == '.' and token != '.'):       # ordinary number
+                    token_list.append((tokens.NUMBER, token, lnum, start, line))
+                    last_comment = ''
+                elif initial in '\r\n':
+                    if parenlev <= 0:
+                        tok = (tokens.NEWLINE, last_comment, lnum, start, line)
+                        token_list.append(tok)
+                    last_comment = ''
+                elif initial == '#':
+                    # skip comment
+                    last_comment = token
+                elif token in triple_quoted:
+                    endDFA = endDFAs[token]
+                    endmatch = endDFA.recognize(line, pos)
+                    if endmatch >= 0:                       # all on one line
+                        pos = endmatch
+                        token = line[start:pos]
+                        tok = (tokens.STRING, token, lnum, start, line)
+                        token_list.append(tok)
+                        last_comment = ''
+                    else:
+                        strstart = (lnum, start, line)
+                        contstr = line[start:]
+                        contline = line
+                        break
+                elif initial in single_quoted or \
+                        token[:2] in single_quoted or \
+                        token[:3] in single_quoted:
+                    if token[-1] == '\n':                   # continued string
+                        strstart = (lnum, start, line)
+                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
+                                  endDFAs[token[2]])
+                        contstr, needcont = line[start:], 1
+                        contline = line
+                        break
+                    else:                                   # ordinary string
+                        tok = (tokens.STRING, token, lnum, start, line)
+                        token_list.append(tok)
+                        last_comment = ''
+                elif initial in namechars:                  # ordinary name
+                    token_list.append((tokens.NAME, token, lnum, start, line))
+                    last_comment = ''
+                elif initial == '\\':                       # continued stmt
+                    continued = 1
+                else:
+                    if initial in '([{':
+                        if parenlev == 0:
+                            parenlevstart = (lnum, start, line)
+                        parenlev = parenlev + 1
+                    elif initial in ')]}':
+                        parenlev = parenlev - 1
+                        if parenlev < 0:
+                            raise TokenError("unmatched '%s'" % initial, line,
+                                             lnum, start + 1, token_list)
+                    if token in python_opmap:
+                        punct = python_opmap[token]
+                    else:
+                        punct = tokens.OP
+                    token_list.append((punct, token, lnum, start, line))
+                    last_comment = ''
+            else:
+                start = whiteSpaceDFA.recognize(line, pos)
+                if start < 0:
+                    start = pos
+                if start < max and line[start] in single_quoted:
+                    raise TokenError("EOL while scanning string literal",
+                                     line, lnum, start + 1, token_list)
+                tok = (tokens.ERRORTOKEN, line[pos], lnum, pos, line)
+                token_list.append(tok)
+                last_comment = ''
+                pos = pos + 1
+
+    lnum -= 1
+    if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
+        if token_list and token_list[-1][0] != tokens.NEWLINE:
+            tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+            token_list.append(tok)
+        for indent in indents[1:]:             # pop remaining indent levels
+            token_list.append((tokens.DEDENT, '', lnum, pos, line))
+    tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+    token_list.append(tok)
+
+    token_list.append((tokens.ENDMARKER, '', lnum, pos, line))
+    return token_list
+
+
+def universal_newline(line):
+    # show annotator that indexes below are non-negative
+    line_len_m2 = len(line) - 2
+    if line_len_m2 >= 0 and line[-2] == '\r' and line[-1] == '\n':
+        return line[:line_len_m2] + '\n'
+    line_len_m1 = len(line) - 1
+    if line_len_m1 >= 0 and line[-1] == '\r':
+        return line[:line_len_m1] + '\n'
+    return line
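+
+# Illustrative behaviour (hypothetical examples, not part of the original
+# changeset):
+#     universal_newline("pass\r\n")  # -> "pass\n"
+#     universal_newline("pass\r")    # -> "pass\n"
+#     universal_newline("pass\n")    # -> "pass\n" (unchanged)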