--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyparser/pytokenizer.py Sun Jan 08 20:20:39 2017 +0100
@@ -0,0 +1,273 @@
+from pyparser import automata
+from pyparser.pygram import tokens
+from pyparser.pytoken import python_opmap
+from pyparser.error import TokenError, TokenIndentationError
+from pyparser.pytokenize import tabsize, whiteSpaceDFA, \
+    triple_quoted, endDFAs, single_quoted, pseudoDFA
+from pyparser import consts
+
+NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
+NUMCHARS = '0123456789'
+ALNUMCHARS = NAMECHARS + NUMCHARS
+EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.'
+WHITESPACES = ' \t\n\r\v\f'
+
+def match_encoding_declaration(comment):
+    """returns the declared encoding or None
+
+    This function is a replacement for:
+    >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
+    >>> py_encoding.search(comment)
+    """
+    index = comment.find('coding')
+    if index < 0:
+        return None
+    if index + 6 >= len(comment):
+        # 'coding' sits at the very end of the comment, so no ':' or '='
+        # can follow; without this guard the indexing below raises IndexError
+        return None
+    next_char = comment[index + 6]
+    if next_char not in ':=':
+        return None
+    end_of_decl = comment[index + 7:]
+    index = 0
+    for char in end_of_decl:
+        if char not in WHITESPACES:
+            break
+        index += 1
+    else:
+        return None
+    encoding = ''
+    for char in end_of_decl[index:]:
+        if char in EXTENDED_ALNUMCHARS:
+            encoding += char
+        else:
+            break
+    if encoding != '':
+        return encoding
+    return None
+
+
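+# Illustrative usage (hypothetical examples, not part of the original
+# changeset):
+#     match_encoding_declaration("# -*- coding: utf-8 -*-")  # -> 'utf-8'
+#     match_encoding_declaration("# just a comment")         # -> None
+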
+DUMMY_DFA = automata.DFA([], [])
+
+def generate_tokens(lines, flags):
+    """
+    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
+    the original function is not RPython (it uses yield).
+    It was also slightly modified: instead of yielding, it appends 5-tuples
+    to a list that is returned.  Each 5-tuple consists of
+
+    * the token type
+    * the token string
+    * the line number (the real one, counting continuation lines)
+    * the position on the line of the start of the token
+    * the whole line as a string.
+
+    Original docstring ::
+
+        The generate_tokens() generator requires one argument, readline, which
+        must be a callable object which provides the same interface as the
+        readline() method of built-in file objects. Each call to the function
+        should return one line of input as a string.
+
+        The generator produces 5-tuples with these members: the token type; the
+        token string; a 2-tuple (srow, scol) of ints specifying the row and
+        column where the token begins in the source; a 2-tuple (erow, ecol) of
+        ints specifying the row and column where the token ends in the source;
+        and the line on which the token was found. The line passed is the
+        logical line; continuation lines are included.
+    """
+    token_list = []
+    lnum = parenlev = continued = 0
+    namechars = NAMECHARS
+    numchars = NUMCHARS
+    contstr, needcont = '', 0
+    contline = None
+    indents = [0]
+    last_comment = ''
+    parenlevstart = (0, 0, "")
+
+    # make the annotator happy
+    endDFA = DUMMY_DFA
+    # make the annotator happy
+    line = ''
+    pos = 0
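+    # sentinel: an empty last line guarantees that the loop below always
+    # reaches its end-of-file branches (EOF errors or the final break)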
+    lines.append("")
+    strstart = (0, 0, "")
+    for line in lines:
+        lnum = lnum + 1
+        line = universal_newline(line)
+        pos, max = 0, len(line)
+
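+        # a string literal is continuing from a previous line: either an
+        # unfinished triple-quoted string, or a single-quoted string whose
+        # line ended with a backslash (needcont)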
+        if contstr:
+            if not line:
+                raise TokenError(
+                    "EOF while scanning triple-quoted string literal",
+                    strstart[2], strstart[0], strstart[1]+1,
+                    token_list, lnum-1)
+            endmatch = endDFA.recognize(line)
+            if endmatch >= 0:
+                pos = end = endmatch
+                tok = (tokens.STRING, contstr + line[:end], strstart[0],
+                       strstart[1], line)
+                token_list.append(tok)
+                last_comment = ''
+                contstr, needcont = '', 0
+                contline = None
+            elif (needcont and not line.endswith('\\\n') and
+                  not line.endswith('\\\r\n')):
+                tok = (tokens.ERRORTOKEN, contstr + line, strstart[0],
+                       strstart[1], line)
+                token_list.append(tok)
+                last_comment = ''
+                contstr = ''
+                contline = None
+                continue
+            else:
+                contstr = contstr + line
+                contline = contline + line
+                continue
+
+        elif parenlev == 0 and not continued:  # new statement
+            if not line: break
+            column = 0
+            while pos < max:                   # measure leading whitespace
+                if line[pos] == ' ': column = column + 1
+                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
+                elif line[pos] == '\f': column = 0
+                else: break
+                pos = pos + 1
+            if pos == max: break
+
+            if line[pos] in '#\r\n':
+                # skip comments or blank lines
+                continue
+
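+            # compare the measured column against the indents stack: a deeper
+            # column opens a new level (INDENT), a shallower one pops levels
+            # (DEDENT) until an exactly matching outer level is found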
+            if column > indents[-1]:           # count indents or dedents
+                indents.append(column)
+                token_list.append((tokens.INDENT, line[:pos], lnum, 0, line))
+                last_comment = ''
+            while column < indents[-1]:
+                indents = indents[:-1]
+                token_list.append((tokens.DEDENT, '', lnum, pos, line))
+                last_comment = ''
+            if column != indents[-1]:
+                err = "unindent does not match any outer indentation level"
+                raise TokenIndentationError(err, line, lnum, 0, token_list)
+
+        else:                                  # continued statement
+            if not line:
+                if parenlev > 0:
+                    lnum1, start1, line1 = parenlevstart
+                    raise TokenError("parenthesis is never closed", line1,
+                                     lnum1, start1 + 1, token_list, lnum)
+                raise TokenError("EOF in multi-line statement", line,
+                                 lnum, 0, token_list)
+            continued = 0
+
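+        # scan the rest of the line one pseudo-token at a time; the pseudoDFA
+        # matches optional leading whitespace plus a single token, and the
+        # whiteSpaceDFA below recovers where the token itself starts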
+        while pos < max:
+            pseudomatch = pseudoDFA.recognize(line, pos)
+            if pseudomatch >= 0:                            # scan for tokens
+                # JDR: Modified
+                start = whiteSpaceDFA.recognize(line, pos)
+                if start < 0:
+                    start = pos
+                end = pseudomatch
+
+                if start == end:
+                    raise TokenError("Unknown character", line,
+                                     lnum, start + 1, token_list)
+
+                pos = end
+                token, initial = line[start:end], line[start]
+                if initial in numchars or \
+                   (initial == '.' and token != '.'):       # ordinary number
+                    token_list.append((tokens.NUMBER, token, lnum, start, line))
+                    last_comment = ''
+                elif initial in '\r\n':
+                    if parenlev <= 0:
+                        tok = (tokens.NEWLINE, last_comment, lnum, start, line)
+                        token_list.append(tok)
+                    last_comment = ''
+                elif initial == '#':
+                    # skip comment
+                    last_comment = token
+                elif token in triple_quoted:
+                    endDFA = endDFAs[token]
+                    endmatch = endDFA.recognize(line, pos)
+                    if endmatch >= 0:                       # all on one line
+                        pos = endmatch
+                        token = line[start:pos]
+                        tok = (tokens.STRING, token, lnum, start, line)
+                        token_list.append(tok)
+                        last_comment = ''
+                    else:
+                        strstart = (lnum, start, line)
+                        contstr = line[start:]
+                        contline = line
+                        break
+                elif initial in single_quoted or \
+                        token[:2] in single_quoted or \
+                        token[:3] in single_quoted:
+                    if token[-1] == '\n':                   # continued string
+                        strstart = (lnum, start, line)
+                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
+                                  endDFAs[token[2]])
+                        contstr, needcont = line[start:], 1
+                        contline = line
+                        break
+                    else:                                   # ordinary string
+                        tok = (tokens.STRING, token, lnum, start, line)
+                        token_list.append(tok)
+                        last_comment = ''
+                elif initial in namechars:                  # ordinary name
+                    token_list.append((tokens.NAME, token, lnum, start, line))
+                    last_comment = ''
+                elif initial == '\\':                       # continued stmt
+                    continued = 1
+                else:
+                    if initial in '([{':
+                        if parenlev == 0:
+                            parenlevstart = (lnum, start, line)
+                        parenlev = parenlev + 1
+                    elif initial in ')]}':
+                        parenlev = parenlev - 1
+                        if parenlev < 0:
+                            raise TokenError("unmatched '%s'" % initial, line,
+                                             lnum, start + 1, token_list)
+                    if token in python_opmap:
+                        punct = python_opmap[token]
+                    else:
+                        punct = tokens.OP
+                    token_list.append((punct, token, lnum, start, line))
+                    last_comment = ''
+            else:
+                start = whiteSpaceDFA.recognize(line, pos)
+                if start < 0:
+                    start = pos
+                if start < max and line[start] in single_quoted:
+                    raise TokenError("EOL while scanning string literal",
+                                     line, lnum, start + 1, token_list)
+                tok = (tokens.ERRORTOKEN, line[pos], lnum, pos, line)
+                token_list.append(tok)
+                last_comment = ''
+                pos = pos + 1
+
+    lnum -= 1
+    if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
+        if token_list and token_list[-1][0] != tokens.NEWLINE:
+            tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+            token_list.append(tok)
+        for indent in indents[1:]:             # pop remaining indent levels
+            token_list.append((tokens.DEDENT, '', lnum, pos, line))
+    tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+    token_list.append(tok)
+
+    token_list.append((tokens.ENDMARKER, '', lnum, pos, line))
+    return token_list
+
+
+def universal_newline(line):
+    # show annotator that indexes below are non-negative
+    line_len_m2 = len(line) - 2
+    if line_len_m2 >= 0 and line[-2] == '\r' and line[-1] == '\n':
+        return line[:line_len_m2] + '\n'
+    line_len_m1 = len(line) - 1
+    if line_len_m1 >= 0 and line[-1] == '\r':
+        return line[:line_len_m1] + '\n'
+    return line
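+
+# Illustrative behaviour (hypothetical examples, not part of the original
+# changeset):
+#     universal_newline("pass\r\n")  # -> "pass\n"
+#     universal_newline("pass\r")    # -> "pass\n"
+#     universal_newline("pass\n")    # -> "pass\n" (unchanged)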