Changeset 45
- Timestamp:
- 02/15/06 16:57:37 (7 years ago)
- Location:
- branches/pyyaml3000
- Files:
-
- 5 added
- 5 edited
-
lib/yaml/error.py (added)
-
lib/yaml/scanner.py (modified) (15 diffs)
-
lib/yaml/stream.py (modified) (1 diff)
-
tests/data/invalid-character.stream-error (added)
-
tests/data/invalid-utf8-byte.stream-error (added)
-
tests/data/odd-utf16.stream-error (added)
-
tests/test_appliance.py (modified) (2 diffs)
-
tests/test_marker.py (modified) (2 diffs)
-
tests/test_stream.py (added)
-
tests/test_yaml.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
branches/pyyaml3000/lib/yaml/scanner.py
r44 r45 125 125 # Stream supports the following methods 126 126 # self.stream.peek(k=1) # peek the next k characters 127 # self.stream. read(k=1) # read the next k characters and move the127 # self.stream.forward(k=1) # read the next k characters and move the 128 128 # # pointer 129 129 self.stream = Stream(source, data) … … 443 443 # Add DOCUMENT-START or DOCUMENT-END. 444 444 start_marker = self.stream.get_marker() 445 self.stream. read(3)445 self.stream.forward(3) 446 446 end_marker = self.stream.get_marker() 447 447 self.tokens.append(TokenClass(start_marker, end_marker)) … … 466 466 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START. 467 467 start_marker = self.stream.get_marker() 468 self.stream. read()468 self.stream.forward() 469 469 end_marker = self.stream.get_marker() 470 470 self.tokens.append(TokenClass(start_marker, end_marker)) … … 489 489 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END. 490 490 start_marker = self.stream.get_marker() 491 self.stream. read()491 self.stream.forward() 492 492 end_marker = self.stream.get_marker() 493 493 self.tokens.append(TokenClass(start_marker, end_marker)) … … 515 515 # Add ENTRY. 516 516 start_marker = self.stream.get_marker() 517 self.stream. read()517 self.stream.forward() 518 518 end_marker = self.stream.get_marker() 519 519 self.tokens.append(EntryToken(start_marker, end_marker)) … … 541 541 # Add KEY. 542 542 start_marker = self.stream.get_marker() 543 self.stream. read()543 self.stream.forward() 544 544 end_marker = self.stream.get_marker() 545 545 self.tokens.append(KeyToken(start_marker, end_marker)) … … 577 577 # Add VALUE. 578 578 start_marker = self.stream.get_marker() 579 self.stream. read()579 self.stream.forward() 580 580 end_marker = self.stream.get_marker() 581 581 self.tokens.append(ValueToken(start_marker, end_marker)) … … 728 728 while not found: 729 729 while self.stream.peek() == u' ': 730 self.stream. 
read()730 self.stream.forward() 731 731 if self.stream.peek() == u'#': 732 732 while self.stream.peek() not in u'\r\n': 733 self.stream. read()733 self.stream.forward() 734 734 if self.stream.peek() in u'\r\n': 735 self.stream. read()735 self.stream.forward() 736 736 if not self.flow_level: 737 737 self.allow_simple_key = True … … 748 748 self.tokens.append(ReservedDirectiveToken('', marker, marker)) 749 749 while self.stream.peek() not in u'\0\r\n': 750 self.stream. read()751 self.stream. read()750 self.stream.forward() 751 self.stream.forward() 752 752 753 753 def scan_anchor(self, TokenClass): 754 754 start_marker = self.stream.get_marker() 755 755 while self.stream.peek() not in u'\0 \t\r\n,:': 756 self.stream. read()756 self.stream.forward() 757 757 end_marker = self.stream.get_marker() 758 758 self.tokens.append(TokenClass('', start_marker, end_marker)) … … 761 761 start_marker = self.stream.get_marker() 762 762 while self.stream.peek() not in u'\0 \t\r\n': 763 self.stream. read()763 self.stream.forward() 764 764 end_marker = self.stream.get_marker() 765 765 self.tokens.append(TagToken('', start_marker, end_marker)) … … 772 772 while True: 773 773 while self.stream.peek() and self.stream.peek() and self.stream.peek() not in u'\0\r\n\x85\u2028\u2029': 774 self.stream. read()774 self.stream.forward() 775 775 if self.stream.peek() != u'\0': 776 self.stream. read()776 self.stream.forward() 777 777 count = 0 778 778 while count < indent and self.stream.peek() == u' ': 779 self.stream. read()779 self.stream.forward() 780 780 count += 1 781 781 if count < indent and self.stream.peek() not in u'#\r\n\x85\u2028\u2029': … … 785 785 def scan_flow_scalar(self, double): 786 786 marker = self.stream.get_marker() 787 quote = self.stream.read() 787 quote = self.stream.peek() 788 self.stream.forward() 788 789 while self.stream.peek() != quote: 789 790 if double and self.stream.peek() == u'\\': 790 self.stream. 
read(2)791 self.stream.forward(2) 791 792 elif not double and self.stream.peek(3)[1:] == u'\'\'': 792 self.stream. read(3)793 self.stream.forward(3) 793 794 else: 794 self.stream. read(1)795 self.stream. read(1)795 self.stream.forward(1) 796 self.stream.forward(1) 796 797 self.tokens.append(ScalarToken('', False, marker, marker)) 797 798 … … 804 805 while True: 805 806 while self.stream.peek() == u' ': 806 self.stream. read()807 self.stream.forward() 807 808 space = True 808 809 while self.stream.peek() not in u'\0\r\n?:,[]{}#' \ … … 811 812 or (not self.flow_level and self.stream.peek() == ':' and self.stream.peek(2)[1] not in u' \0\r\n'): 812 813 space = self.stream.peek() not in u' \t' 813 self.stream. read()814 self.stream.forward() 814 815 self.allow_simple_key = False 815 816 if self.stream.peek() not in u'\r\n': 816 817 break 817 818 while self.stream.peek() in u'\r\n': 818 self.stream. read()819 self.stream.forward() 819 820 if not self.flow_level: 820 821 self.allow_simple_key = True 821 822 count = 0 822 823 while self.stream.peek() == u' ' and count < indent: 823 self.stream. read()824 self.stream.forward() 824 825 count += 1 825 826 if count < indent: … … 834 835 raise ScannerError(message) 835 836 837 #try: 838 # import psyco 839 # psyco.bind(Scanner) 840 #except ImportError: 841 # pass 842 -
branches/pyyaml3000/lib/yaml/stream.py
r44 r45 1 2 from marker import Marker 1 # This module contains abstractions for the input stream. You don't have to 2 # look further, there is no pretty code. 3 # 4 # We define two classes here. 5 # 6 # Marker(source, line, column) 7 # It's just a record and its only use is producing nice error messages. 8 # Parser does not use it for any other purposes. 9 # 10 # Stream(source, data) 11 # Stream determines the encoding of `data` and converts it to unicode. 12 # Stream provides the following methods and attributes: 13 # stream.peek(length=1) - return the next `length` characters 14 # stream.forward(length=1) - move the current position forward by `length` characters. 15 # stream.index - the number of the current character. 16 # stream.line, stream.column - the line and the column of the current character. 17 18 19 from error import YAMLError 20 21 import codecs, re 22 23 # Unfortunately, codec functions in Python 2.3 do not support the `finish` 24 # argument, so we have to write our own wrappers. 25 26 try: 27 codecs.utf_8_decode('', 'strict', False) 28 from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode 29 30 except TypeError: 31 32 def utf_16_le_decode(data, errors, finish=False): 33 if not finish and len(data) % 2 == 1: 34 data = data[:-1] 35 return codecs.utf_16_le_decode(data, errors) 36 37 def utf_16_be_decode(data, errors, finish=False): 38 if not finish and len(data) % 2 == 1: 39 data = data[:-1] 40 return codecs.utf_16_be_decode(data, errors) 41 42 def utf_8_decode(data, errors, finish=False): 43 if not finish: 44 # We are trying to remove a possible incomplete multibyte character 45 # from the suffix of the data. 46 # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd. 47 # All further bytes are in the range 0x80 to 0xbf. 48 # UTF-8 encoded UCS characters may be up to six bytes long. 
49 count = 0 50 while count < 5 and count < len(data) \ 51 and '\x80' <= data[-count-1] <= '\xBF': 52 count -= 1 53 if count < 5 and count < len(data) \ 54 and '\xC0' <= data[-count-1] <= '\xFD': 55 data = data[:-count-1] 56 return codecs.utf_8_decode(data, errors) 57 58 class Marker: 59 60 def __init__(self, source, line, column, buffer, pointer): 61 self.source = source 62 self.line = line 63 self.column = column 64 self.buffer = buffer 65 self.pointer = pointer 66 67 def get_snippet(self, max_length=79): 68 if self.buffer is None: 69 return None 70 head = '' 71 start = self.pointer 72 while start > 0 and self.buffer[start-1] not in u'\0\r\n\x85\u2028\u2029': 73 start -= 1 74 if self.pointer-start > max_length/2-1: 75 head = ' ... ' 76 start += 5 77 break 78 tail = '' 79 end = self.pointer 80 while end < len(self.buffer) and self.buffer[end] not in u'\0\r\n\x85\u2028\u2029': 81 end += 1 82 if end-self.pointer > max_length/2-1: 83 tail = ' ... ' 84 end -= 5 85 break 86 snippet = self.buffer[start:end].encode('utf-8') 87 return head + snippet + tail + '\n' \ 88 + ' '*(self.pointer-start+len(head)) + '^' + '\n' 89 90 class StreamError(YAMLError): 91 92 def __init__(self, source, encoding, character, position, reason): 93 self.source = source 94 self.encoding = encoding 95 self.character = character 96 self.position = position 97 self.reason = reason 98 99 def __str__(self): 100 if isinstance(self.character, str): 101 return "'%s' codec can't decode byte #x%02x: %s\n" \ 102 "\tin file '%s', position %d." \ 103 % (self.encoding, ord(self.character), self.reason, 104 self.source, self.position) 105 else: 106 return "unacceptable character #x%04x: %s\n" \ 107 "\tin file '%s', position %d." \ 108 % (ord(self.character), self.reason, 109 self.source, self.position) 3 110 4 111 class Stream: 112 # Stream: 113 # - determines the data encoding and converts it to unicode, 114 # - checks if characters are in allowed range, 115 # - adds '\0' to the end. 
116 117 # Yeah, it's ugly and slow. 5 118 6 119 def __init__(self, source, data): 7 120 self.source = source 8 self.data = unicode(data, 'utf-8')+u'\0' 121 self.stream = None 122 self.stream_pointer = 0 123 self.eof = True 124 self.buffer = u'' 125 self.pointer = 0 126 self.raw_buffer = None 127 self.raw_decoder = None 9 128 self.index = 0 10 129 self.line = 0 11 130 self.column = 0 12 13 def peek(self, k=1): 14 return self.data[self.index:self.index+k] 15 16 def read(self, k=1): 17 value = self.data[self.index:self.index+k] 18 for i in range(k): 19 if self.index >= len(self.data): 20 break 21 if self.data[self.index] in u'\r\n\x85\u2028\u2029': 131 if isinstance(data, unicode): 132 self.check_printable(data) 133 self.buffer = data+u'\0' 134 elif isinstance(data, str): 135 self.raw_buffer = data 136 self.determine_encoding() 137 else: 138 self.stream = data 139 self.eof = False 140 self.raw_buffer = '' 141 self.determine_encoding() 142 143 def peek(self, length=1): 144 if self.pointer+length >= len(self.buffer): 145 self.update(length) 146 return self.buffer[self.pointer:self.pointer+length] 147 148 def forward(self, length=1): 149 if self.pointer+length+1 >= len(self.buffer): 150 self.update(length+1) 151 for k in range(length): 152 ch = self.buffer[self.pointer] 153 self.pointer += 1 154 self.index += 1 155 if ch in u'\n\x85\u2028\u2029' \ 156 or (ch == u'\r' and self.buffer[self.pointer+1] != u'\n'): 22 157 self.line += 1 23 158 self.column = 0 24 el se:159 elif ch != u'\uFEFF': 25 160 self.column += 1 26 self.index += 127 return value28 161 29 162 def get_marker(self): 30 return Marker(self.source, self.data, self.index, self.line, self.column) 31 163 if self.stream is None: 164 return Marker(self.source, self.line, self.column, 165 self.buffer, self.pointer) 166 else: 167 return Marker(self.source, self.line, self.column, None, None) 168 169 def determine_encoding(self): 170 while not self.eof and len(self.raw_buffer) < 2: 171 self.update_raw() 172 if 
self.raw_buffer.startswith(codecs.BOM_UTF16_LE): 173 self.raw_decode = utf_16_le_decode 174 elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE): 175 self.raw_decode = utf_16_be_decode 176 else: 177 self.raw_decode = utf_8_decode 178 self.update(1) 179 180 NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]') 181 def check_printable(self, data): 182 match = self.NON_PRINTABLE.search(data) 183 if match: 184 character = match.group() 185 position = self.index+(len(self.buffer)-self.pointer)+match.start() 186 raise StreamError(self.source, 'unicode', character, position, 187 "control characters are not allowed") 188 189 def update(self, length): 190 if self.raw_buffer is None: 191 return 192 self.buffer = self.buffer[self.pointer:] 193 self.pointer = 0 194 while len(self.buffer) < length: 195 if not self.eof: 196 self.update_raw() 197 try: 198 data, converted = self.raw_decode(self.raw_buffer, 199 'strict', self.eof) 200 except UnicodeDecodeError, exc: 201 character = exc.object[exc.start] 202 if self.stream is not None: 203 position = self.stream_pointer-len(self.raw_buffer)+exc.start 204 else: 205 position = exc.start 206 raise StreamError(self.source, exc.encoding, 207 character, position, exc.reason) 208 self.check_printable(data) 209 self.buffer += data 210 self.raw_buffer = self.raw_buffer[converted:] 211 if self.eof: 212 self.buffer += u'\0' 213 self.raw_buffer = None 214 break 215 216 def update_raw(self, size=1024): 217 data = self.stream.read(size) 218 if data: 219 self.raw_buffer += data 220 self.stream_pointer += len(data) 221 else: 222 self.eof = True 223 224 #try: 225 # import psyco 226 # psyco.bind(Stream) 227 #except ImportError: 228 # pass 229 -
branches/pyyaml3000/tests/test_appliance.py
r44 r45 6 6 DATA = 'tests/data' 7 7 8 tests = {}8 all_tests = {} 9 9 for filename in os.listdir(DATA): 10 10 if os.path.isfile(os.path.join(DATA, filename)): 11 11 root, ext = os.path.splitext(filename) 12 tests.setdefault(root, []).append(ext)12 all_tests.setdefault(root, []).append(ext) 13 13 14 14 def add_tests(cls, method_name, *extensions): 15 for test in cls. tests:16 available_extensions = cls. tests[test]15 for test in cls.all_tests: 16 available_extensions = cls.all_tests[test] 17 17 for ext in extensions: 18 18 if ext not in available_extensions: … … 23 23 getattr(self, '_'+method_name)(test, *filenames) 24 24 test = test.replace('-', '_') 25 test_method.__name__ = '%s_%s' % (method_name, test) 25 try: 26 test_method.__name__ = '%s_%s' % (method_name, test) 27 except TypeError: 28 import new 29 test_method = new.function(test_method.func_code, test_method.func_globals, 30 '%s_%s' % (method_name, test), test_method.func_defaults, 31 test_method.func_closure) 26 32 setattr(cls, test_method.__name__, test_method) 27 33 add_tests = classmethod(add_tests) -
branches/pyyaml3000/tests/test_marker.py
r39 r45 2 2 import test_appliance 3 3 4 from yaml. markerimport Marker4 from yaml.stream import Marker 5 5 6 6 class TestMarker(test_appliance.TestAppliance): … … 19 19 column += 1 20 20 index += 1 21 for str_type in [str, unicode]: 22 marker = Marker(test_name, str_type(input), index, line, column) 23 snippet = marker.get_snippet() 24 #print "INPUT:" 25 #print input 26 #print "SNIPPET:" 27 #print snippet 28 self.failUnless(isinstance(snippet, str)) 29 self.failUnlessEqual(snippet.count('\n'), 2) 30 data, pointer, dummy = snippet.split('\n') 31 self.failUnless(len(data) < 80) 32 self.failUnlessEqual(data[len(pointer)-1], '*') 21 marker = Marker(test_name, line, column, unicode(input), index) 22 snippet = marker.get_snippet() 23 #print "INPUT:" 24 #print input 25 #print "SNIPPET:" 26 #print snippet 27 self.failUnless(isinstance(snippet, str)) 28 self.failUnlessEqual(snippet.count('\n'), 2) 29 data, pointer, dummy = snippet.split('\n') 30 self.failUnless(len(data) < 80) 31 self.failUnlessEqual(data[len(pointer)-1], '*') 33 32 34 33 TestMarker.add_tests('testMarkers', '.markers') -
branches/pyyaml3000/tests/test_yaml.py
r44 r45 3 3 4 4 from test_marker import * 5 from test_stream import * 5 6 from test_canonical import * 6 7 from test_tokens import *
Note: See TracChangeset for help on using the changeset viewer.
