scanner.py 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444
  1. # Scanner produces tokens of the following types:
  2. # STREAM-START
  3. # STREAM-END
  4. # DIRECTIVE(name, value)
  5. # DOCUMENT-START
  6. # DOCUMENT-END
  7. # BLOCK-SEQUENCE-START
  8. # BLOCK-MAPPING-START
  9. # BLOCK-END
  10. # FLOW-SEQUENCE-START
  11. # FLOW-MAPPING-START
  12. # FLOW-SEQUENCE-END
  13. # FLOW-MAPPING-END
  14. # BLOCK-ENTRY
  15. # FLOW-ENTRY
  16. # KEY
  17. # VALUE
  18. # ALIAS(value)
  19. # ANCHOR(value)
  20. # TAG(value)
  21. # SCALAR(value, plain, style)
  22. #
  23. # Read comments in the Scanner code for more details.
  24. #
  25. __all__ = ['Scanner', 'ScannerError']
  26. from .error import MarkedYAMLError
  27. from .tokens import *
  class ScannerError(MarkedYAMLError):
      """Error raised by the scanner when the input cannot be tokenized."""
  class SimpleKey:
      """Record of a position at which a simple key may start.

      Instances are stored by the scanner in `possible_simple_keys`, one per
      flow level, until the matching ':' is found or the candidate expires.
      """

      def __init__(self, token_number, required, index, line, column, mark):
          # Number of the token that would open the key, and whether a key
          # is mandatory at this position.
          self.token_number, self.required = token_number, required
          # Position of the candidate in the input.
          self.index, self.line, self.column = index, line, column
          # Mark used for the KEY token and for error reports.
          self.mark = mark
  39. class Scanner:
  40. def __init__(self):
  41. """Initialize the scanner."""
  42. # It is assumed that Scanner and Reader will have a common descendant.
  43. # Reader do the dirty work of checking for BOM and converting the
  44. # input data to Unicode. It also adds NUL to the end.
  45. #
  46. # Reader supports the following methods
  47. # self.peek(i=0) # peek the next i-th character
  48. # self.prefix(l=1) # peek the next l characters
  49. # self.forward(l=1) # read the next l characters and move the pointer.
  50. # Had we reached the end of the stream?
  51. self.done = False
  52. # The number of unclosed '{' and '['. `flow_level == 0` means block
  53. # context.
  54. self.flow_level = 0
  55. # List of processed tokens that are not yet emitted.
  56. self.tokens = []
  57. # Add the STREAM-START token.
  58. self.fetch_stream_start()
  59. # Number of tokens that were emitted through the `get_token` method.
  60. self.tokens_taken = 0
  61. # The current indentation level.
  62. self.indent = -1
  63. # Past indentation levels.
  64. self.indents = []
  65. # Variables related to simple keys treatment.
  66. # A simple key is a key that is not denoted by the '?' indicator.
  67. # Example of simple keys:
  68. # ---
  69. # block simple key: value
  70. # ? not a simple key:
  71. # : { flow simple key: value }
  72. # We emit the KEY token before all keys, so when we find a potential
  73. # simple key, we try to locate the corresponding ':' indicator.
  74. # Simple keys should be limited to a single line and 1024 characters.
  75. # Can a simple key start at the current position? A simple key may
  76. # start:
  77. # - at the beginning of the line, not counting indentation spaces
  78. # (in block context),
  79. # - after '{', '[', ',' (in the flow context),
  80. # - after '?', ':', '-' (in the block context).
  81. # In the block context, this flag also signifies if a block collection
  82. # may start at the current position.
  83. self.allow_simple_key = True
  84. # Keep track of possible simple keys. This is a dictionary. The key
  85. # is `flow_level`; there can be no more that one possible simple key
  86. # for each level. The value is a SimpleKey record:
  87. # (token_number, required, index, line, column, mark)
  88. # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
  89. # '[', or '{' tokens.
  90. self.possible_simple_keys = {}
  91. # Public methods.
  def check_token(self, *choices):
      """Tell whether the next token is an instance of one of `choices`.

      Called without arguments, report whether there is a next token at all.
      """
      # Keep scanning until the head of the queue is settled.
      while self.need_more_tokens():
          self.fetch_more_tokens()
      if not self.tokens:
          return False
      if not choices:
          return True
      head = self.tokens[0]
      return any(isinstance(head, choice) for choice in choices)
  def peek_token(self):
      """Return the next token without removing it from the queue.

      Returns None when the queue is empty.
      """
      while self.need_more_tokens():
          self.fetch_more_tokens()
      return self.tokens[0] if self.tokens else None
  def get_token(self):
      """Remove and return the next token, counting it as taken.

      Returns None when the queue is empty.
      """
      while self.need_more_tokens():
          self.fetch_more_tokens()
      if not self.tokens:
          return None
      self.tokens_taken += 1
      return self.tokens.pop(0)
  116. # Private methods.
  def need_more_tokens(self):
      """Tell whether more input must be scanned before a token is handed out.

      Returns:
          bool: False once the stream is finished; True when the queue is
          empty or when the token at the head of the queue may still turn
          out to be a simple key; False otherwise.
      """
      if self.done:
          return False
      if not self.tokens:
          return True
      # The head token may open a simple key; in that case we have to look
      # further ahead before it can be released.
      self.stale_possible_simple_keys()
      # Always answer with a real bool rather than falling off the end.
      return self.next_possible_simple_key() == self.tokens_taken
  def fetch_more_tokens(self):
      """Scan forward to the next token and append it to the queue.

      Raises:
          ScannerError: if the next character cannot start any token.
      """
      # Skip whitespace and comments, drop expired simple-key candidates,
      # then close any block collections that end at this column.
      self.scan_to_next_token()
      self.stale_possible_simple_keys()
      self.unwind_indent(self.column)

      char = self.peek()

      # End of stream.
      if char == '\0':
          return self.fetch_stream_end()

      # Directive.
      if char == '%' and self.check_directive():
          return self.fetch_directive()

      # Document start and document end markers.
      if char == '-' and self.check_document_start():
          return self.fetch_document_start()
      if char == '.' and self.check_document_end():
          return self.fetch_document_end()

      # TODO: support for BOM within a stream.
      #   if char == '\uFEFF': return self.fetch_bom()  <-- issue BOMToken

      # Note: the order of the following checks is NOT significant.

      # Flow collection indicators.
      if char == '[':
          return self.fetch_flow_sequence_start()
      if char == '{':
          return self.fetch_flow_mapping_start()
      if char == ']':
          return self.fetch_flow_sequence_end()
      if char == '}':
          return self.fetch_flow_mapping_end()
      if char == ',':
          return self.fetch_flow_entry()

      # Block entry, key and value indicators.
      if char == '-' and self.check_block_entry():
          return self.fetch_block_entry()
      if char == '?' and self.check_key():
          return self.fetch_key()
      if char == ':' and self.check_value():
          return self.fetch_value()

      # Alias, anchor, tag.
      if char == '*':
          return self.fetch_alias()
      if char == '&':
          return self.fetch_anchor()
      if char == '!':
          return self.fetch_tag()

      # Literal and folded scalars exist in block context only.
      if char == '|' and not self.flow_level:
          return self.fetch_literal()
      if char == '>' and not self.flow_level:
          return self.fetch_folded()

      # Quoted scalars.
      if char == '\'':
          return self.fetch_single()
      if char == '\"':
          return self.fetch_double()

      # Anything left has to be a plain scalar.
      if self.check_plain():
          return self.fetch_plain()

      # Nothing matched: report the offending character.
      raise ScannerError("while scanning for the next token", None,
              "found character %r that cannot start any token" % char,
              self.get_mark())
  205. # Simple keys treatment.
  def next_possible_simple_key(self):
      """Return the lowest token number among the simple-key candidates.

      Returns None when there are no candidates.
      """
      numbers = [key.token_number
                 for key in self.possible_simple_keys.values()]
      return min(numbers) if numbers else None
  def stale_possible_simple_keys(self):
      """Drop simple-key candidates that can no longer become keys.

      Per the YAML specification a simple key is limited to a single line
      and to 1024 characters. Without this step simple keys of any length
      and height would be accepted (which may cause problems if the
      indentation is broken).

      Raises:
          ScannerError: if an expired candidate was required.
      """
      # Iterate over a snapshot because entries are deleted along the way.
      for level, key in list(self.possible_simple_keys.items()):
          same_line = key.line == self.line
          if same_line and self.index - key.index <= 1024:
              continue
          if key.required:
              raise ScannerError("while scanning a simple key", key.mark,
                      "could not find expected ':'", self.get_mark())
          del self.possible_simple_keys[level]
  def save_possible_simple_key(self):
      """Remember the current position as a simple-key candidate.

      Called before ALIAS, ANCHOR, TAG, SCALAR(flow), '[' and '{', since
      each of them may open a simple key. Nothing is stored unless a simple
      key is currently allowed.
      """
      # A key is mandatory in block context when the token sits exactly at
      # the current indentation level.
      required = not self.flow_level and self.indent == self.column

      if not self.allow_simple_key:
          return

      # Replace whatever candidate was pending on this flow level.
      self.remove_possible_simple_key()
      upcoming = self.tokens_taken + len(self.tokens)
      self.possible_simple_keys[self.flow_level] = SimpleKey(
              upcoming, required,
              self.index, self.line, self.column, self.get_mark())
  def remove_possible_simple_key(self):
      """Forget the simple-key candidate saved for the current flow level.

      Raises:
          ScannerError: if that candidate was required.
      """
      level = self.flow_level
      if level not in self.possible_simple_keys:
          return
      pending = self.possible_simple_keys[level]
      if pending.required:
          raise ScannerError("while scanning a simple key", pending.mark,
                  "could not find expected ':'", self.get_mark())
      del self.possible_simple_keys[level]
  257. # Indentation functions.
  def unwind_indent(self, column):
      """Emit BLOCK-END for every indentation level deeper than `column`.

      In flow context indentation is ignored, which is less restrictive
      than the specification requires. A stricter check,

          if self.flow_level and self.indent > column:
              raise ScannerError(None, None,
                      "invalid indentation or unclosed '[' or '{'",
                      self.get_mark())

      is deliberately left out. By the spec the condition should even be
      `self.indent >= column`, which would reject intuitively correct
      input such as

          key : {
          }
      """
      if self.flow_level:
          return

      # Block context: pop levels until the indentation fits the column.
      while self.indent > column:
          here = self.get_mark()
          self.indent = self.indents.pop()
          self.tokens.append(BlockEndToken(here, here))
  def add_indent(self, column):
      """Push a new indentation level if `column` is deeper than the current.

      Returns True when a level was pushed and False otherwise.
      """
      if self.indent >= column:
          return False
      self.indents.append(self.indent)
      self.indent = column
      return True
  285. # Fetchers.
  def fetch_stream_start(self):
      """Queue the STREAM-START token.

      STREAM-START is always the first token and STREAM-END the last.
      """
      here = self.get_mark()
      start_token = StreamStartToken(here, here, encoding=self.encoding)
      self.tokens.append(start_token)
  def fetch_stream_end(self):
      """Queue the STREAM-END token and mark the scanner as done."""
      # Close every open block collection (indentation back to -1).
      self.unwind_indent(-1)

      # No simple key can follow the end of the stream.
      self.remove_possible_simple_key()
      self.allow_simple_key = False
      self.possible_simple_keys = {}

      here = self.get_mark()
      self.tokens.append(StreamEndToken(here, here))

      # The stream is finished.
      self.done = True
  def fetch_directive(self):
      """Queue the token produced by `scan_directive`."""
      # Close every open block collection (indentation back to -1).
      self.unwind_indent(-1)

      # A directive cannot be followed by a simple key.
      self.remove_possible_simple_key()
      self.allow_simple_key = False

      directive = self.scan_directive()
      self.tokens.append(directive)
  def fetch_document_start(self):
      """Queue a DOCUMENT-START token for the '---' indicator."""
      self.fetch_document_indicator(DocumentStartToken)
  def fetch_document_end(self):
      """Queue a DOCUMENT-END token for the '...' indicator."""
      self.fetch_document_indicator(DocumentEndToken)
  def fetch_document_indicator(self, TokenClass):
      """Consume a three-character document indicator and queue its token.

      `TokenClass` is called with the start and end marks to build the
      DOCUMENT-START or DOCUMENT-END token.
      """
      # Close every open block collection (indentation back to -1).
      self.unwind_indent(-1)

      # Reset simple keys; a block collection cannot start right after
      # '---'.
      self.remove_possible_simple_key()
      self.allow_simple_key = False

      begin = self.get_mark()
      self.forward(3)
      finish = self.get_mark()
      self.tokens.append(TokenClass(begin, finish))
  def fetch_flow_sequence_start(self):
      """Queue a FLOW-SEQUENCE-START token for '['."""
      self.fetch_flow_collection_start(FlowSequenceStartToken)
  def fetch_flow_mapping_start(self):
      """Queue a FLOW-MAPPING-START token for '{'."""
      self.fetch_flow_collection_start(FlowMappingStartToken)
  def fetch_flow_collection_start(self, TokenClass):
      """Consume '[' or '{', enter a deeper flow level and queue the token.

      `TokenClass` is called with the start and end marks to build the
      FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
      """
      # The collection itself may be a simple key, so record the candidate
      # before the flow level changes.
      self.save_possible_simple_key()
      self.flow_level += 1

      # Simple keys are allowed after '[' and '{'.
      self.allow_simple_key = True

      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(TokenClass(begin, finish))
  def fetch_flow_sequence_end(self):
      """Queue a FLOW-SEQUENCE-END token for ']'."""
      self.fetch_flow_collection_end(FlowSequenceEndToken)
  def fetch_flow_mapping_end(self):
      """Queue a FLOW-MAPPING-END token for '}'."""
      self.fetch_flow_collection_end(FlowMappingEndToken)
  def fetch_flow_collection_end(self, TokenClass):
      """Consume ']' or '}', leave the current flow level and queue the token.

      `TokenClass` is called with the start and end marks to build the
      FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
      """
      # Drop the candidate of the level being closed, then step out of it.
      self.remove_possible_simple_key()
      self.flow_level -= 1

      # No simple keys after ']' or '}'.
      self.allow_simple_key = False

      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(TokenClass(begin, finish))
  def fetch_flow_entry(self):
      """Consume ',' and queue a FLOW-ENTRY token."""
      # Simple keys are allowed after ','.
      self.allow_simple_key = True

      # Whatever candidate was pending on this level is no longer a key.
      self.remove_possible_simple_key()

      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(FlowEntryToken(begin, finish))
  def fetch_block_entry(self):
      """Consume '-' and queue a BLOCK-ENTRY token.

      In block context a BLOCK-SEQUENCE-START token is queued first when
      the entry opens a new indentation level.

      Raises:
          ScannerError: in block context, if a new entry may not start here.
      """
      if not self.flow_level:
          # A new entry may start only where a simple key may start.
          if not self.allow_simple_key:
              raise ScannerError(None, None,
                      "sequence entries are not allowed here",
                      self.get_mark())

          # Open a new block sequence if this column is deeper.
          if self.add_indent(self.column):
              here = self.get_mark()
              self.tokens.append(BlockSequenceStartToken(here, here))
      # A block entry inside a flow collection is an error, but it is left
      # for the parser to detect.

      # Simple keys are allowed after '-'.
      self.allow_simple_key = True

      # Whatever candidate was pending on this level is no longer a key.
      self.remove_possible_simple_key()

      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(BlockEntryToken(begin, finish))
  def fetch_key(self):
      """Consume '?' and queue a KEY token.

      In block context a BLOCK-MAPPING-START token is queued first when
      the key opens a new indentation level.

      Raises:
          ScannerError: in block context, if a key may not start here.
      """
      in_block = not self.flow_level

      if in_block:
          # A key (not necessarily a simple one) may start only where a
          # simple key may start.
          if not self.allow_simple_key:
              raise ScannerError(None, None,
                      "mapping keys are not allowed here",
                      self.get_mark())

          # Open a new block mapping if this column is deeper.
          if self.add_indent(self.column):
              here = self.get_mark()
              self.tokens.append(BlockMappingStartToken(here, here))

      # Simple keys are allowed after '?' in the block context only.
      self.allow_simple_key = in_block

      # Whatever candidate was pending on this level is no longer a key.
      self.remove_possible_simple_key()

      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(KeyToken(begin, finish))
  def fetch_value(self):
      """Consume ':' and queue a VALUE token.

      If a simple-key candidate is pending on the current flow level, the
      ':' confirms it: a KEY token (and, in block context, possibly a
      BLOCK-MAPPING-START token) is inserted at the candidate's position.
      Otherwise the ':' belongs to a complex key.

      Raises:
          ScannerError: in block context, if no simple key was pending and
              a value may not start here.
      """
      in_block = not self.flow_level

      if self.flow_level in self.possible_simple_keys:
          # The pending candidate really is a simple key.
          key = self.possible_simple_keys.pop(self.flow_level)
          position = key.token_number - self.tokens_taken
          self.tokens.insert(position, KeyToken(key.mark, key.mark))

          # A key that opens a new block mapping also needs
          # BLOCK-MAPPING-START in front of it.
          if in_block and self.add_indent(key.column):
              self.tokens.insert(position,
                      BlockMappingStartToken(key.mark, key.mark))

          # There cannot be two simple keys one after another.
          self.allow_simple_key = False
      else:
          # Part of a complex key. In block context a complex value may
          # start if and only if a simple key may start. (The parser would
          # catch this anyway.)
          if in_block and not self.allow_simple_key:
              raise ScannerError(None, None,
                      "mapping values are not allowed here",
                      self.get_mark())

          # A value that opens a new block mapping gets
          # BLOCK-MAPPING-START; the parser reports it as an error later.
          if in_block and self.add_indent(self.column):
              here = self.get_mark()
              self.tokens.append(BlockMappingStartToken(here, here))

          # Simple keys are allowed after ':' in the block context only.
          self.allow_simple_key = in_block

          # Whatever candidate was pending on this level is no longer a
          # key.
          self.remove_possible_simple_key()

      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(ValueToken(begin, finish))
  def fetch_alias(self):
      """Scan an alias and queue its token."""
      # An alias may itself be a simple key ...
      self.save_possible_simple_key()
      # ... but no simple key may follow it.
      self.allow_simple_key = False

      alias = self.scan_anchor(AliasToken)
      self.tokens.append(alias)
  def fetch_anchor(self):
      """Scan an anchor and queue its token."""
      # An anchor may open a simple key ...
      self.save_possible_simple_key()
      # ... but no simple key may follow it.
      self.allow_simple_key = False

      anchor = self.scan_anchor(AnchorToken)
      self.tokens.append(anchor)
  def fetch_tag(self):
      """Scan a tag and queue the token returned by `scan_tag`."""
      # A tag may open a simple key ...
      self.save_possible_simple_key()
      # ... but no simple key may follow it.
      self.allow_simple_key = False

      tag = self.scan_tag()
      self.tokens.append(tag)
  def fetch_literal(self):
      """Queue a block scalar in literal ('|') style."""
      self.fetch_block_scalar(style='|')
  def fetch_folded(self):
      """Queue a block scalar in folded ('>') style."""
      self.fetch_block_scalar(style='>')
  def fetch_block_scalar(self, style):
      """Scan a block scalar of the given `style` and queue its token."""
      # A simple key may follow a block scalar.
      self.allow_simple_key = True

      # Whatever candidate was pending on this level is no longer a key.
      self.remove_possible_simple_key()

      scalar = self.scan_block_scalar(style)
      self.tokens.append(scalar)
  def fetch_single(self):
      """Queue a single-quoted flow scalar."""
      self.fetch_flow_scalar(style='\'')
  def fetch_double(self):
      """Queue a double-quoted flow scalar."""
      self.fetch_flow_scalar(style='"')
  def fetch_flow_scalar(self, style):
      """Scan a quoted scalar of the given `style` and queue its token."""
      # A flow scalar may be a simple key ...
      self.save_possible_simple_key()
      # ... but no simple key may follow it.
      self.allow_simple_key = False

      scalar = self.scan_flow_scalar(style)
      self.tokens.append(scalar)
  def fetch_plain(self):
      """Scan a plain scalar and queue its token."""
      # A plain scalar may be a simple key.
      self.save_possible_simple_key()

      # No simple key may follow a plain scalar. `scan_plain` switches the
      # flag back on if the scan finishes at the beginning of a line.
      self.allow_simple_key = False

      scalar = self.scan_plain()
      self.tokens.append(scalar)
  515. # Checkers.
  def check_directive(self):
      """Tell whether the '%' at the current position starts a directive.

      DIRECTIVE: ^ '%' ...

      The caller has already checked the '%' indicator, so only the column
      is checked here.

      Returns:
          bool: True if the current column is 0, False otherwise.
      """
      return self.column == 0
  def check_document_start(self):
      """Tell whether a document start marker begins at the current position.

      DOCUMENT-START: ^ '---' (' '|'\n')

      Returns:
          bool: True if the column is 0, the next three characters are
          '---', and the character after them is whitespace, a line break
          or NUL; False otherwise.
      """
      return (self.column == 0
              and self.prefix(3) == '---'
              and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029')
  def check_document_end(self):
      """Tell whether a document end marker begins at the current position.

      DOCUMENT-END: ^ '...' (' '|'\n')

      Returns:
          bool: True if the column is 0, the next three characters are
          '...', and the character after them is whitespace, a line break
          or NUL; False otherwise.
      """
      return (self.column == 0
              and self.prefix(3) == '...'
              and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029')
  def check_block_entry(self):
      """Tell whether the '-' at the current position is a block entry.

      BLOCK-ENTRY: '-' (' '|'\n')
      """
      follower = self.peek(1)
      return follower in '\0 \t\r\n\x85\u2028\u2029'
  def check_key(self):
      """Tell whether the '?' at the current position is a key indicator.

      KEY(flow context):  '?'
      KEY(block context): '?' (' '|'\n')
      """
      if self.flow_level:
          return True
      return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
  def check_value(self):
      """Tell whether the ':' at the current position is a value indicator.

      VALUE(flow context):  ':'
      VALUE(block context): ':' (' '|'\n')
      """
      if self.flow_level:
          return True
      return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
  550. def check_plain(self):
  551. # A plain scalar may start with any non-space character except:
  552. # '-', '?', ':', ',', '[', ']', '{', '}',
  553. # '#', '&', '*', '!', '|', '>', '\'', '\"',
  554. # '%', '@', '`'.
  555. #
  556. # It may also start with
  557. # '-', '?', ':'
  558. # if it is followed by a non-space character.
  559. #
  560. # Note that we limit the last rule to the block context (except the
  561. # '-' character) because we want the flow context to be space
  562. # independent.
  563. ch = self.peek()
  564. return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
  565. or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
  566. and (ch == '-' or (not self.flow_level and ch in '?:')))
  567. # Scanners.
  def scan_to_next_token(self):
      """Skip spaces, comments and line breaks up to the next token.

      A line break found in block context switches `allow_simple_key` on.

      A byte order mark is stripped only when it is the very first
      character of the stream. BOMs inside the stream are not supported
      yet, although the specification requires it; such a mark is treated
      as part of the document.

      TODO: make the tab handling rules more sane. A good rule is that
      tabs cannot precede the tokens
          BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
          KEY(block), VALUE(block), BLOCK-ENTRY
      so the check would be
          if <TAB>:
              self.allow_simple_keys = False
      `unwind_indent` would also have to check `allow_simple_keys == True`
      before issuing BLOCK-END, and the scanners for block, flow and plain
      scalars would need to be modified.
      """
      if self.index == 0 and self.peek() == '\uFEFF':
          self.forward()

      while True:
          # Leading spaces.
          while self.peek() == ' ':
              self.forward()

          # A comment runs to the end of the line.
          if self.peek() == '#':
              while self.peek() not in '\0\r\n\x85\u2028\u2029':
                  self.forward()

          # Without a line break here, the next token has been reached.
          if not self.scan_line_break():
              break
          if not self.flow_level:
              self.allow_simple_key = True
  def scan_directive(self):
      """Scan a whole directive line and return a DirectiveToken.

      See the specification for details. 'YAML' and 'TAG' directives have
      their value parsed; for any other name the value is None and the
      rest of the line is skipped.
      """
      start_mark = self.get_mark()
      # Step over the '%' indicator.
      self.forward()
      name = self.scan_directive_name(start_mark)

      if name == 'YAML':
          value = self.scan_yaml_directive_value(start_mark)
      elif name == 'TAG':
          value = self.scan_tag_directive_value(start_mark)
      else:
          value = None
      end_mark = self.get_mark()

      if value is None and name not in ('YAML', 'TAG'):
          # Unknown directive: ignore everything up to the line break.
          while self.peek() not in '\0\r\n\x85\u2028\u2029':
              self.forward()

      self.scan_directive_ignored_line(start_mark)
      return DirectiveToken(name, value, start_mark, end_mark)
  def scan_directive_name(self, start_mark):
      """Scan and return the name of a directive.

      See the specification for details. The name is a run of ASCII
      letters, digits, '-' and '_'.

      Raises:
          ScannerError: if the name is empty or is not followed by a
              space, a line break or NUL.
      """
      length = 0
      while True:
          ch = self.peek(length)
          is_name_char = ('0' <= ch <= '9' or 'A' <= ch <= 'Z'
                          or 'a' <= ch <= 'z' or ch in '-_')
          if not is_name_char:
              break
          length += 1

      if not length:
          raise ScannerError("while scanning a directive", start_mark,
                  "expected alphabetic or numeric character, but found %r"
                  % ch, self.get_mark())

      name = self.prefix(length)
      self.forward(length)

      follower = self.peek()
      if follower not in '\0 \r\n\x85\u2028\u2029':
          raise ScannerError("while scanning a directive", start_mark,
                  "expected alphabetic or numeric character, but found %r"
                  % follower, self.get_mark())
      return name
  def scan_yaml_directive_value(self, start_mark):
      """Scan the version of a %YAML directive; return `(major, minor)`.

      See the specification for details.

      Raises:
          ScannerError: if the two numbers are not separated by '.', or
              the version is not followed by a space, a line break or NUL.
      """
      while self.peek() == ' ':
          self.forward()

      major = self.scan_yaml_directive_number(start_mark)

      separator = self.peek()
      if separator != '.':
          raise ScannerError("while scanning a directive", start_mark,
                  "expected a digit or '.', but found %r" % separator,
                  self.get_mark())
      self.forward()

      minor = self.scan_yaml_directive_number(start_mark)

      follower = self.peek()
      if follower not in '\0 \r\n\x85\u2028\u2029':
          raise ScannerError("while scanning a directive", start_mark,
                  "expected a digit or ' ', but found %r" % follower,
                  self.get_mark())
      return (major, minor)
  def scan_yaml_directive_number(self, start_mark):
      """Scan a run of ASCII digits and return it as an int.

      See the specification for details.

      Raises:
          ScannerError: if the current character is not a digit.
      """
      first = self.peek()
      if first < '0' or first > '9':
          raise ScannerError("while scanning a directive", start_mark,
                  "expected a digit, but found %r" % first, self.get_mark())

      # Measure the run of digits before consuming it.
      length = 0
      while '0' <= self.peek(length) <= '9':
          length += 1

      number = int(self.prefix(length))
      self.forward(length)
      return number
  def scan_tag_directive_value(self, start_mark):
      """Scan the value of a %TAG directive; return `(handle, prefix)`.

      See the specification for details.
      """
      # Spaces between the directive name and the handle.
      while self.peek() == ' ':
          self.forward()
      handle = self.scan_tag_directive_handle(start_mark)

      # Spaces between the handle and the prefix.
      while self.peek() == ' ':
          self.forward()
      prefix = self.scan_tag_directive_prefix(start_mark)

      return (handle, prefix)
  def scan_tag_directive_handle(self, start_mark):
      """Scan the handle of a %TAG directive and return it.

      See the specification for details.

      Raises:
          ScannerError: if the handle is not followed by a space.
      """
      handle = self.scan_tag_handle('directive', start_mark)
      follower = self.peek()
      if follower != ' ':
          raise ScannerError("while scanning a directive", start_mark,
                  "expected ' ', but found %r" % follower, self.get_mark())
      return handle
  684. def scan_tag_directive_prefix(self, start_mark):
  685. # See the specification for details.
  686. value = self.scan_tag_uri('directive', start_mark)
  687. ch = self.peek()
  688. if ch not in '\0 \r\n\x85\u2028\u2029':
  689. raise ScannerError("while scanning a directive", start_mark,
  690. "expected ' ', but found %r" % ch, self.get_mark())
  691. return value
  692. def scan_directive_ignored_line(self, start_mark):
  693. # See the specification for details.
  694. while self.peek() == ' ':
  695. self.forward()
  696. if self.peek() == '#':
  697. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  698. self.forward()
  699. ch = self.peek()
  700. if ch not in '\0\r\n\x85\u2028\u2029':
  701. raise ScannerError("while scanning a directive", start_mark,
  702. "expected a comment or a line break, but found %r"
  703. % ch, self.get_mark())
  704. self.scan_line_break()
  705. def scan_anchor(self, TokenClass):
  706. # The specification does not restrict characters for anchors and
  707. # aliases. This may lead to problems, for instance, the document:
  708. # [ *alias, value ]
  709. # can be interpteted in two ways, as
  710. # [ "value" ]
  711. # and
  712. # [ *alias , "value" ]
  713. # Therefore we restrict aliases to numbers and ASCII letters.
  714. start_mark = self.get_mark()
  715. indicator = self.peek()
  716. if indicator == '*':
  717. name = 'alias'
  718. else:
  719. name = 'anchor'
  720. self.forward()
  721. length = 0
  722. ch = self.peek(length)
  723. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  724. or ch in '-_':
  725. length += 1
  726. ch = self.peek(length)
  727. if not length:
  728. raise ScannerError("while scanning an %s" % name, start_mark,
  729. "expected alphabetic or numeric character, but found %r"
  730. % ch, self.get_mark())
  731. value = self.prefix(length)
  732. self.forward(length)
  733. ch = self.peek()
  734. if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
  735. raise ScannerError("while scanning an %s" % name, start_mark,
  736. "expected alphabetic or numeric character, but found %r"
  737. % ch, self.get_mark())
  738. end_mark = self.get_mark()
  739. return TokenClass(value, start_mark, end_mark)
  740. def scan_tag(self):
  741. # See the specification for details.
  742. start_mark = self.get_mark()
  743. ch = self.peek(1)
  744. if ch == '<':
  745. handle = None
  746. self.forward(2)
  747. suffix = self.scan_tag_uri('tag', start_mark)
  748. if self.peek() != '>':
  749. raise ScannerError("while parsing a tag", start_mark,
  750. "expected '>', but found %r" % self.peek(),
  751. self.get_mark())
  752. self.forward()
  753. elif ch in '\0 \t\r\n\x85\u2028\u2029':
  754. handle = None
  755. suffix = '!'
  756. self.forward()
  757. else:
  758. length = 1
  759. use_handle = False
  760. while ch not in '\0 \r\n\x85\u2028\u2029':
  761. if ch == '!':
  762. use_handle = True
  763. break
  764. length += 1
  765. ch = self.peek(length)
  766. handle = '!'
  767. if use_handle:
  768. handle = self.scan_tag_handle('tag', start_mark)
  769. else:
  770. handle = '!'
  771. self.forward()
  772. suffix = self.scan_tag_uri('tag', start_mark)
  773. ch = self.peek()
  774. if ch not in '\0 \r\n\x85\u2028\u2029':
  775. raise ScannerError("while scanning a tag", start_mark,
  776. "expected ' ', but found %r" % ch, self.get_mark())
  777. value = (handle, suffix)
  778. end_mark = self.get_mark()
  779. return TagToken(value, start_mark, end_mark)
  780. def scan_block_scalar(self, style):
  781. # See the specification for details.
  782. if style == '>':
  783. folded = True
  784. else:
  785. folded = False
  786. chunks = []
  787. start_mark = self.get_mark()
  788. # Scan the header.
  789. self.forward()
  790. chomping, increment = self.scan_block_scalar_indicators(start_mark)
  791. self.scan_block_scalar_ignored_line(start_mark)
  792. # Determine the indentation level and go to the first non-empty line.
  793. min_indent = self.indent+1
  794. if min_indent < 1:
  795. min_indent = 1
  796. if increment is None:
  797. breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
  798. indent = max(min_indent, max_indent)
  799. else:
  800. indent = min_indent+increment-1
  801. breaks, end_mark = self.scan_block_scalar_breaks(indent)
  802. line_break = ''
  803. # Scan the inner part of the block scalar.
  804. while self.column == indent and self.peek() != '\0':
  805. chunks.extend(breaks)
  806. leading_non_space = self.peek() not in ' \t'
  807. length = 0
  808. while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
  809. length += 1
  810. chunks.append(self.prefix(length))
  811. self.forward(length)
  812. line_break = self.scan_line_break()
  813. breaks, end_mark = self.scan_block_scalar_breaks(indent)
  814. if self.column == indent and self.peek() != '\0':
  815. # Unfortunately, folding rules are ambiguous.
  816. #
  817. # This is the folding according to the specification:
  818. if folded and line_break == '\n' \
  819. and leading_non_space and self.peek() not in ' \t':
  820. if not breaks:
  821. chunks.append(' ')
  822. else:
  823. chunks.append(line_break)
  824. # This is Clark Evans's interpretation (also in the spec
  825. # examples):
  826. #
  827. #if folded and line_break == '\n':
  828. # if not breaks:
  829. # if self.peek() not in ' \t':
  830. # chunks.append(' ')
  831. # else:
  832. # chunks.append(line_break)
  833. #else:
  834. # chunks.append(line_break)
  835. else:
  836. break
  837. # Chomp the tail.
  838. if chomping is not False:
  839. chunks.append(line_break)
  840. if chomping is True:
  841. chunks.extend(breaks)
  842. # We are done.
  843. return ScalarToken(''.join(chunks), False, start_mark, end_mark,
  844. style)
  845. def scan_block_scalar_indicators(self, start_mark):
  846. # See the specification for details.
  847. chomping = None
  848. increment = None
  849. ch = self.peek()
  850. if ch in '+-':
  851. if ch == '+':
  852. chomping = True
  853. else:
  854. chomping = False
  855. self.forward()
  856. ch = self.peek()
  857. if ch in '0123456789':
  858. increment = int(ch)
  859. if increment == 0:
  860. raise ScannerError("while scanning a block scalar", start_mark,
  861. "expected indentation indicator in the range 1-9, but found 0",
  862. self.get_mark())
  863. self.forward()
  864. elif ch in '0123456789':
  865. increment = int(ch)
  866. if increment == 0:
  867. raise ScannerError("while scanning a block scalar", start_mark,
  868. "expected indentation indicator in the range 1-9, but found 0",
  869. self.get_mark())
  870. self.forward()
  871. ch = self.peek()
  872. if ch in '+-':
  873. if ch == '+':
  874. chomping = True
  875. else:
  876. chomping = False
  877. self.forward()
  878. ch = self.peek()
  879. if ch not in '\0 \r\n\x85\u2028\u2029':
  880. raise ScannerError("while scanning a block scalar", start_mark,
  881. "expected chomping or indentation indicators, but found %r"
  882. % ch, self.get_mark())
  883. return chomping, increment
  884. def scan_block_scalar_ignored_line(self, start_mark):
  885. # See the specification for details.
  886. while self.peek() == ' ':
  887. self.forward()
  888. if self.peek() == '#':
  889. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  890. self.forward()
  891. ch = self.peek()
  892. if ch not in '\0\r\n\x85\u2028\u2029':
  893. raise ScannerError("while scanning a block scalar", start_mark,
  894. "expected a comment or a line break, but found %r" % ch,
  895. self.get_mark())
  896. self.scan_line_break()
  897. def scan_block_scalar_indentation(self):
  898. # See the specification for details.
  899. chunks = []
  900. max_indent = 0
  901. end_mark = self.get_mark()
  902. while self.peek() in ' \r\n\x85\u2028\u2029':
  903. if self.peek() != ' ':
  904. chunks.append(self.scan_line_break())
  905. end_mark = self.get_mark()
  906. else:
  907. self.forward()
  908. if self.column > max_indent:
  909. max_indent = self.column
  910. return chunks, max_indent, end_mark
  911. def scan_block_scalar_breaks(self, indent):
  912. # See the specification for details.
  913. chunks = []
  914. end_mark = self.get_mark()
  915. while self.column < indent and self.peek() == ' ':
  916. self.forward()
  917. while self.peek() in '\r\n\x85\u2028\u2029':
  918. chunks.append(self.scan_line_break())
  919. end_mark = self.get_mark()
  920. while self.column < indent and self.peek() == ' ':
  921. self.forward()
  922. return chunks, end_mark
  923. def scan_flow_scalar(self, style):
  924. # See the specification for details.
  925. # Note that we loose indentation rules for quoted scalars. Quoted
  926. # scalars don't need to adhere indentation because " and ' clearly
  927. # mark the beginning and the end of them. Therefore we are less
  928. # restrictive then the specification requires. We only need to check
  929. # that document separators are not included in scalars.
  930. if style == '"':
  931. double = True
  932. else:
  933. double = False
  934. chunks = []
  935. start_mark = self.get_mark()
  936. quote = self.peek()
  937. self.forward()
  938. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  939. while self.peek() != quote:
  940. chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
  941. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  942. self.forward()
  943. end_mark = self.get_mark()
  944. return ScalarToken(''.join(chunks), False, start_mark, end_mark,
  945. style)
  946. ESCAPE_REPLACEMENTS = {
  947. '0': '\0',
  948. 'a': '\x07',
  949. 'b': '\x08',
  950. 't': '\x09',
  951. '\t': '\x09',
  952. 'n': '\x0A',
  953. 'v': '\x0B',
  954. 'f': '\x0C',
  955. 'r': '\x0D',
  956. 'e': '\x1B',
  957. ' ': '\x20',
  958. '\"': '\"',
  959. '\\': '\\',
  960. 'N': '\x85',
  961. '_': '\xA0',
  962. 'L': '\u2028',
  963. 'P': '\u2029',
  964. }
  965. ESCAPE_CODES = {
  966. 'x': 2,
  967. 'u': 4,
  968. 'U': 8,
  969. }
  970. def scan_flow_scalar_non_spaces(self, double, start_mark):
  971. # See the specification for details.
  972. chunks = []
  973. while True:
  974. length = 0
  975. while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
  976. length += 1
  977. if length:
  978. chunks.append(self.prefix(length))
  979. self.forward(length)
  980. ch = self.peek()
  981. if not double and ch == '\'' and self.peek(1) == '\'':
  982. chunks.append('\'')
  983. self.forward(2)
  984. elif (double and ch == '\'') or (not double and ch in '\"\\'):
  985. chunks.append(ch)
  986. self.forward()
  987. elif double and ch == '\\':
  988. self.forward()
  989. ch = self.peek()
  990. if ch in self.ESCAPE_REPLACEMENTS:
  991. chunks.append(self.ESCAPE_REPLACEMENTS[ch])
  992. self.forward()
  993. elif ch in self.ESCAPE_CODES:
  994. length = self.ESCAPE_CODES[ch]
  995. self.forward()
  996. for k in range(length):
  997. if self.peek(k) not in '0123456789ABCDEFabcdef':
  998. raise ScannerError("while scanning a double-quoted scalar", start_mark,
  999. "expected escape sequence of %d hexdecimal numbers, but found %r" %
  1000. (length, self.peek(k)), self.get_mark())
  1001. code = int(self.prefix(length), 16)
  1002. chunks.append(chr(code))
  1003. self.forward(length)
  1004. elif ch in '\r\n\x85\u2028\u2029':
  1005. self.scan_line_break()
  1006. chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
  1007. else:
  1008. raise ScannerError("while scanning a double-quoted scalar", start_mark,
  1009. "found unknown escape character %r" % ch, self.get_mark())
  1010. else:
  1011. return chunks
  1012. def scan_flow_scalar_spaces(self, double, start_mark):
  1013. # See the specification for details.
  1014. chunks = []
  1015. length = 0
  1016. while self.peek(length) in ' \t':
  1017. length += 1
  1018. whitespaces = self.prefix(length)
  1019. self.forward(length)
  1020. ch = self.peek()
  1021. if ch == '\0':
  1022. raise ScannerError("while scanning a quoted scalar", start_mark,
  1023. "found unexpected end of stream", self.get_mark())
  1024. elif ch in '\r\n\x85\u2028\u2029':
  1025. line_break = self.scan_line_break()
  1026. breaks = self.scan_flow_scalar_breaks(double, start_mark)
  1027. if line_break != '\n':
  1028. chunks.append(line_break)
  1029. elif not breaks:
  1030. chunks.append(' ')
  1031. chunks.extend(breaks)
  1032. else:
  1033. chunks.append(whitespaces)
  1034. return chunks
  1035. def scan_flow_scalar_breaks(self, double, start_mark):
  1036. # See the specification for details.
  1037. chunks = []
  1038. while True:
  1039. # Instead of checking indentation, we check for document
  1040. # separators.
  1041. prefix = self.prefix(3)
  1042. if (prefix == '---' or prefix == '...') \
  1043. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1044. raise ScannerError("while scanning a quoted scalar", start_mark,
  1045. "found unexpected document separator", self.get_mark())
  1046. while self.peek() in ' \t':
  1047. self.forward()
  1048. if self.peek() in '\r\n\x85\u2028\u2029':
  1049. chunks.append(self.scan_line_break())
  1050. else:
  1051. return chunks
  1052. def scan_plain(self):
  1053. # See the specification for details.
  1054. # We add an additional restriction for the flow context:
  1055. # plain scalars in the flow context cannot contain ',', ':' and '?'.
  1056. # We also keep track of the `allow_simple_key` flag here.
  1057. # Indentation rules are loosed for the flow context.
  1058. chunks = []
  1059. start_mark = self.get_mark()
  1060. end_mark = start_mark
  1061. indent = self.indent+1
  1062. # We allow zero indentation for scalars, but then we need to check for
  1063. # document separators at the beginning of the line.
  1064. #if indent == 0:
  1065. # indent = 1
  1066. spaces = []
  1067. while True:
  1068. length = 0
  1069. if self.peek() == '#':
  1070. break
  1071. while True:
  1072. ch = self.peek(length)
  1073. if ch in '\0 \t\r\n\x85\u2028\u2029' \
  1074. or (not self.flow_level and ch == ':' and
  1075. self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029') \
  1076. or (self.flow_level and ch in ',:?[]{}'):
  1077. break
  1078. length += 1
  1079. # It's not clear what we should do with ':' in the flow context.
  1080. if (self.flow_level and ch == ':'
  1081. and self.peek(length+1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}'):
  1082. self.forward(length)
  1083. raise ScannerError("while scanning a plain scalar", start_mark,
  1084. "found unexpected ':'", self.get_mark(),
  1085. "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
  1086. if length == 0:
  1087. break
  1088. self.allow_simple_key = False
  1089. chunks.extend(spaces)
  1090. chunks.append(self.prefix(length))
  1091. self.forward(length)
  1092. end_mark = self.get_mark()
  1093. spaces = self.scan_plain_spaces(indent, start_mark)
  1094. if not spaces or self.peek() == '#' \
  1095. or (not self.flow_level and self.column < indent):
  1096. break
  1097. return ScalarToken(''.join(chunks), True, start_mark, end_mark)
  1098. def scan_plain_spaces(self, indent, start_mark):
  1099. # See the specification for details.
  1100. # The specification is really confusing about tabs in plain scalars.
  1101. # We just forbid them completely. Do not use tabs in YAML!
  1102. chunks = []
  1103. length = 0
  1104. while self.peek(length) in ' ':
  1105. length += 1
  1106. whitespaces = self.prefix(length)
  1107. self.forward(length)
  1108. ch = self.peek()
  1109. if ch in '\r\n\x85\u2028\u2029':
  1110. line_break = self.scan_line_break()
  1111. self.allow_simple_key = True
  1112. prefix = self.prefix(3)
  1113. if (prefix == '---' or prefix == '...') \
  1114. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1115. return
  1116. breaks = []
  1117. while self.peek() in ' \r\n\x85\u2028\u2029':
  1118. if self.peek() == ' ':
  1119. self.forward()
  1120. else:
  1121. breaks.append(self.scan_line_break())
  1122. prefix = self.prefix(3)
  1123. if (prefix == '---' or prefix == '...') \
  1124. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1125. return
  1126. if line_break != '\n':
  1127. chunks.append(line_break)
  1128. elif not breaks:
  1129. chunks.append(' ')
  1130. chunks.extend(breaks)
  1131. elif whitespaces:
  1132. chunks.append(whitespaces)
  1133. return chunks
  1134. def scan_tag_handle(self, name, start_mark):
  1135. # See the specification for details.
  1136. # For some strange reasons, the specification does not allow '_' in
  1137. # tag handles. I have allowed it anyway.
  1138. ch = self.peek()
  1139. if ch != '!':
  1140. raise ScannerError("while scanning a %s" % name, start_mark,
  1141. "expected '!', but found %r" % ch, self.get_mark())
  1142. length = 1
  1143. ch = self.peek(length)
  1144. if ch != ' ':
  1145. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  1146. or ch in '-_':
  1147. length += 1
  1148. ch = self.peek(length)
  1149. if ch != '!':
  1150. self.forward(length)
  1151. raise ScannerError("while scanning a %s" % name, start_mark,
  1152. "expected '!', but found %r" % ch, self.get_mark())
  1153. length += 1
  1154. value = self.prefix(length)
  1155. self.forward(length)
  1156. return value
  1157. def scan_tag_uri(self, name, start_mark):
  1158. # See the specification for details.
  1159. # Note: we do not check if URI is well-formed.
  1160. chunks = []
  1161. length = 0
  1162. ch = self.peek(length)
  1163. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  1164. or ch in '-;/?:@&=+$,_.!~*\'()[]%':
  1165. if ch == '%':
  1166. chunks.append(self.prefix(length))
  1167. self.forward(length)
  1168. length = 0
  1169. chunks.append(self.scan_uri_escapes(name, start_mark))
  1170. else:
  1171. length += 1
  1172. ch = self.peek(length)
  1173. if length:
  1174. chunks.append(self.prefix(length))
  1175. self.forward(length)
  1176. length = 0
  1177. if not chunks:
  1178. raise ScannerError("while parsing a %s" % name, start_mark,
  1179. "expected URI, but found %r" % ch, self.get_mark())
  1180. return ''.join(chunks)
  1181. def scan_uri_escapes(self, name, start_mark):
  1182. # See the specification for details.
  1183. codes = []
  1184. mark = self.get_mark()
  1185. while self.peek() == '%':
  1186. self.forward()
  1187. for k in range(2):
  1188. if self.peek(k) not in '0123456789ABCDEFabcdef':
  1189. raise ScannerError("while scanning a %s" % name, start_mark,
  1190. "expected URI escape sequence of 2 hexdecimal numbers, but found %r"
  1191. % self.peek(k), self.get_mark())
  1192. codes.append(int(self.prefix(2), 16))
  1193. self.forward(2)
  1194. try:
  1195. value = bytes(codes).decode('utf-8')
  1196. except UnicodeDecodeError as exc:
  1197. raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
  1198. return value
  1199. def scan_line_break(self):
  1200. # Transforms:
  1201. # '\r\n' : '\n'
  1202. # '\r' : '\n'
  1203. # '\n' : '\n'
  1204. # '\x85' : '\n'
  1205. # '\u2028' : '\u2028'
  1206. # '\u2029 : '\u2029'
  1207. # default : ''
  1208. ch = self.peek()
  1209. if ch in '\r\n\x85':
  1210. if self.prefix(2) == '\r\n':
  1211. self.forward(2)
  1212. else:
  1213. self.forward()
  1214. return '\n'
  1215. elif ch in '\u2028\u2029':
  1216. self.forward()
  1217. return ch
  1218. return ''
  1219. #try:
  1220. # import psyco
  1221. # psyco.bind(Scanner)
  1222. #except ImportError:
  1223. # pass