generate-test-data.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. #!/usr/bin/env python
  2. import re
  3. import json
  4. # https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
  5. # http://stackoverflow.com/a/13436167/96656
  try:
      _unichr = unichr  # Python 2
  except NameError:
      _unichr = chr  # Python 3: chr() covers the whole code point range


  def unisymbol(codePoint):
      """Return the UTF-16 style string for a Unicode code point.

      Code points in the Basic Multilingual Plane (U+0000..U+FFFF) map to a
      single code unit.  Supplementary code points (U+10000..U+10FFFF) are
      split into a high/low surrogate pair, the way JavaScript strings
      represent them.

      Args:
          codePoint: Integer code point.

      Returns:
          A one-character string for BMP code points, a two-character
          surrogate pair for supplementary code points, or the literal
          string ``'Error'`` when ``codePoint`` is outside U+0000..U+10FFFF.
      """
      if 0x0000 <= codePoint <= 0xFFFF:
          return _unichr(codePoint)
      if 0x10000 <= codePoint <= 0x10FFFF:
          # The 20-bit offset splits into two 10-bit halves: the upper half
          # selects the high surrogate, the lower half the low surrogate.
          # divmod keeps this in exact integer arithmetic on Python 2 and 3.
          upperBits, lowerBits = divmod(codePoint - 0x10000, 0x400)
          highSurrogate = 0xD800 + upperBits
          lowSurrogate = 0xDC00 + lowerBits
          return _unichr(highSurrogate) + _unichr(lowSurrogate)
      # Out-of-range input is reported with a sentinel string rather than an
      # exception, so callers comparing against 'Error' keep working.
      return 'Error'
  def hexify(codePoint):
      """Format a code point in ``U+XXXXXX`` notation.

      Args:
          codePoint: Non-negative integer code point.

      Returns:
          ``'U+'`` followed by the value in upper-case hexadecimal,
          zero-padded to at least six digits (for example ``'U+000041'``).
      """
      # A format spec avoids slicing hex() output, which carries a trailing
      # 'L' for Python 2 longs and a sign prefix for negative numbers.
      return 'U+{0:06X}'.format(codePoint)
  def writeFile(filename, contents):
      """Write ``contents`` to ``filename`` and echo the file name to stdout.

      Leading and trailing whitespace is stripped from ``contents`` and a
      single newline is appended.  An existing file is overwritten.

      Args:
          filename: Path of the file to create.
          contents: Text to write.
      """
      # Call form of print: with a single argument it prints the same text
      # on Python 2 and Python 3, whereas the statement form only parses on
      # Python 2.
      print(filename)
      with open(filename, 'w') as f:
          f.write(contents.strip() + '\n')
  def _utf8AsLatin1(codePoint):
      """Return the UTF-8 encoding of a scalar value, one character per byte."""
      # Decoding a \UXXXXXXXX escape yields the scalar value itself on every
      # Python build, so the UTF-8 encoder never sees a hand-made surrogate
      # pair (which would be encoded as two 3-byte sequences or rejected).
      scalar = ('\\U%08X' % codePoint).encode('ascii').decode('unicode_escape')
      # Decoding the bytes as Latin-1 maps each byte to the character with
      # the same ordinal: http://stackoverflow.com/a/17199950/96656
      return scalar.encode('utf8').decode('latin1')


  data = []
  for codePoint in range(0x000000, 0x10FFFF + 1):
      # Skip non-scalar values (the surrogate range).
      if 0xD800 <= codePoint <= 0xDFFF:
          continue
      symbol = unisymbol(codePoint)
      data.append({
          'codePoint': codePoint,
          'decoded': symbol,
          'encoded': _utf8AsLatin1(codePoint)
      })

  jsonData = json.dumps(data, sort_keys=False, indent=2, separators=(',', ': '))
  # Use tabs instead of double spaces for indentation. Only the leading run
  # of spaces on each line is converted, so the space after ':' and the
  # space character stored as a value are left untouched.
  jsonData = re.sub(
      r'^(?:  )+',
      lambda match: '\t' * (len(match.group(0)) // 2),
      jsonData,
      flags=re.MULTILINE
  )
  # Upper-case the hexadecimal digits in \uXXXX escape sequences.
  jsonData = re.sub(
      r'\\u([a-fA-F0-9]{4})',
      lambda match: r'\u{}'.format(match.group(1).upper()),
      jsonData
  )
  writeFile('data.json', jsonData)