utf8.js 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. /*! https://mths.be/utf8js v2.0.0 by @mathias */
  // Module wrapper: an immediately-invoked function that receives the outer
  // `this` as `root` (see the closing `}(this));`), so every helper declared
  // inside stays private to this file.
  2. ;(function(root) {
  3. // Detect free variable `exports`; the result is `false` when
  // `typeof exports` is not 'object', otherwise the `exports` value itself.
  4. var freeExports = typeof exports == 'object' && exports;
  5. // Detect free variable `module`; it is only kept when `module.exports`
  // loosely equals the `exports` value detected above, otherwise the
  // expression yields a falsy value.
  6. var freeModule = typeof module == 'object' && module &&
  7. module.exports == freeExports && module;
  8. // Detect free variable `global`, from Node.js or Browserified code,
  9. // and use it as `root`
  10. var freeGlobal = typeof global == 'object' && global;
  // Only replace `root` when the detected object refers back to itself
  // through its own `global` or `window` property.
  11. if (freeGlobal.global === freeGlobal || freeGlobal.window === freeGlobal) {
  12. root = freeGlobal;
  13. }
  14. /*--------------------------------------------------------------------------*/
  // Local alias for `String.fromCharCode`, used by the encoding helpers
  // below whenever a numeric value has to be turned into a one-unit string.
  15. var stringFromCharCode = String.fromCharCode;
  // Taken from https://mths.be/punycode
  // Turns a string into an array of numeric code points. A high surrogate
  // that is immediately followed by a low surrogate is merged into a single
  // value above 0xFFFF; every other code unit (including an unpaired
  // surrogate) is copied to the output unchanged.
  function ucs2decode(string) {
    var codePoints = [];
    var total = string.length;
    for (var position = 0; position < total; position++) {
      var unit = string.charCodeAt(position);
      var isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
      if (isHighSurrogate && position + 1 < total) {
        var following = string.charCodeAt(position + 1);
        if ((following & 0xFC00) == 0xDC00) {
          // Well-formed pair: combine both halves and skip the low surrogate.
          codePoints.push(
            ((unit & 0x3FF) << 10) + (following & 0x3FF) + 0x10000
          );
          position++;
          continue;
        }
        // Not a low surrogate: leave `following` for the next iteration,
        // since it may itself start a valid pair.
      }
      codePoints.push(unit);
    }
    return codePoints;
  }
  // Taken from https://mths.be/punycode
  // Builds a string from an array of numeric code points. Values above
  // 0xFFFF are written as a high/low surrogate pair; all other values are
  // written as a single code unit.
  function ucs2encode(array) {
    var text = '';
    for (var i = 0, count = array.length; i < count; i++) {
      var codePoint = array[i];
      if (codePoint > 0xFFFF) {
        var offset = codePoint - 0x10000;
        // High surrogate carries the top 10 bits of the offset...
        text += stringFromCharCode(offset >>> 10 & 0x3FF | 0xD800);
        // ...and the low surrogate (appended below) carries the bottom 10.
        codePoint = 0xDC00 | offset & 0x3FF;
      }
      text += stringFromCharCode(codePoint);
    }
    return text;
  }
  // Guard used by both the encoder and the decoder: throws an Error when
  // `codePoint` lies in the surrogate range U+D800..U+DFFF, and returns
  // nothing otherwise.
  function checkScalarValue(codePoint) {
    var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (!isSurrogate) {
      return;
    }
    var hex = codePoint.toString(16).toUpperCase();
    throw Error('Lone surrogate U+' + hex + ' is not a scalar value');
  }
  67. /*--------------------------------------------------------------------------*/
  // Produces one continuation byte (bit pattern 10xxxxxx) as a one-character
  // string: the six bits of `codePoint` found after shifting right by
  // `shift` are placed under the 0x80 marker.
  function createByte(codePoint, shift) {
    var sixBits = (codePoint >> shift) & 0x3F;
    return stringFromCharCode(sixBits | 0x80);
  }
  // Encodes a single numeric code point as a string in which every character
  // stands for one UTF-8 byte. The sequence length (1 to 4 bytes) is chosen
  // from the bits set in `codePoint`. Code points in the 3-byte range are
  // passed through `checkScalarValue`, which throws for surrogates.
  function encodeCodePoint(codePoint) {
    // 1-byte sequence: nothing set above the low 7 bits
    if ((codePoint & 0xFFFFFF80) == 0) {
      return stringFromCharCode(codePoint);
    }
    // `createByte(codePoint, 0)` yields the final continuation byte that
    // every multi-byte sequence ends with.
    // 2-byte sequence
    if ((codePoint & 0xFFFFF800) == 0) {
      return stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0) +
        createByte(codePoint, 0);
    }
    // 3-byte sequence
    if ((codePoint & 0xFFFF0000) == 0) {
      checkScalarValue(codePoint);
      return stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0) +
        createByte(codePoint, 6) +
        createByte(codePoint, 0);
    }
    // 4-byte sequence
    if ((codePoint & 0xFFE00000) == 0) {
      return stringFromCharCode(((codePoint >> 18) & 0x07) | 0xF0) +
        createByte(codePoint, 12) +
        createByte(codePoint, 6) +
        createByte(codePoint, 0);
    }
    // No range matched: only the trailing byte is produced.
    return createByte(codePoint, 0);
  }
  // Public `encode`: splits `string` into code points with `ucs2decode`,
  // encodes each one with `encodeCodePoint`, and returns the concatenated
  // result. Any error thrown by `encodeCodePoint` propagates to the caller.
  function utf8encode(string) {
    var scalars = ucs2decode(string);
    var encoded = '';
    for (var i = 0, total = scalars.length; i < total; i++) {
      encoded += encodeCodePoint(scalars[i]);
    }
    return encoded;
  }
  104. /*--------------------------------------------------------------------------*/
  // Reads the byte at `byteIndex` from the shared `byteArray`, advances
  // `byteIndex` by one, and returns the byte's low six bits.
  // Throws an Error when no byte is left to read, or when the byte does not
  // have the 10xxxxxx continuation pattern (the index has already advanced
  // by then).
  function readContinuationByte() {
    if (byteIndex >= byteCount) {
      throw Error('Invalid byte index');
    }
    // Mask to 8 bits so larger values are treated as bytes.
    var octet = byteArray[byteIndex] & 0xFF;
    byteIndex++;
    if ((octet & 0xC0) != 0x80) {
      // If we end up here, it's not a continuation byte
      throw Error('Invalid continuation byte');
    }
    return octet & 0x3F;
  }
  // Decodes the next UTF-8 sequence from the shared `byteArray`, starting at
  // `byteIndex` and advancing it past every byte consumed.
  // Returns the decoded code point, or `false` once `byteIndex` has reached
  // `byteCount`.
  // Throws an Error when the index is past the end, when a continuation byte
  // is missing or malformed, when a 2- or 3-byte sequence decodes to a value
  // below the minimum for its length, when a 3-byte sequence decodes to a
  // surrogate, or when the lead byte or 4-byte value is otherwise invalid.
  function decodeSymbol() {
    if (byteIndex > byteCount) {
      throw Error('Invalid byte index');
    }
    if (byteIndex == byteCount) {
      // Nothing left to decode
      return false;
    }

    // Lead byte, masked to 8 bits
    var lead = byteArray[byteIndex] & 0xFF;
    byteIndex++;

    var second;
    var third;
    var fourth;
    var scalar;

    // 0xxxxxxx: single byte, no continuation bytes
    if ((lead & 0x80) == 0) {
      return lead;
    }

    // 110xxxxx: two bytes
    if ((lead & 0xE0) == 0xC0) {
      second = readContinuationByte();
      scalar = ((lead & 0x1F) << 6) | second;
      if (scalar < 0x80) {
        throw Error('Invalid continuation byte');
      }
      return scalar;
    }

    // 1110xxxx: three bytes (may decode to an unpaired surrogate)
    if ((lead & 0xF0) == 0xE0) {
      second = readContinuationByte();
      third = readContinuationByte();
      scalar = ((lead & 0x0F) << 12) | (second << 6) | third;
      if (scalar < 0x0800) {
        throw Error('Invalid continuation byte');
      }
      checkScalarValue(scalar);
      return scalar;
    }

    // 11110xxx: four bytes
    if ((lead & 0xF8) == 0xF0) {
      second = readContinuationByte();
      third = readContinuationByte();
      fourth = readContinuationByte();
      scalar = ((lead & 0x0F) << 18) | (second << 12) | (third << 6) | fourth;
      if (scalar >= 0x010000 && scalar <= 0x10FFFF) {
        return scalar;
      }
      // Out-of-range value: fall through to the generic error below.
    }

    throw Error('Invalid UTF-8 detected');
  }
  // Decoder state shared by `utf8decode`, `decodeSymbol` and
  // `readContinuationByte`; reset at the start of every `utf8decode` call.
  var byteArray; // numeric values obtained from the input string
  var byteCount; // length of `byteArray`
  var byteIndex; // position of the next value to read

  // Public `decode`: loads `byteString` into the shared decoder state, calls
  // `decodeSymbol` until it reports the end of input by returning `false`,
  // and returns the collected code points as a string via `ucs2encode`.
  // Any error thrown by `decodeSymbol` propagates to the caller.
  function utf8decode(byteString) {
    byteArray = ucs2decode(byteString);
    byteCount = byteArray.length;
    byteIndex = 0;
    var decoded = [];
    // Compare strictly against `false`: a decoded value of 0 is a valid symbol.
    for (var symbol = decodeSymbol(); symbol !== false; symbol = decodeSymbol()) {
      decoded.push(symbol);
    }
    return ucs2encode(decoded);
  }
  185. /*--------------------------------------------------------------------------*/
  // Public API object: a version string plus the `encode` and `decode`
  // entry points defined above. It is handed to exactly one of the module
  // systems detected below.
  186. var utf8 = {
  187. 'version': '2.0.0',
  188. 'encode': utf8encode,
  189. 'decode': utf8decode
  190. };
  191. // Some AMD build optimizers, like r.js, check for specific condition patterns
  192. // like the following, so the shape of this condition should be left as-is:
  193. if (
  194. typeof define == 'function' &&
  195. typeof define.amd == 'object' &&
  196. define.amd
  197. ) {
  // AMD loader present: register an anonymous module that returns the API.
  198. define(function() {
  199. return utf8;
  200. });
  // Otherwise use `exports` if it was detected and has no truthy `nodeType`.
  // NOTE(review): the `nodeType` test presumably rejects a DOM element that
  // happens to be named `exports` - confirm.
  201. } else if (freeExports && !freeExports.nodeType) {
  202. if (freeModule) { // in Node.js or RingoJS v0.8.0+
  203. freeModule.exports = utf8;
  204. } else { // in Narwhal or RingoJS v0.7.0-
  // No usable `module`: copy each own property of the API onto `exports`.
  205. var object = {};
  206. var hasOwnProperty = object.hasOwnProperty;
  207. for (var key in utf8) {
  208. hasOwnProperty.call(utf8, key) && (freeExports[key] = utf8[key]);
  209. }
  210. }
  211. } else { // in Rhino or a web browser
  // No module system detected: attach the API to `root` as `utf8`.
  212. root.utf8 = utf8;
  213. }
  // Invoke the wrapper, passing the outer `this` as `root`.
  214. }(this));