// Web-listing residue captured together with this file (not part of the source):
// You cannot select more than 25 topics. Topics must start with a letter or
// number, can include dashes ('-') and can be up to 35 characters long.
//
// 2391 lines
// 63 KiB

  /*
  Modifications for better node.js integration:
  Copyright 2014 Brian White. All rights reserved.
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to
  deal in the Software without restriction, including without limitation the
  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  sell copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:
  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.
  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  IN THE SOFTWARE.
  */
  /*
  Original source code:
  Copyright 2014 Joshua Bell
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
  */
  //
  // Utilities
  //

  /**
   * Tests whether a number lies inside a closed interval.
   *
   * @param {number} a The number to test.
   * @param {number} min The minimum value in the range, inclusive.
   * @param {number} max The maximum value in the range, inclusive.
   * @return {boolean} True if a >= min and a <= max.
   */
  function inRange(a, min, max) {
    return min <= a && a <= max;
  }
  /**
   * Integer division that rounds towards negative infinity.
   *
   * @param {number} n The numerator.
   * @param {number} d The denominator.
   * @return {number} The result of the integer division of n by d.
   */
  function div(n, d) {
    return Math.floor(n / d);
  }
  //
  // Implementation of Encoding specification
  // http://dvcs.w3.org/hg/encoding/raw-file/tip/Overview.html
  //

  //
  // 3. Terminology
  //

  //
  // 4. Encodings
  //

  /** @const {number} Value a byte stream reports once its input is exhausted. */
  var EOF_byte = -1;
  /** @const {number} Value a code point stream reports once its input is exhausted. */
  var EOF_code_point = -1;
  /**
   * Read cursor over an array-like sequence of bytes.
   *
   * @constructor
   * @param {Buffer} bytes Array of bytes that provide the stream.
   */
  function ByteInputStream(bytes) {
    /** @type {number} */
    var pos = 0;

    /**
     * Peeks at the byte under the cursor without consuming it.
     *
     * @this {ByteInputStream}
     * @return {number} The byte at the current position, or EOF_byte when
     *     the cursor is at or past the end of the input.
     */
    this.get = function() {
      return (pos >= bytes.length) ? EOF_byte : Number(bytes[pos]);
    };

    /**
     * Moves the cursor. The target is validated before it is committed, so
     * a rejected seek leaves the stream position unchanged.
     *
     * @param {number} n Number (positive or negative) by which to
     *     offset the byte pointer.
     * @throws {Error} If the move would leave the range [0, bytes.length].
     */
    this.offset = function(n) {
      var target = pos + n;
      if (target < 0) {
        throw new Error('Seeking past start of the buffer');
      }
      if (target > bytes.length) {
        throw new Error('Seeking past EOF');
      }
      pos = target;
    };

    /**
     * Compares upcoming bytes with an expected sequence; does not consume.
     *
     * @param {Array.<number>} test Array of bytes to compare against.
     * @return {boolean} True if the bytes starting at the current position
     *     match the test bytes.
     */
    this.match = function(test) {
      // Only the bytes remaining after the cursor can take part in a match.
      if (test.length > bytes.length - pos) {
        return false;
      }
      var i;
      for (i = 0; i < test.length; i += 1) {
        if (Number(bytes[pos + i]) !== test[i]) {
          return false;
        }
      }
      return true;
    };
  }
  /**
   * Write cursor that stores emitted bytes into a caller-supplied array.
   *
   * @constructor
   * @param {Array.<number>} bytes The array to write bytes into.
   */
  function ByteOutputStream(bytes) {
    /** @type {number} */
    var pos = 0;

    /**
     * Writes each argument, coerced with Number(), at consecutive positions.
     *
     * @param {...number} var_args The byte or bytes to emit into the stream.
     * @return {number} The last byte emitted, or EOF_byte when called
     *     without arguments.
     */
    this.emit = function(var_args) {
      /** @type {number} */
      var last = EOF_byte;
      var i;
      for (i = 0; i < arguments.length; ++i) {
        last = Number(arguments[i]);
        bytes[pos++] = last;
      }
      return last;
    };
  }
  /**
   * Read cursor over the code points of a string. The string is converted
   * once, up front; unpaired surrogates become U+FFFD.
   *
   * @constructor
   * @param {string} string The source of code units for the stream.
   */
  function CodePointInputStream(string) {
    /**
     * Converts UTF-16 code units into code points.
     *
     * @param {string} string Input string of UTF-16 code units.
     * @return {Array.<number>} Code points.
     */
    function stringToCodePoints(string) {
      /** @type {Array.<number>} */
      var cps = [];
      // Based on http://www.w3.org/TR/WebIDL/#idl-DOMString
      var i = 0, n = string.length;
      while (i < n) {
        var c = string.charCodeAt(i);
        if (!inRange(c, 0xD800, 0xDFFF)) {
          // Not a surrogate: the code unit is the code point.
          cps.push(c);
        } else if (inRange(c, 0xDC00, 0xDFFF)) {
          // Low surrogate with no preceding high surrogate.
          cps.push(0xFFFD);
        } else if (i === n - 1) {
          // High surrogate at the very end of the string.
          cps.push(0xFFFD);
        } else {
          var d = string.charCodeAt(i + 1);
          if (inRange(d, 0xDC00, 0xDFFF)) {
            var a = c & 0x3FF;
            var b = d & 0x3FF;
            // Consume the low surrogate along with the high one.
            i += 1;
            cps.push(0x10000 + (a << 10) + b);
          } else {
            // High surrogate not followed by a low surrogate; the next
            // code unit is examined on its own in the next iteration.
            cps.push(0xFFFD);
          }
        }
        i += 1;
      }
      return cps;
    }

    /** @type {number} */
    var pos = 0;
    /** @type {Array.<number>} */
    var cps = stringToCodePoints(string);

    /**
     * Moves the cursor. The target is validated before it is committed, so
     * a rejected seek leaves the stream position unchanged.
     *
     * @param {number} n The number of code points (positive or negative)
     *     to advance the code point pointer by.
     * @throws {Error} If the move would leave the range [0, cps.length].
     */
    this.offset = function(n) {
      var target = pos + n;
      if (target < 0) {
        throw new Error('Seeking past start of the buffer');
      }
      if (target > cps.length) {
        throw new Error('Seeking past EOF');
      }
      pos = target;
    };

    /**
     * Peeks at the code point under the cursor without consuming it.
     *
     * @return {number} The current code point, or EOF_code_point at the end.
     */
    this.get = function() {
      if (pos >= cps.length) {
        return EOF_code_point;
      }
      return cps[pos];
    };
  }
  /**
   * Accumulates emitted code points into a JavaScript string.
   *
   * @constructor
   */
  function CodePointOutputStream() {
    /** @type {string} */
    var string = '';

    /** @return {string} The accumulated string. */
    this.string = function() {
      return string;
    };

    /**
     * Appends one code point; values above 0xFFFF are written as a
     * surrogate pair.
     *
     * @param {number} c The code point to encode into the stream.
     */
    this.emit = function(c) {
      if (c <= 0xFFFF) {
        string += String.fromCharCode(c);
      } else {
        c -= 0x10000;
        string += String.fromCharCode(0xD800 + ((c >> 10) & 0x3ff));
        string += String.fromCharCode(0xDC00 + (c & 0x3ff));
      }
    };
  }
  /**
   * Error raised when input cannot be decoded or encoded.
   *
   * @constructor
   * @param {string} message Description of the error.
   */
  function EncodingError(message) {
    this.name = 'EncodingError';
    this.message = message;
    this.code = 0;
    // Record a stack trace where the engine supports it.
    if (typeof Error.captureStackTrace === 'function') {
      Error.captureStackTrace(this, EncodingError);
    }
  }
  // Inherit from Error through a separate prototype object. Sharing
  // Error.prototype itself would make every Error an instanceof EncodingError.
  EncodingError.prototype = Object.create(Error.prototype);
  EncodingError.prototype.constructor = EncodingError;
  /**
   * Reports a decoding failure: throws in fatal mode, otherwise supplies the
   * code point to substitute.
   *
   * @param {boolean} fatal If true, decoding errors raise an exception.
   * @param {number=} opt_code_point Override the standard fallback code point.
   *     A falsy override (including 0) selects the default U+FFFD.
   * @return {number} The code point to insert on a decoding error.
   * @throws {EncodingError} If fatal is truthy.
   */
  function decoderError(fatal, opt_code_point) {
    if (fatal) {
      throw new EncodingError('Decoder error');
    }
    return opt_code_point || 0xFFFD;
  }
  /**
   * Reports a code point that the target encoding cannot represent.
   *
   * @param {number} code_point The code point that could not be encoded.
   * @return {number} Always throws, no value is actually returned.
   * @throws {EncodingError} Always.
   */
  function encoderError(code_point) {
    throw new EncodingError('The code point ' + code_point +
                            ' could not be encoded.');
  }
  /**
   * Looks up an encoding by label. The label is stringified, trimmed and
   * lower-cased before the lookup.
   *
   * @param {string} label The encoding label.
   * @return {?{name:string,labels:Array.<string>}} The matching encoding
   *     record, or null when the label is not registered.
   */
  function getEncoding(label) {
    label = String(label).trim().toLowerCase();
    // Own-property test so inherited keys such as "constructor" never match.
    if (Object.prototype.hasOwnProperty.call(label_to_encoding, label)) {
      return label_to_encoding[label];
    }
    return null;
  }
  /**
   * Table of supported encodings grouped under headings. Each encoding
   * record carries its canonical name and every label that selects it.
   *
   * @type {Array.<{encodings: Array.<{name:string,labels:Array.<string>}>,
   *     heading: string}>}
   */
  var encodings = [
    {
      "encodings": [
        {
          "labels": ["unicode-1-1-utf-8", "utf-8", "utf8"],
          "name": "utf-8"
        }
      ],
      "heading": "The Encoding"
    },
    {
      "encodings": [
        {
          "labels": ["864", "cp864", "csibm864", "ibm864"],
          "name": "ibm864"
        },
        {
          "labels": ["866", "cp866", "csibm866", "ibm866"],
          "name": "ibm866"
        },
        {
          "labels": [
            "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2",
            "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2"
          ],
          "name": "iso-8859-2"
        },
        {
          "labels": [
            "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3",
            "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3"
          ],
          "name": "iso-8859-3"
        },
        {
          "labels": [
            "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4",
            "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4"
          ],
          "name": "iso-8859-4"
        },
        {
          "labels": [
            "csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144",
            "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988"
          ],
          "name": "iso-8859-5"
        },
        {
          "labels": [
            "arabic", "asmo-708", "csiso88596e", "csiso88596i",
            "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e",
            "iso-8859-6-i", "iso-ir-127", "iso8859-6", "iso88596",
            "iso_8859-6", "iso_8859-6:1987"
          ],
          "name": "iso-8859-6"
        },
        {
          "labels": [
            "csisolatingreek", "ecma-118", "elot_928", "greek", "greek8",
            "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597",
            "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek"
          ],
          "name": "iso-8859-7"
        },
        {
          "labels": [
            "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8",
            "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598",
            "iso_8859-8", "iso_8859-8:1988", "visual"
          ],
          "name": "iso-8859-8"
        },
        {
          "labels": ["csiso88598i", "iso-8859-8-i", "logical"],
          "name": "iso-8859-8-i"
        },
        {
          "labels": [
            "csisolatin6", "iso-8859-10", "iso-ir-157", "iso8859-10",
            "iso885910", "l6", "latin6"
          ],
          "name": "iso-8859-10"
        },
        {
          "labels": ["iso-8859-13", "iso8859-13", "iso885913"],
          "name": "iso-8859-13"
        },
        {
          "labels": ["iso-8859-14", "iso8859-14", "iso885914"],
          "name": "iso-8859-14"
        },
        {
          "labels": [
            "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915",
            "iso_8859-15", "l9"
          ],
          "name": "iso-8859-15"
        },
        {
          "labels": ["iso-8859-16"],
          "name": "iso-8859-16"
        },
        {
          "labels": ["cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"],
          "name": "koi8-r"
        },
        {
          "labels": ["koi8-u"],
          "name": "koi8-u"
        },
        {
          "labels": ["csmacintosh", "mac", "macintosh", "x-mac-roman"],
          "name": "macintosh"
        },
        {
          "labels": [
            "dos-874", "iso-8859-11", "iso8859-11", "iso885911", "tis-620",
            "windows-874"
          ],
          "name": "windows-874"
        },
        {
          "labels": ["cp1250", "windows-1250", "x-cp1250"],
          "name": "windows-1250"
        },
        {
          "labels": ["cp1251", "windows-1251", "x-cp1251"],
          "name": "windows-1251"
        },
        {
          "labels": [
            "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1",
            "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591",
            "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii",
            "windows-1252", "x-cp1252"
          ],
          "name": "windows-1252"
        },
        {
          "labels": ["cp1253", "windows-1253", "x-cp1253"],
          "name": "windows-1253"
        },
        {
          "labels": [
            "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
            "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5",
            "latin5", "windows-1254", "x-cp1254"
          ],
          "name": "windows-1254"
        },
        {
          "labels": ["cp1255", "windows-1255", "x-cp1255"],
          "name": "windows-1255"
        },
        {
          "labels": ["cp1256", "windows-1256", "x-cp1256"],
          "name": "windows-1256"
        },
        {
          "labels": ["cp1257", "windows-1257", "x-cp1257"],
          "name": "windows-1257"
        },
        {
          "labels": ["cp1258", "windows-1258", "x-cp1258"],
          "name": "windows-1258"
        },
        {
          "labels": ["x-mac-cyrillic", "x-mac-ukrainian"],
          "name": "x-mac-cyrillic"
        }
      ],
      "heading": "Legacy single-byte encodings"
    },
    {
      "encodings": [
        {
          "labels": [
            "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
            "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"
          ],
          "name": "gbk"
        },
        {
          "labels": ["gb18030"],
          "name": "gb18030"
        },
        {
          "labels": ["hz-gb-2312"],
          "name": "hz-gb-2312"
        }
      ],
      "heading": "Legacy multi-byte Chinese (simplified) encodings"
    },
    {
      "encodings": [
        {
          "labels": ["big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"],
          "name": "big5"
        }
      ],
      "heading": "Legacy multi-byte Chinese (traditional) encodings"
    },
    {
      "encodings": [
        {
          "labels": ["cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"],
          "name": "euc-jp"
        },
        {
          "labels": ["csiso2022jp", "iso-2022-jp"],
          "name": "iso-2022-jp"
        },
        {
          "labels": [
            "csshiftjis", "ms_kanji", "shift-jis", "shift_jis", "sjis",
            "windows-31j", "x-sjis"
          ],
          "name": "shift_jis"
        }
      ],
      "heading": "Legacy multi-byte Japanese encodings"
    },
    {
      "encodings": [
        {
          "labels": [
            "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean",
            "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601",
            "windows-949"
          ],
          "name": "euc-kr"
        }
      ],
      "heading": "Legacy multi-byte Korean encodings"
    },
    {
      "encodings": [
        {
          "labels": [
            "csiso2022kr", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-kr"
          ],
          "name": "replacement"
        },
        {
          "labels": ["utf-16be"],
          "name": "utf-16be"
        },
        {
          "labels": ["utf-16", "utf-16le"],
          "name": "utf-16le"
        },
        {
          "labels": ["x-user-defined"],
          "name": "x-user-defined"
        }
      ],
      "heading": "Legacy miscellaneous encodings"
    }
  ];
  // Lookup tables built once from the encodings table:
  // canonical name -> encoding record, and label -> encoding record.
  var name_to_encoding = {};
  var label_to_encoding = {};
  encodings.forEach(function(category) {
    category.encodings.forEach(function(encoding) {
      name_to_encoding[encoding.name] = encoding;
      encoding.labels.forEach(function(label) {
        label_to_encoding[label] = encoding;
      });
    });
  });
  //
  // 5. Indexes
  //

  /**
   * Looks up the code point stored at a pointer in an index.
   *
   * @param {number} pointer The |pointer| to search for.
   * @param {Array.<?number>|undefined} index The |index| to search within.
   * @return {?number} The code point corresponding to |pointer| in |index|,
   *     or null if |code point| is not in |index|. A missing index and a
   *     falsy entry both yield null.
   */
  function indexCodePointFor(pointer, index) {
    if (!index) return null;
    return index[pointer] || null;
  }
  /**
   * Finds the first pointer in an index that maps to a code point.
   *
   * @param {number} code_point The |code point| to search for.
   * @param {Array.<?number>} index The |index| to search within.
   * @return {?number} The first pointer corresponding to |code point| in
   *     |index|, or null if |code point| is not in |index|.
   */
  function indexPointerFor(code_point, index) {
    var pointer = index.indexOf(code_point);
    return pointer === -1 ? null : pointer;
  }
  /** @type {Object.<string, (Array.<number>|Array.<Array.<number>>)>} */
  var indexes = require('./encoding-indexes');
  /**
   * Maps a gb18030 pointer to a code point using the range table stored in
   * indexes['gb18030'], whose entries are [pointer, code point] pairs.
   *
   * @param {number} pointer The |pointer| to search for in the gb18030 index.
   * @return {?number} The code point corresponding to |pointer| in |index|,
   *     or null if |code point| is not in the gb18030 index.
   */
  function indexGB18030CodePointFor(pointer) {
    if ((pointer > 39419 && pointer < 189000) || (pointer > 1237575)) {
      return null;
    }
    var /** @type {number} */ offset = 0,
        /** @type {number} */ code_point_offset = 0,
        /** @type {Array.<Array.<number>>} */ idx = indexes['gb18030'];
    var i;
    // Keep the last range whose starting pointer does not exceed |pointer|;
    // the scan stops at the first range that starts beyond it.
    for (i = 0; i < idx.length; ++i) {
      var entry = idx[i];
      if (entry[0] <= pointer) {
        offset = entry[0];
        code_point_offset = entry[1];
      } else {
        break;
      }
    }
    return code_point_offset + pointer - offset;
  }
  /**
   * Maps a code point to a gb18030 pointer using the range table stored in
   * indexes['gb18030'], whose entries are [pointer, code point] pairs.
   *
   * @param {number} code_point The |code point| to locate in the gb18030 index.
   * @return {number} The first pointer corresponding to |code point| in the
   *     gb18030 index.
   */
  function indexGB18030PointerFor(code_point) {
    var /** @type {number} */ offset = 0,
        /** @type {number} */ pointer_offset = 0,
        /** @type {Array.<Array.<number>>} */ idx = indexes['gb18030'];
    var i;
    // Keep the last range whose starting code point does not exceed
    // |code_point|; the scan stops at the first range that starts beyond it.
    for (i = 0; i < idx.length; ++i) {
      var entry = idx[i];
      if (entry[1] <= code_point) {
        offset = entry[1];
        pointer_offset = entry[0];
      } else {
        break;
      }
    }
    return pointer_offset + code_point - offset;
  }
  //
  // 7. API
  //

  /** @const {string} Label used when a coder is constructed without one. */
  var DEFAULT_ENCODING = 'utf-8';
  // 7.1 Interface TextDecoder

  /**
   * Decodes byte sequences in a given encoding into JavaScript strings.
   * May be called with or without `new`.
   *
   * @constructor
   * @param {string=} opt_encoding The label of the encoding;
   *     defaults to 'utf-8'.
   * @param {{fatal: boolean}=} options
   * @throws {TypeError} If the label is unknown or selects 'replacement'.
   */
  function TextDecoder(opt_encoding, options) {
    if (!(this instanceof TextDecoder)) {
      return new TextDecoder(opt_encoding, options);
    }
    opt_encoding = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
    options = Object(options);
    /** @private */
    this._encoding = getEncoding(opt_encoding);
    if (this._encoding === null || this._encoding.name === 'replacement') {
      throw new TypeError('Unknown encoding: ' + opt_encoding);
    }
    /** @private @type {boolean} */
    this._streaming = false;
    /** @private @type {boolean} */
    this._BOMseen = false;
    /** @private */
    this._decoder = null;
    /** @private @type {{fatal: boolean}=} */
    this._options = { fatal: Boolean(options.fatal) };
    if (Object.defineProperty) {
      Object.defineProperty(
          this, 'encoding',
          { get: function() { return this._encoding.name; } });
    } else {
      this.encoding = this._encoding.name;
    }
    return this;
  }

  // TODO: Issue if input byte stream is offset by decoder
  // TODO: BOM detection will not work if stream header spans multiple calls
  // (last N bytes of previous stream may need to be retained?)
  TextDecoder.prototype = {
    /**
     * Decodes a chunk of bytes. With {stream: true} the decoder instance is
     * kept for the next call; otherwise it is flushed and discarded.
     *
     * @param {Buffer=} bytes The buffer of bytes to decode. Omitting it (or
     *     passing null) decodes an empty chunk.
     * @param {{stream: boolean}=} options
     * @return {string} The decoded text. For the UTF encodings a leading
     *     U+FEFF in the first non-empty result is removed.
     */
    decode: function decode(bytes, options) {
      options = Object(options);
      if (!this._streaming) {
        this._decoder = this._encoding.getDecoder(this._options);
        this._BOMseen = false;
      }
      this._streaming = Boolean(options.stream);

      // `bytes` is optional: read a missing chunk as an empty stream instead
      // of dereferencing undefined.
      var input_stream = new ByteInputStream(bytes == null ? [] : bytes);
      var output_stream = new CodePointOutputStream();
      /** @type {number} */
      var code_point;

      while (input_stream.get() !== EOF_byte) {
        code_point = this._decoder.decode(input_stream);
        if (code_point !== null && code_point !== EOF_code_point) {
          output_stream.emit(code_point);
        }
      }
      if (!this._streaming) {
        // Flush: let the decoder see EOF so that pending state is reported.
        do {
          code_point = this._decoder.decode(input_stream);
          if (code_point !== null && code_point !== EOF_code_point) {
            output_stream.emit(code_point);
          }
        } while (code_point !== EOF_code_point &&
                 input_stream.get() !== EOF_byte);
        this._decoder = null;
      }

      var result = output_stream.string();
      if (!this._BOMseen && result.length) {
        this._BOMseen = true;
        if (UTFs.indexOf(this.encoding) !== -1 &&
            result.charCodeAt(0) === 0xFEFF) {
          result = result.substring(1);
        }
      }
      return result;
    }
  };

  // Encodings for which a leading byte order mark is stripped from output.
  var UTFs = ['utf-8', 'utf-16le', 'utf-16be'];
  // 7.2 Interface TextEncoder

  /**
   * Encodes JavaScript strings into bytes. Only utf-8, utf-16le and
   * utf-16be are accepted. May be called with or without `new`.
   *
   * @constructor
   * @param {string=} opt_encoding The label of the encoding;
   *     defaults to 'utf-8'.
   * @param {{fatal: boolean}=} options
   * @throws {TypeError} If the label is unknown or selects another encoding.
   */
  function TextEncoder(opt_encoding, options) {
    if (!(this instanceof TextEncoder)) {
      return new TextEncoder(opt_encoding, options);
    }
    opt_encoding = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
    options = Object(options);
    /** @private */
    this._encoding = getEncoding(opt_encoding);
    if (this._encoding === null || (this._encoding.name !== 'utf-8' &&
                                    this._encoding.name !== 'utf-16le' &&
                                    this._encoding.name !== 'utf-16be')) {
      throw new TypeError('Unknown encoding: ' + opt_encoding);
    }
    /** @private @type {boolean} */
    this._streaming = false;
    /** @private */
    this._encoder = null;
    /** @private @type {{fatal: boolean}=} */
    this._options = { fatal: Boolean(options.fatal) };
    if (Object.defineProperty) {
      Object.defineProperty(
          this, 'encoding',
          { get: function() { return this._encoding.name; } });
    } else {
      this.encoding = this._encoding.name;
    }
    return this;
  }

  TextEncoder.prototype = {
    /**
     * Encodes a string. With {stream: true} the encoder instance is kept
     * for the next call; otherwise it is flushed and discarded.
     *
     * @param {string=} opt_string The string to encode. null/undefined
     *     encode as the empty string; any other value is stringified.
     * @param {{stream: boolean}=} options
     * @return {Buffer} The encoded bytes.
     */
    encode: function encode(opt_string, options) {
      // Only a missing value means "empty"; other falsy inputs such as 0
      // must be stringified, not discarded.
      opt_string = opt_string == null ? '' : String(opt_string);
      options = Object(options);
      // TODO: any options?
      if (!this._streaming) {
        this._encoder = this._encoding.getEncoder(this._options);
      }
      this._streaming = Boolean(options.stream);

      var bytes = [];
      var output_stream = new ByteOutputStream(bytes);
      var input_stream = new CodePointInputStream(opt_string);
      while (input_stream.get() !== EOF_code_point) {
        this._encoder.encode(output_stream, input_stream);
      }
      if (!this._streaming) {
        /** @type {number} */
        var last_byte;
        // Flush: call the encoder until it reports EOF.
        do {
          last_byte = this._encoder.encode(output_stream, input_stream);
        } while (last_byte !== EOF_byte);
        this._encoder = null;
      }
      // new Buffer(array) is deprecated in Node.js; prefer Buffer.from and
      // keep the constructor only for runtimes that lack it.
      return typeof Buffer.from === 'function' ?
          Buffer.from(bytes) : new Buffer(bytes);
    }
  };
  //
  // 8. The encoding
  //

  // 8.1 utf-8

  /**
   * Stateful UTF-8 decoder: consumes one byte per call and yields a code
   * point once a complete sequence has been read.
   *
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function UTF8Decoder(options) {
    var fatal = options.fatal;
    var /** @type {number} */ utf8_code_point = 0,
        /** @type {number} */ utf8_bytes_needed = 0,
        /** @type {number} */ utf8_bytes_seen = 0,
        /** @type {number} */ utf8_lower_boundary = 0;

    /** Forgets any partially decoded sequence. */
    function resetState() {
      utf8_code_point = 0;
      utf8_bytes_needed = 0;
      utf8_bytes_seen = 0;
      utf8_lower_boundary = 0;
    }

    /**
     * @param {ByteInputStream} byte_pointer The byte stream to decode.
     * @return {?number} The next code point decoded, or null if not enough
     *     data exists in the input stream to decode a complete code point.
     *     EOF_code_point is returned at end of input with no sequence
     *     pending.
     */
    this.decode = function(byte_pointer) {
      var bite = byte_pointer.get();
      if (bite === EOF_byte) {
        if (utf8_bytes_needed !== 0) {
          // Truncated sequence: report it once, then clear the state so
          // the next call at EOF returns EOF_code_point.
          resetState();
          return decoderError(fatal);
        }
        return EOF_code_point;
      }
      byte_pointer.offset(1);

      if (utf8_bytes_needed === 0) {
        if (inRange(bite, 0x00, 0x7F)) {
          return bite;
        }
        // Lead byte: record how many continuation bytes follow and the
        // smallest code point that legitimately needs that many bytes.
        if (inRange(bite, 0xC2, 0xDF)) {
          utf8_bytes_needed = 1;
          utf8_lower_boundary = 0x80;
          utf8_code_point = bite - 0xC0;
        } else if (inRange(bite, 0xE0, 0xEF)) {
          utf8_bytes_needed = 2;
          utf8_lower_boundary = 0x800;
          utf8_code_point = bite - 0xE0;
        } else if (inRange(bite, 0xF0, 0xF4)) {
          utf8_bytes_needed = 3;
          utf8_lower_boundary = 0x10000;
          utf8_code_point = bite - 0xF0;
        } else {
          return decoderError(fatal);
        }
        utf8_code_point = utf8_code_point * Math.pow(64, utf8_bytes_needed);
        return null;
      }

      if (!inRange(bite, 0x80, 0xBF)) {
        // Not a continuation byte: drop the partial sequence and put the
        // byte back so it is decoded again as the start of a new one.
        resetState();
        byte_pointer.offset(-1);
        return decoderError(fatal);
      }

      utf8_bytes_seen += 1;
      utf8_code_point = utf8_code_point + (bite - 0x80) *
          Math.pow(64, utf8_bytes_needed - utf8_bytes_seen);
      if (utf8_bytes_seen !== utf8_bytes_needed) {
        return null;
      }

      var code_point = utf8_code_point;
      var lower_boundary = utf8_lower_boundary;
      resetState();
      // Reject overlong forms, values above U+10FFFF and surrogates.
      if (inRange(code_point, lower_boundary, 0x10FFFF) &&
          !inRange(code_point, 0xD800, 0xDFFF)) {
        return code_point;
      }
      return decoderError(fatal);
    };
  }
  /**
   * UTF-8 encoder: consumes one code point per call and emits its bytes.
   *
   * @constructor
   * @param {{fatal: boolean}} options Accepted for symmetry with the other
   *     coders; this encoder does not read it.
   */
  function UTF8Encoder(options) {
    /**
     * @param {ByteOutputStream} output_byte_stream Output byte stream.
     * @param {CodePointInputStream} code_point_pointer Input stream.
     * @return {number} The last byte emitted, or EOF_byte at end of input.
     */
    this.encode = function(output_byte_stream, code_point_pointer) {
      /** @type {number} */
      var code_point = code_point_pointer.get();
      if (code_point === EOF_code_point) {
        return EOF_byte;
      }
      code_point_pointer.offset(1);
      if (inRange(code_point, 0xD800, 0xDFFF)) {
        return encoderError(code_point);
      }
      if (inRange(code_point, 0x0000, 0x007f)) {
        return output_byte_stream.emit(code_point);
      }
      // |count| is the number of continuation bytes; |offset| is the
      // marker added to the lead byte.
      var count, offset;
      if (inRange(code_point, 0x0080, 0x07FF)) {
        count = 1;
        offset = 0xC0;
      } else if (inRange(code_point, 0x0800, 0xFFFF)) {
        count = 2;
        offset = 0xE0;
      } else if (inRange(code_point, 0x10000, 0x10FFFF)) {
        count = 3;
        offset = 0xF0;
      }
      var result = output_byte_stream.emit(
          div(code_point, Math.pow(64, count)) + offset);
      // Continuation bytes carry six bits each, most significant first.
      while (count > 0) {
        var temp = div(code_point, Math.pow(64, count - 1));
        result = output_byte_stream.emit(0x80 + (temp % 64));
        count -= 1;
      }
      return result;
    };
  }
  // Attach the utf-8 coder factories to the utf-8 encoding record.
  /** @param {{fatal: boolean}} options */
  name_to_encoding['utf-8'].getEncoder = function(options) {
    return new UTF8Encoder(options);
  };
  /** @param {{fatal: boolean}} options */
  name_to_encoding['utf-8'].getDecoder = function(options) {
    return new UTF8Decoder(options);
  };
  //
  // 9. Legacy single-byte encodings
  //

  /**
   * Decoder for single-byte encodings: bytes 0x00-0x7F map to themselves,
   * bytes 0x80-0xFF are looked up in |index| at position (byte - 0x80).
   *
   * @constructor
   * @param {Array.<number>} index The encoding index.
   * @param {{fatal: boolean}} options
   */
  function SingleByteDecoder(index, options) {
    var fatal = options.fatal;
    /**
     * @param {ByteInputStream} byte_pointer The byte stream to decode.
     * @return {?number} The next code point decoded, or null if not enough
     *     data exists in the input stream to decode a complete code point.
     *     EOF_code_point is returned at end of input.
     */
    this.decode = function(byte_pointer) {
      var bite = byte_pointer.get();
      if (bite === EOF_byte) {
        return EOF_code_point;
      }
      byte_pointer.offset(1);
      if (inRange(bite, 0x00, 0x7F)) {
        return bite;
      }
      var code_point = index[bite - 0x80];
      // A slot that is null or absent (short or sparse index) has no
      // mapping; report an error rather than emitting undefined.
      if (code_point === null || code_point === undefined) {
        return decoderError(fatal);
      }
      return code_point;
    };
  }
  /**
   * Encoder for single-byte encodings: code points 0x00-0x7F map to
   * themselves, others are looked up in |index| and emitted as
   * (pointer + 0x80).
   *
   * @constructor
   * @param {Array.<?number>} index The encoding index.
   * @param {{fatal: boolean}} options Accepted for symmetry with the other
   *     coders; this encoder does not read it.
   */
  function SingleByteEncoder(index, options) {
    /**
     * @param {ByteOutputStream} output_byte_stream Output byte stream.
     * @param {CodePointInputStream} code_point_pointer Input stream.
     * @return {number} The last byte emitted, or EOF_byte at end of input.
     */
    this.encode = function(output_byte_stream, code_point_pointer) {
      var code_point = code_point_pointer.get();
      if (code_point === EOF_code_point) {
        return EOF_byte;
      }
      code_point_pointer.offset(1);
      if (inRange(code_point, 0x0000, 0x007F)) {
        return output_byte_stream.emit(code_point);
      }
      var pointer = indexPointerFor(code_point, index);
      if (pointer === null) {
        // encoderError always throws; `return` makes the exit explicit.
        return encoderError(code_point);
      }
      return output_byte_stream.emit(pointer + 0x80);
    };
  }
  // Attach decoder/encoder factories to every encoding listed under the
  // 'Legacy single-byte encodings' heading, each bound to the index stored
  // under the encoding's name.
  (function() {
    encodings.forEach(function(category) {
      if (category.heading !== 'Legacy single-byte encodings') {
        return;
      }
      category.encodings.forEach(function(encoding) {
        var idx = indexes[encoding.name];
        /** @param {{fatal: boolean}} options */
        encoding.getDecoder = function(options) {
          return new SingleByteDecoder(idx, options);
        };
        /** @param {{fatal: boolean}} options */
        encoding.getEncoder = function(options) {
          return new SingleByteEncoder(idx, options);
        };
      });
    });
  }());
  1157. //
  1158. // 10. Legacy multi-byte Chinese (simplified) encodings
  1159. //
  1160. // 9.1 gbk
  /**
   * Stateful decoder shared by gbk and gb18030. Up to three pending bytes of
   * a partially read sequence are kept between calls (0x00 means "none").
   * @constructor
   * @param {boolean} gb18030 True if decoding gb18030, false otherwise.
   * @param {{fatal: boolean}} options
   */
  function GBKDecoder(gb18030, options) {
    var fatal = options.fatal;
    var /** @type {number} */ gbk_first = 0x00,
        /** @type {number} */ gbk_second = 0x00,
        /** @type {number} */ gbk_third = 0x00;
    /**
     * @param {ByteInputStream} byte_pointer The byte stream to decode.
     * @return {?number} The next code point decoded, or null if not enough
     *     data exists in the input stream to decode a complete code point.
     */
    this.decode = function(byte_pointer) {
      var bite = byte_pointer.get();
      if (bite === EOF_byte) {
        if (gbk_first === 0x00 && gbk_second === 0x00 && gbk_third === 0x00) {
          return EOF_code_point;
        }
        // The stream ended in the middle of a sequence: drop the pending
        // bytes and report the error right here. Returning (instead of
        // falling through) keeps the byte pointer from moving past EOF.
        gbk_first = 0x00;
        gbk_second = 0x00;
        gbk_third = 0x00;
        return decoderError(fatal);
      }
      byte_pointer.offset(1);
      var code_point;

      // Fourth byte of a four-byte sequence.
      if (gbk_third !== 0x00) {
        code_point = null;
        if (inRange(bite, 0x30, 0x39)) {
          code_point = indexGB18030CodePointFor(
              (((gbk_first - 0x81) * 10 + (gbk_second - 0x30)) * 126 +
               (gbk_third - 0x81)) * 10 + bite - 0x30);
        }
        gbk_first = 0x00;
        gbk_second = 0x00;
        gbk_third = 0x00;
        if (code_point === null) {
          // Rewind so the second, third and current bytes are read again.
          byte_pointer.offset(-3);
          return decoderError(fatal);
        }
        return code_point;
      }

      // Third byte of a four-byte sequence.
      if (gbk_second !== 0x00) {
        if (inRange(bite, 0x81, 0xFE)) {
          gbk_third = bite;
          return null;
        }
        // Rewind so the second and current bytes are read again.
        byte_pointer.offset(-2);
        gbk_first = 0x00;
        gbk_second = 0x00;
        return decoderError(fatal);
      }

      // Second byte: either a two-byte trail or the digit that continues a
      // four-byte sequence (only when the gb18030 flag is set).
      if (gbk_first !== 0x00) {
        if (inRange(bite, 0x30, 0x39) && gb18030) {
          gbk_second = bite;
          return null;
        }
        var lead = gbk_first;
        var pointer = null;
        gbk_first = 0x00;
        var offset = bite < 0x7F ? 0x40 : 0x41;
        if (inRange(bite, 0x40, 0x7E) || inRange(bite, 0x80, 0xFE)) {
          pointer = (lead - 0x81) * 190 + (bite - offset);
        }
        code_point = pointer === null ? null :
            indexCodePointFor(pointer, indexes['gbk']);
        if (pointer === null) {
          // Not a valid trail byte: give it back to the stream.
          byte_pointer.offset(-1);
        }
        if (code_point === null) {
          return decoderError(fatal);
        }
        return code_point;
      }

      // No pending state: a fresh byte.
      if (inRange(bite, 0x00, 0x7F)) {
        return bite;
      }
      if (bite === 0x80) {
        return 0x20AC;
      }
      if (inRange(bite, 0x81, 0xFE)) {
        gbk_first = bite;
        return null;
      }
      return decoderError(fatal);
    };
  }
  /**
   * Encoder shared by gbk and gb18030. Code points found in the gbk index
   * become two bytes; with the gb18030 flag set, everything else is encoded
   * as a four-byte sequence derived from the gb18030 ranges pointer.
   * @constructor
   * @param {boolean} gb18030 True if decoding gb18030, false otherwise.
   * @param {{fatal: boolean}} options
   */
  function GBKEncoder(gb18030, options) {
    var fatal = options.fatal;
    /**
     * @param {ByteOutputStream} output_byte_stream Output byte stream.
     * @param {CodePointInputStream} code_point_pointer Input stream.
     * @return {number} The last byte emitted.
     */
    this.encode = function(output_byte_stream, code_point_pointer) {
      var code_point = code_point_pointer.get();
      if (code_point === EOF_code_point) {
        return EOF_byte;
      }
      code_point_pointer.offset(1);
      if (inRange(code_point, 0x0000, 0x007F)) {
        return output_byte_stream.emit(code_point);
      }

      var gbk_pointer = indexPointerFor(code_point, indexes['gbk']);
      if (gbk_pointer !== null) {
        // Two-byte form: 190 trail positions per lead byte; trail values
        // from 0x3F up are shifted by one extra (offset 0x41, not 0x40).
        var lead = div(gbk_pointer, 190) + 0x81;
        var trail = gbk_pointer % 190;
        return output_byte_stream.emit(lead,
                                       trail + (trail < 0x3F ? 0x40 : 0x41));
      }
      if (!gb18030) {
        return encoderError(code_point);
      }

      // Four-byte form: peel the pointer apart as mixed-radix digits
      // (10 * 126 * 10 combinations under each first byte).
      var rest = indexGB18030PointerFor(code_point);
      var byte1 = div(div(div(rest, 10), 126), 10);
      rest -= byte1 * 10 * 126 * 10;
      var byte2 = div(div(rest, 10), 126);
      rest -= byte2 * 10 * 126;
      var byte3 = div(rest, 10);
      var byte4 = rest - byte3 * 10;
      return output_byte_stream.emit(byte1 + 0x81,
                                     byte2 + 0x30,
                                     byte3 + 0x81,
                                     byte4 + 0x30);
    };
  }
  (function() {
    /**
     * Installs the shared GBK encoder/decoder factories on the encoding
     * registered under `name`; `is_gb18030` is forwarded to both classes.
     */
    function installGBKFamily(name, is_gb18030) {
      var encoding = name_to_encoding[name];
      encoding.getEncoder = function(options) {
        return new GBKEncoder(is_gb18030, options);
      };
      encoding.getDecoder = function(options) {
        return new GBKDecoder(is_gb18030, options);
      };
    }

    installGBKFamily('gbk', false);
    // 9.2 gb18030
    installGBKFamily('gb18030', true);
  }());
  1309. // 10.2 hz-gb-2312
  /**
   * Stateful hz-gb-2312 decoder. A "~" byte opens an escape; "~{" and "~}"
   * switch two-byte mode on and off.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function HZGB2312Decoder(options) {
    var fatal = options.fatal;
    var /** @type {boolean} */ hzgb2312 = false,
        /** @type {number} */ hzgb2312_lead = 0x00;
    /**
     * @param {ByteInputStream} byte_pointer The byte stream to decode.
     * @return {?number} The next code point decoded, or null if not enough
     *     data exists in the input stream to decode a complete code point.
     */
    this.decode = function(byte_pointer) {
      var bite = byte_pointer.get();
      if (bite === EOF_byte) {
        if (hzgb2312_lead === 0x00) {
          return EOF_code_point;
        }
        hzgb2312_lead = 0x00;
        return decoderError(fatal);
      }
      byte_pointer.offset(1);

      // Byte following "~".
      if (hzgb2312_lead === 0x7E) {
        hzgb2312_lead = 0x00;
        switch (bite) {
          case 0x7B:
            hzgb2312 = true;
            return null;
          case 0x7D:
            hzgb2312 = false;
            return null;
          case 0x7E:
            return 0x007E;
          case 0x0A:
            return null;
        }
        // Unknown escape: give the byte back and report an error.
        byte_pointer.offset(-1);
        return decoderError(fatal);
      }

      // Trail byte of a two-byte character.
      if (hzgb2312_lead !== 0x00) {
        var lead = hzgb2312_lead;
        hzgb2312_lead = 0x00;
        var code_point = inRange(bite, 0x21, 0x7E) ?
            indexCodePointFor((lead - 1) * 190 + (bite + 0x3F),
                              indexes['gbk']) :
            null;
        if (bite === 0x0A) {
          hzgb2312 = false;
        }
        return (code_point === null) ? decoderError(fatal) : code_point;
      }

      if (bite === 0x7E) {
        hzgb2312_lead = 0x7E;
        return null;
      }

      if (hzgb2312) {
        if (inRange(bite, 0x20, 0x7F)) {
          hzgb2312_lead = bite;
          return null;
        }
        // A line feed leaves two-byte mode; it is still reported as an error.
        if (bite === 0x0A) {
          hzgb2312 = false;
        }
        return decoderError(fatal);
      }

      return inRange(bite, 0x00, 0x7F) ? bite : decoderError(fatal);
    };
  }
  /**
   * Stateful hz-gb-2312 encoder. Emits "~{" / "~}" whenever the input
   * switches between two-byte and ASCII content, re-reading the code point
   * that triggered the switch on the next call.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function HZGB2312Encoder(options) {
    var fatal = options.fatal;
    /** @type {boolean} */
    var hzgb2312 = false;
    /**
     * @param {ByteOutputStream} output_byte_stream Output byte stream.
     * @param {CodePointInputStream} code_point_pointer Input stream.
     * @return {number} The last byte emitted.
     */
    this.encode = function(output_byte_stream, code_point_pointer) {
      var code_point = code_point_pointer.get();
      if (code_point === EOF_code_point) {
        return EOF_byte;
      }
      code_point_pointer.offset(1);

      if (inRange(code_point, 0x0000, 0x007F)) {
        if (hzgb2312) {
          // Leave two-byte mode first; the code point is consumed next call.
          code_point_pointer.offset(-1);
          hzgb2312 = false;
          return output_byte_stream.emit(0x7E, 0x7D);
        }
        if (code_point === 0x007E) {
          // A literal "~" is escaped by doubling it.
          return output_byte_stream.emit(0x7E, 0x7E);
        }
        return output_byte_stream.emit(code_point);
      }

      if (!hzgb2312) {
        // Enter two-byte mode first; the code point is consumed next call.
        code_point_pointer.offset(-1);
        hzgb2312 = true;
        return output_byte_stream.emit(0x7E, 0x7B);
      }

      var pointer = indexPointerFor(code_point, indexes['gbk']);
      if (pointer === null) {
        return encoderError(code_point);
      }
      var lead = div(pointer, 190) + 1;
      var trail = pointer % 190 - 0x3F;
      var representable = inRange(lead, 0x21, 0x7E) &&
                          inRange(trail, 0x21, 0x7E);
      if (!representable) {
        return encoderError(code_point);
      }
      return output_byte_stream.emit(lead, trail);
    };
  }
  // Hook the hz-gb-2312 encoder/decoder factories onto the encoding entry.
  (function(encoding) {
    /** @param {{fatal: boolean}} options */
    encoding.getEncoder = function(options) {
      return new HZGB2312Encoder(options);
    };
    /** @param {{fatal: boolean}} options */
    encoding.getDecoder = function(options) {
      return new HZGB2312Decoder(options);
    };
  }(name_to_encoding['hz-gb-2312']));
  1443. //
  1444. // 11. Legacy multi-byte Chinese (traditional) encodings
  1445. //
  1446. // 11.1 big5
  /**
   * Stateful big5 decoder. Four pointers decode to a base letter followed by
   * a combining mark; the second code point is parked in big5_pending and
   * returned by the next call.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function Big5Decoder(options) {
    var fatal = options.fatal;
    var /** @type {number} */ big5_lead = 0x00,
        /** @type {?number} */ big5_pending = null;
    /**
     * @param {ByteInputStream} byte_pointer The byte stream to decode.
     * @return {?number} The next code point decoded, or null if not enough
     *     data exists in the input stream to decode a complete code point.
     */
    this.decode = function(byte_pointer) {
      // NOTE: Hack to support emitting two code points
      if (big5_pending !== null) {
        var queued = big5_pending;
        big5_pending = null;
        return queued;
      }

      var bite = byte_pointer.get();
      if (bite === EOF_byte) {
        if (big5_lead === 0x00) {
          return EOF_code_point;
        }
        big5_lead = 0x00;
        return decoderError(fatal);
      }
      byte_pointer.offset(1);

      if (big5_lead !== 0x00) {
        var lead = big5_lead;
        big5_lead = 0x00;
        var pointer = null;
        if (inRange(bite, 0x40, 0x7E) || inRange(bite, 0xA1, 0xFE)) {
          var offset = bite < 0x7F ? 0x40 : 0x62;
          pointer = (lead - 0x81) * 157 + (bite - offset);
        }

        // Pointers that expand to a letter plus a combining mark.
        switch (pointer) {
          case 1133:
            big5_pending = 0x0304;
            return 0x00CA;
          case 1135:
            big5_pending = 0x030C;
            return 0x00CA;
          case 1164:
            big5_pending = 0x0304;
            return 0x00EA;
          case 1166:
            big5_pending = 0x030C;
            return 0x00EA;
        }

        if (pointer === null) {
          // Not a valid trail byte: give it back to the stream.
          byte_pointer.offset(-1);
          return decoderError(fatal);
        }
        var code_point = indexCodePointFor(pointer, indexes['big5']);
        return (code_point === null) ? decoderError(fatal) : code_point;
      }

      if (inRange(bite, 0x00, 0x7F)) {
        return bite;
      }
      if (inRange(bite, 0x81, 0xFE)) {
        big5_lead = bite;
        return null;
      }
      return decoderError(fatal);
    };
  }
  /**
   * big5 encoder: ASCII passes through, everything else is looked up in the
   * big5 index and emitted as a lead/trail byte pair.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function Big5Encoder(options) {
    var fatal = options.fatal;
    /**
     * @param {ByteOutputStream} output_byte_stream Output byte stream.
     * @param {CodePointInputStream} code_point_pointer Input stream.
     * @return {number} The last byte emitted.
     */
    this.encode = function(output_byte_stream, code_point_pointer) {
      var code_point = code_point_pointer.get();
      if (code_point === EOF_code_point) {
        return EOF_byte;
      }
      code_point_pointer.offset(1);
      if (inRange(code_point, 0x0000, 0x007F)) {
        return output_byte_stream.emit(code_point);
      }

      var pointer = indexPointerFor(code_point, indexes['big5']);
      if (pointer === null) {
        return encoderError(code_point);
      }

      // NOTE: a check that rejected lead bytes below 0xA1 was present here
      // only as commented-out code; it remains disabled, so such pointers
      // are still encoded.
      var lead = div(pointer, 157) + 0x81;
      var trail = pointer % 157;
      var trail_offset = (trail < 0x3F) ? 0x40 : 0x62;
      return output_byte_stream.emit(lead, trail + trail_offset);
    };
  }
  // Hook the big5 encoder/decoder factories onto the encoding entry.
  (function(encoding) {
    /** @param {{fatal: boolean}} options */
    encoding.getEncoder = function(options) {
      return new Big5Encoder(options);
    };
    /** @param {{fatal: boolean}} options */
    encoding.getDecoder = function(options) {
      return new Big5Decoder(options);
    };
  }(name_to_encoding['big5']));
  1561. //
  1562. // 12. Legacy multi-byte Japanese encodings
  1563. //
  1564. // 12.1 euc-jp
  /**
   * Stateful euc-jp decoder. eucjp_first holds a pending lead byte;
   * eucjp_second holds the middle byte of a three-byte (0x8F-prefixed)
   * sequence.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function EUCJPDecoder(options) {
    var fatal = options.fatal;
    var /** @type {number} */ eucjp_first = 0x00,
        /** @type {number} */ eucjp_second = 0x00;

    /**
     * Resolves a lead/trail pair against the named index. A trail byte
     * outside 0xA1-0xFE is handed back to the stream before the error.
     * @param {number} lead The pending lead byte.
     * @param {number} bite The current byte.
     * @param {string} index_name Key into `indexes`.
     * @param {ByteInputStream} byte_pointer The byte stream being decoded.
     * @return {number} The code point, or the decoder error result.
     */
    function resolvePair(lead, bite, index_name, byte_pointer) {
      var trail_ok = inRange(bite, 0xA1, 0xFE);
      var code_point = null;
      if (inRange(lead, 0xA1, 0xFE) && trail_ok) {
        code_point = indexCodePointFor((lead - 0xA1) * 94 + bite - 0xA1,
                                       indexes[index_name]);
      }
      if (!trail_ok) {
        byte_pointer.offset(-1);
      }
      return (code_point === null) ? decoderError(fatal) : code_point;
    }

    /**
     * @param {ByteInputStream} byte_pointer The byte stream to decode.
     * @return {?number} The next code point decoded, or null if not enough
     *     data exists in the input stream to decode a complete code point.
     */
    this.decode = function(byte_pointer) {
      var bite = byte_pointer.get();
      if (bite === EOF_byte) {
        if (eucjp_first === 0x00 && eucjp_second === 0x00) {
          return EOF_code_point;
        }
        eucjp_first = 0x00;
        eucjp_second = 0x00;
        return decoderError(fatal);
      }
      byte_pointer.offset(1);

      var lead;
      // Final byte of a three-byte sequence: jis0212 lookup.
      if (eucjp_second !== 0x00) {
        lead = eucjp_second;
        eucjp_second = 0x00;
        return resolvePair(lead, bite, 'jis0212', byte_pointer);
      }
      // 0x8E prefix: half-width Katakana.
      if (eucjp_first === 0x8E && inRange(bite, 0xA1, 0xDF)) {
        eucjp_first = 0x00;
        return 0xFF61 + bite - 0xA1;
      }
      // 0x8F prefix: remember the middle byte and wait for the last one.
      if (eucjp_first === 0x8F && inRange(bite, 0xA1, 0xFE)) {
        eucjp_first = 0x00;
        eucjp_second = bite;
        return null;
      }
      // Ordinary two-byte sequence: jis0208 lookup.
      if (eucjp_first !== 0x00) {
        lead = eucjp_first;
        eucjp_first = 0x00;
        return resolvePair(lead, bite, 'jis0208', byte_pointer);
      }

      if (inRange(bite, 0x00, 0x7F)) {
        return bite;
      }
      if (bite === 0x8E || bite === 0x8F || inRange(bite, 0xA1, 0xFE)) {
        eucjp_first = bite;
        return null;
      }
      return decoderError(fatal);
    };
  }
  /**
   * euc-jp encoder: ASCII passes through, U+00A5 and U+203E map to 0x5C and
   * 0x7E, half-width Katakana gets a 0x8E prefix, everything else goes
   * through the jis0208 index.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function EUCJPEncoder(options) {
    var fatal = options.fatal;
    /**
     * @param {ByteOutputStream} output_byte_stream Output byte stream.
     * @param {CodePointInputStream} code_point_pointer Input stream.
     * @return {number} The last byte emitted.
     */
    this.encode = function(output_byte_stream, code_point_pointer) {
      var code_point = code_point_pointer.get();
      if (code_point === EOF_code_point) {
        return EOF_byte;
      }
      code_point_pointer.offset(1);

      if (inRange(code_point, 0x0000, 0x007F)) {
        return output_byte_stream.emit(code_point);
      }
      switch (code_point) {
        case 0x00A5:
          return output_byte_stream.emit(0x5C);
        case 0x203E:
          return output_byte_stream.emit(0x7E);
      }
      if (inRange(code_point, 0xFF61, 0xFF9F)) {
        return output_byte_stream.emit(0x8E, code_point - 0xFF61 + 0xA1);
      }

      var pointer = indexPointerFor(code_point, indexes['jis0208']);
      if (pointer === null) {
        return encoderError(code_point);
      }
      // 94 trail positions per lead byte; both bytes start at 0xA1.
      return output_byte_stream.emit(div(pointer, 94) + 0xA1,
                                     pointer % 94 + 0xA1);
    };
  }
  // Hook the euc-jp encoder/decoder factories onto the encoding entry.
  (function(encoding) {
    /** @param {{fatal: boolean}} options */
    encoding.getEncoder = function(options) {
      return new EUCJPEncoder(options);
    };
    /** @param {{fatal: boolean}} options */
    encoding.getDecoder = function(options) {
      return new EUCJPDecoder(options);
    };
  }(name_to_encoding['euc-jp']));
  1687. // 12.2 iso-2022-jp
  /**
   * Stateful iso-2022-jp decoder. Escape sequences (introduced by 0x1B)
   * switch between ASCII, two-byte (jis0208 or jis0212) and Katakana modes.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function ISO2022JPDecoder(options) {
    var fatal = options.fatal;
    /** @enum */
    var state = {
      ASCII: 0,
      escape_start: 1,
      escape_middle: 2,
      escape_final: 3,
      lead: 4,
      trail: 5,
      Katakana: 6
    };
    var /** @type {number} */ iso2022jp_state = state.ASCII,
        /** @type {boolean} */ iso2022jp_jis0212 = false,
        /** @type {number} */ iso2022jp_lead = 0x00;
    /**
     * @param {ByteInputStream} byte_pointer The byte stream to decode.
     * @return {?number} The next code point decoded, or null if not enough
     *     data exists in the input stream to decode a complete code point.
     */
    this.decode = function(byte_pointer) {
      var bite = byte_pointer.get();
      var at_eof = (bite === EOF_byte);
      // EOF is not consumed; the rewinds below account for that.
      if (!at_eof) {
        byte_pointer.offset(1);
      }

      switch (iso2022jp_state) {
        default:
        case state.ASCII:
          if (bite === 0x1B) {
            iso2022jp_state = state.escape_start;
            return null;
          }
          if (inRange(bite, 0x00, 0x7F)) {
            return bite;
          }
          return at_eof ? EOF_code_point : decoderError(fatal);

        case state.escape_start:
          if (bite === 0x24 || bite === 0x28) {
            iso2022jp_lead = bite;
            iso2022jp_state = state.escape_middle;
            return null;
          }
          // Invalid escape: re-read the offending byte as ASCII.
          if (!at_eof) {
            byte_pointer.offset(-1);
          }
          iso2022jp_state = state.ASCII;
          return decoderError(fatal);

        case state.escape_middle:
          var lead = iso2022jp_lead;
          iso2022jp_lead = 0x00;
          if (lead === 0x24) {
            if (bite === 0x40 || bite === 0x42) {
              iso2022jp_jis0212 = false;
              iso2022jp_state = state.lead;
              return null;
            }
            if (bite === 0x28) {
              iso2022jp_state = state.escape_final;
              return null;
            }
          } else if (lead === 0x28) {
            if (bite === 0x42 || bite === 0x4A) {
              iso2022jp_state = state.ASCII;
              return null;
            }
            if (bite === 0x49) {
              iso2022jp_state = state.Katakana;
              return null;
            }
          }
          // Invalid escape: rewind over the bytes read after 0x1B.
          byte_pointer.offset(at_eof ? -1 : -2);
          iso2022jp_state = state.ASCII;
          return decoderError(fatal);

        case state.escape_final:
          if (bite === 0x44) {
            iso2022jp_jis0212 = true;
            iso2022jp_state = state.lead;
            return null;
          }
          // Invalid escape: rewind over the bytes read after 0x1B.
          byte_pointer.offset(at_eof ? -2 : -3);
          iso2022jp_state = state.ASCII;
          return decoderError(fatal);

        case state.lead:
          if (bite === 0x0A) {
            // A line feed drops back to ASCII; 0x000A is passed to
            // decoderError as its second argument.
            iso2022jp_state = state.ASCII;
            return decoderError(fatal, 0x000A);
          }
          if (bite === 0x1B) {
            iso2022jp_state = state.escape_start;
            return null;
          }
          if (at_eof) {
            return EOF_code_point;
          }
          iso2022jp_lead = bite;
          iso2022jp_state = state.trail;
          return null;

        case state.trail:
          iso2022jp_state = state.lead;
          if (at_eof) {
            return decoderError(fatal);
          }
          var code_point = null;
          if (inRange(iso2022jp_lead, 0x21, 0x7E) &&
              inRange(bite, 0x21, 0x7E)) {
            var pointer = (iso2022jp_lead - 0x21) * 94 + bite - 0x21;
            var index_name =
                (iso2022jp_jis0212 === false) ? 'jis0208' : 'jis0212';
            code_point = indexCodePointFor(pointer, indexes[index_name]);
          }
          return (code_point === null) ? decoderError(fatal) : code_point;

        case state.Katakana:
          if (bite === 0x1B) {
            iso2022jp_state = state.escape_start;
            return null;
          }
          if (inRange(bite, 0x21, 0x5F)) {
            return 0xFF61 + bite - 0x21;
          }
          return at_eof ? EOF_code_point : decoderError(fatal);
      }
    };
  }
  /**
   * Stateful iso-2022-jp encoder. Before a code point that needs a different
   * mode, the matching escape sequence is emitted and the code point is
   * re-read on the next call.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function ISO2022JPEncoder(options) {
    var fatal = options.fatal;
    /** @enum */
    var state = {
      ASCII: 0,
      lead: 1,
      Katakana: 2
    };
    var /** @type {number} */ iso2022jp_state = state.ASCII;
    /**
     * @param {ByteOutputStream} output_byte_stream Output byte stream.
     * @param {CodePointInputStream} code_point_pointer Input stream.
     * @return {number} The last byte emitted.
     */
    this.encode = function(output_byte_stream, code_point_pointer) {
      var code_point = code_point_pointer.get();
      if (code_point === EOF_code_point) {
        return EOF_byte;
      }
      code_point_pointer.offset(1);

      // Code points written as a single byte in ASCII mode: switch back to
      // ASCII first (ESC ( B) and re-read the code point.
      if ((inRange(code_point, 0x0000, 0x007F) ||
           code_point === 0x00A5 || code_point === 0x203E) &&
          iso2022jp_state !== state.ASCII) {
        code_point_pointer.offset(-1);
        iso2022jp_state = state.ASCII;
        return output_byte_stream.emit(0x1B, 0x28, 0x42);
      }
      if (inRange(code_point, 0x0000, 0x007F)) {
        return output_byte_stream.emit(code_point);
      }
      if (code_point === 0x00A5) {
        return output_byte_stream.emit(0x5C);
      }
      if (code_point === 0x203E) {
        return output_byte_stream.emit(0x7E);
      }

      // Half-width Katakana: switch mode first (ESC ( I), then one byte each.
      if (inRange(code_point, 0xFF61, 0xFF9F) &&
          iso2022jp_state !== state.Katakana) {
        code_point_pointer.offset(-1);
        iso2022jp_state = state.Katakana;
        return output_byte_stream.emit(0x1B, 0x28, 0x49);
      }
      if (inRange(code_point, 0xFF61, 0xFF9F)) {
        // Inverse of the decoder's (0xFF61 + byte - 0x21), so U+FF61..U+FF9F
        // become bytes 0x21..0x5F. Subtracting 0x21 here (as the code did
        // before) produced negative byte values.
        return output_byte_stream.emit(code_point - 0xFF61 + 0x21);
      }

      // Everything else is two-byte jis0208: switch mode first (ESC $ B).
      if (iso2022jp_state !== state.lead) {
        code_point_pointer.offset(-1);
        iso2022jp_state = state.lead;
        return output_byte_stream.emit(0x1B, 0x24, 0x42);
      }
      var pointer = indexPointerFor(code_point, indexes['jis0208']);
      if (pointer === null) {
        return encoderError(code_point);
      }
      var lead = div(pointer, 94) + 0x21;
      var trail = pointer % 94 + 0x21;
      return output_byte_stream.emit(lead, trail);
    };
  }
  // Hook the iso-2022-jp encoder/decoder factories onto the encoding entry.
  (function(encoding) {
    /** @param {{fatal: boolean}} options */
    encoding.getEncoder = function(options) {
      return new ISO2022JPEncoder(options);
    };
    /** @param {{fatal: boolean}} options */
    encoding.getDecoder = function(options) {
      return new ISO2022JPDecoder(options);
    };
  }(name_to_encoding['iso-2022-jp']));
  1900. // 12.3 shift_jis
  /**
   * Stateful shift_jis decoder; shiftjis_lead holds a pending lead byte
   * (0x00 means "none").
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function ShiftJISDecoder(options) {
    var fatal = options.fatal;
    var /** @type {number} */ shiftjis_lead = 0x00;
    /**
     * @param {ByteInputStream} byte_pointer The byte stream to decode.
     * @return {?number} The next code point decoded, or null if not enough
     *     data exists in the input stream to decode a complete code point.
     */
    this.decode = function(byte_pointer) {
      var bite = byte_pointer.get();
      if (bite === EOF_byte) {
        if (shiftjis_lead === 0x00) {
          return EOF_code_point;
        }
        shiftjis_lead = 0x00;
        return decoderError(fatal);
      }
      byte_pointer.offset(1);

      if (shiftjis_lead !== 0x00) {
        var lead = shiftjis_lead;
        shiftjis_lead = 0x00;
        var is_trail = inRange(bite, 0x40, 0x7E) || inRange(bite, 0x80, 0xFC);
        if (!is_trail) {
          // Not a valid trail byte: give it back to the stream.
          byte_pointer.offset(-1);
          return decoderError(fatal);
        }
        var offset = (bite < 0x7F) ? 0x40 : 0x41;
        var lead_offset = (lead < 0xA0) ? 0x81 : 0xC1;
        var code_point = indexCodePointFor(
            (lead - lead_offset) * 188 + bite - offset, indexes['jis0208']);
        return (code_point === null) ? decoderError(fatal) : code_point;
      }

      // Bytes 0x00-0x80 are returned unchanged.
      if (inRange(bite, 0x00, 0x80)) {
        return bite;
      }
      // Single-byte half-width Katakana.
      if (inRange(bite, 0xA1, 0xDF)) {
        return 0xFF61 + bite - 0xA1;
      }
      if (inRange(bite, 0x81, 0x9F) || inRange(bite, 0xE0, 0xFC)) {
        shiftjis_lead = bite;
        return null;
      }
      return decoderError(fatal);
    };
  }
  /**
   * shift_jis encoder: code points up to U+0080 pass through, U+00A5 and
   * U+203E map to 0x5C and 0x7E, half-width Katakana becomes one byte, and
   * everything else goes through the jis0208 index.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function ShiftJISEncoder(options) {
    var fatal = options.fatal;
    /**
     * @param {ByteOutputStream} output_byte_stream Output byte stream.
     * @param {CodePointInputStream} code_point_pointer Input stream.
     * @return {number} The last byte emitted.
     */
    this.encode = function(output_byte_stream, code_point_pointer) {
      var code_point = code_point_pointer.get();
      if (code_point === EOF_code_point) {
        return EOF_byte;
      }
      code_point_pointer.offset(1);

      if (inRange(code_point, 0x0000, 0x0080)) {
        return output_byte_stream.emit(code_point);
      }
      switch (code_point) {
        case 0x00A5:
          return output_byte_stream.emit(0x5C);
        case 0x203E:
          return output_byte_stream.emit(0x7E);
      }
      if (inRange(code_point, 0xFF61, 0xFF9F)) {
        return output_byte_stream.emit(code_point - 0xFF61 + 0xA1);
      }

      var pointer = indexPointerFor(code_point, indexes['jis0208']);
      if (pointer === null) {
        return encoderError(code_point);
      }
      // 188 trail positions per lead byte. Lead values from 0x1F up and
      // trail values from 0x3F up use the higher of their two offsets.
      var lead = div(pointer, 188);
      var trail = pointer % 188;
      var lead_byte = lead + (lead < 0x1F ? 0x81 : 0xC1);
      var trail_byte = trail + (trail < 0x3F ? 0x40 : 0x41);
      return output_byte_stream.emit(lead_byte, trail_byte);
    };
  }
  // Hook the shift_jis encoder/decoder factories onto the encoding entry.
  (function(encoding) {
    /** @param {{fatal: boolean}} options */
    encoding.getEncoder = function(options) {
      return new ShiftJISEncoder(options);
    };
    /** @param {{fatal: boolean}} options */
    encoding.getDecoder = function(options) {
      return new ShiftJISDecoder(options);
    };
  }(name_to_encoding['shift_jis']));
  2000. //
  2001. // 13. Legacy multi-byte Korean encodings
  2002. //
  2003. // 13.1 euc-kr
  /**
   * Stateful euc-kr decoder; euckr_lead holds a pending lead byte
   * (0x00 means "none").
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function EUCKRDecoder(options) {
    var fatal = options.fatal;
    var /** @type {number} */ euckr_lead = 0x00;
    /**
     * @param {ByteInputStream} byte_pointer The byte stream to decode.
     * @return {?number} The next code point decoded, or null if not enough
     *     data exists in the input stream to decode a complete code point.
     */
    this.decode = function(byte_pointer) {
      var bite = byte_pointer.get();
      if (bite === EOF_byte) {
        if (euckr_lead === 0) {
          return EOF_code_point;
        }
        euckr_lead = 0x00;
        return decoderError(fatal);
      }
      byte_pointer.offset(1);

      if (euckr_lead !== 0x00) {
        var lead = euckr_lead;
        euckr_lead = 0x00;
        var pointer = null;
        // Trail positions per lead byte in the 0x81-0xC6 lead range:
        // 26 (0x41-0x5A) + 26 (0x61-0x7A) + 126 (0x81-0xFE).
        var row_width = 26 + 26 + 126;

        if (inRange(lead, 0x81, 0xC6)) {
          var row_start = row_width * (lead - 0x81);
          if (inRange(bite, 0x41, 0x5A)) {
            pointer = row_start + bite - 0x41;
          } else if (inRange(bite, 0x61, 0x7A)) {
            pointer = row_start + 26 + bite - 0x61;
          } else if (inRange(bite, 0x81, 0xFE)) {
            pointer = row_start + 26 + 26 + bite - 0x81;
          }
        }
        // Leads 0xC7-0xFD have 94 trail positions each (0xA1-0xFE), placed
        // after all of the rows above.
        if (inRange(lead, 0xC7, 0xFD) && inRange(bite, 0xA1, 0xFE)) {
          pointer = row_width * (0xC7 - 0x81) + (lead - 0xC7) * 94 +
              (bite - 0xA1);
        }

        if (pointer === null) {
          // Not a valid trail byte: give it back to the stream.
          byte_pointer.offset(-1);
          return decoderError(fatal);
        }
        var code_point = indexCodePointFor(pointer, indexes['euc-kr']);
        return (code_point === null) ? decoderError(fatal) : code_point;
      }

      if (inRange(bite, 0x00, 0x7F)) {
        return bite;
      }
      if (inRange(bite, 0x81, 0xFD)) {
        euckr_lead = bite;
        return null;
      }
      return decoderError(fatal);
    };
  }
  /**
   * euc-kr encoder: ASCII passes through, everything else is looked up in
   * the euc-kr index and emitted as a lead/trail byte pair.
   * @constructor
   * @param {{fatal: boolean}} options
   */
  function EUCKREncoder(options) {
    var fatal = options.fatal;
    /**
     * @param {ByteOutputStream} output_byte_stream Output byte stream.
     * @param {CodePointInputStream} code_point_pointer Input stream.
     * @return {number} The last byte emitted.
     */
    this.encode = function(output_byte_stream, code_point_pointer) {
      var code_point = code_point_pointer.get();
      if (code_point === EOF_code_point) {
        return EOF_byte;
      }
      code_point_pointer.offset(1);
      if (inRange(code_point, 0x0000, 0x007F)) {
        return output_byte_stream.emit(code_point);
      }

      var pointer = indexPointerFor(code_point, indexes['euc-kr']);
      if (pointer === null) {
        return encoderError(code_point);
      }

      var row_width = 26 + 26 + 126;
      var wide_rows_end = row_width * (0xC7 - 0x81);
      var lead, trail;

      if (pointer < wide_rows_end) {
        // Rows of 178 positions, split over three trail-byte ranges that
        // start at 0x41, 0x61 and 0x81 respectively.
        lead = div(pointer, row_width) + 0x81;
        trail = pointer % row_width;
        var offset;
        if (trail < 26) {
          offset = 0x41;
        } else if (trail < 26 + 26) {
          offset = 0x47;
        } else {
          offset = 0x4D;
        }
        return output_byte_stream.emit(lead, trail + offset);
      }

      // Remaining pointers: rows of 94 positions starting at lead 0xC7.
      pointer -= wide_rows_end;
      lead = div(pointer, 94) + 0xC7;
      trail = pointer % 94 + 0xA1;
      return output_byte_stream.emit(lead, trail);
    };
  }
  2101. /** @param {{fatal: boolean}} options */
  2102. name_to_encoding['euc-kr'].getEncoder = function(options) {
  2103. return new EUCKREncoder(options);
  2104. };
  2105. /** @param {{fatal: boolean}} options */
  2106. name_to_encoding['euc-kr'].getDecoder = function(options) {
  2107. return new EUCKRDecoder(options);
  2108. };
  2109. //
  2110. // 14. Legacy miscellaneous encodings
  2111. //
  2112. // 14.1 replacement
  2113. // Not needed - API throws TypeError
  2114. // 14.2 utf-16
  2115. /**
  2116. * @constructor
  2117. * @param {boolean} utf16_be True if big-endian, false if little-endian.
  2118. * @param {{fatal: boolean}} options
  2119. */
  2120. function UTF16Decoder(utf16_be, options) {
  2121. var fatal = options.fatal;
  2122. var /** @type {?number} */ utf16_lead_byte = null,
  2123. /** @type {?number} */ utf16_lead_surrogate = null;
  2124. /**
  2125. * @param {ByteInputStream} byte_pointer The byte stream to decode.
  2126. * @return {?number} The next code point decoded, or null if not enough
  2127. * data exists in the input stream to decode a complete code point.
  2128. */
  2129. this.decode = function(byte_pointer) {
  2130. var bite = byte_pointer.get();
  2131. if (bite === EOF_byte && utf16_lead_byte === null &&
  2132. utf16_lead_surrogate === null) {
  2133. return EOF_code_point;
  2134. }
  2135. if (bite === EOF_byte && (utf16_lead_byte !== null ||
  2136. utf16_lead_surrogate !== null)) {
  2137. return decoderError(fatal);
  2138. }
  2139. byte_pointer.offset(1);
  2140. if (utf16_lead_byte === null) {
  2141. utf16_lead_byte = bite;
  2142. return null;
  2143. }
  2144. var code_point;
  2145. if (utf16_be) {
  2146. code_point = (utf16_lead_byte << 8) + bite;
  2147. } else {
  2148. code_point = (bite << 8) + utf16_lead_byte;
  2149. }
  2150. utf16_lead_byte = null;
  2151. if (utf16_lead_surrogate !== null) {
  2152. var lead_surrogate = utf16_lead_surrogate;
  2153. utf16_lead_surrogate = null;
  2154. if (inRange(code_point, 0xDC00, 0xDFFF)) {
  2155. return 0x10000 + (lead_surrogate - 0xD800) * 0x400 +
  2156. (code_point - 0xDC00);
  2157. }
  2158. byte_pointer.offset(-2);
  2159. return decoderError(fatal);
  2160. }
  2161. if (inRange(code_point, 0xD800, 0xDBFF)) {
  2162. utf16_lead_surrogate = code_point;
  2163. return null;
  2164. }
  2165. if (inRange(code_point, 0xDC00, 0xDFFF)) {
  2166. return decoderError(fatal);
  2167. }
  2168. return code_point;
  2169. };
  2170. }
  2171. /**
  2172. * @constructor
  2173. * @param {boolean} utf16_be True if big-endian, false if little-endian.
  2174. * @param {{fatal: boolean}} options
  2175. */
  2176. function UTF16Encoder(utf16_be, options) {
  2177. var fatal = options.fatal;
  2178. /**
  2179. * @param {ByteOutputStream} output_byte_stream Output byte stream.
  2180. * @param {CodePointInputStream} code_point_pointer Input stream.
  2181. * @return {number} The last byte emitted.
  2182. */
  2183. this.encode = function(output_byte_stream, code_point_pointer) {
  2184. /**
  2185. * @param {number} code_unit
  2186. * @return {number} last byte emitted
  2187. */
  2188. function convert_to_bytes(code_unit) {
  2189. var byte1 = code_unit >> 8;
  2190. var byte2 = code_unit & 0x00FF;
  2191. if (utf16_be) {
  2192. return output_byte_stream.emit(byte1, byte2);
  2193. }
  2194. return output_byte_stream.emit(byte2, byte1);
  2195. }
  2196. var code_point = code_point_pointer.get();
  2197. if (code_point === EOF_code_point) {
  2198. return EOF_byte;
  2199. }
  2200. code_point_pointer.offset(1);
  2201. if (inRange(code_point, 0xD800, 0xDFFF)) {
  2202. encoderError(code_point);
  2203. }
  2204. if (code_point <= 0xFFFF) {
  2205. return convert_to_bytes(code_point);
  2206. }
  2207. var lead = div((code_point - 0x10000), 0x400) + 0xD800;
  2208. var trail = ((code_point - 0x10000) % 0x400) + 0xDC00;
  2209. convert_to_bytes(lead);
  2210. return convert_to_bytes(trail);
  2211. };
  2212. }
  2213. // 14.3 utf-16be
  2214. /** @param {{fatal: boolean}} options */
  2215. name_to_encoding['utf-16be'].getEncoder = function(options) {
  2216. return new UTF16Encoder(true, options);
  2217. };
  2218. /** @param {{fatal: boolean}} options */
  2219. name_to_encoding['utf-16be'].getDecoder = function(options) {
  2220. return new UTF16Decoder(true, options);
  2221. };
  2222. // 14.4 utf-16le
  2223. /** @param {{fatal: boolean}} options */
  2224. name_to_encoding['utf-16le'].getEncoder = function(options) {
  2225. return new UTF16Encoder(false, options);
  2226. };
  2227. /** @param {{fatal: boolean}} options */
  2228. name_to_encoding['utf-16le'].getDecoder = function(options) {
  2229. return new UTF16Decoder(false, options);
  2230. };
  2231. // 14.5 x-user-defined
  2232. // TODO: Implement this encoding.
  2233. // NOTE: currently unused
  2234. /**
  2235. * @param {string} label The encoding label.
  2236. * @param {ByteInputStream} input_stream The byte stream to test.
  2237. */
  2238. function detectEncoding(label, input_stream) {
  2239. if (input_stream.match([0xFF, 0xFE])) {
  2240. input_stream.offset(2);
  2241. return 'utf-16le';
  2242. }
  2243. if (input_stream.match([0xFE, 0xFF])) {
  2244. input_stream.offset(2);
  2245. return 'utf-16be';
  2246. }
  2247. if (input_stream.match([0xEF, 0xBB, 0xBF])) {
  2248. input_stream.offset(3);
  2249. return 'utf-8';
  2250. }
  2251. return label;
  2252. }
// Public API of this module: the encoder and decoder constructors.
exports.TextEncoder = TextEncoder;
exports.TextDecoder = TextDecoder;
// getEncoding is exported under the name encodingExists.
// NOTE(review): presumably callers treat its result as a truthy/falsy
// existence check -- confirm against getEncoding's return value.
exports.encodingExists = getEncoding;