6/18/ds/18js/lib/utf16.js

139 lines
4.6 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { inRange, decoderError, end_of_stream, finished, convertCodeUnitToBytes } from './text_decoder_utils.js'
// 15.2.1 shared utf-16 decoder
/**
* @implements {Decoder}
*/
export class UTF16Decoder {
/**
* @param {boolean} utf16_be True if big-endian, false if little-endian.
* @param {{fatal: boolean}} options
*/
constructor(utf16_be, options) {
const { fatal } = options
this.utf16_be = utf16_be
this.fatal = fatal
this.utf16_lead_byte = null
this.utf16_lead_surrogate = null
}
/**
* @param {Stream} stream The stream of bytes being decoded.
* @param {number} bite The next byte read from the stream.
*/
handler(stream, bite) {
// 1. If byte is end-of-stream and either utf-16 lead byte or
// utf-16 lead surrogate is not null, set utf-16 lead byte and
// utf-16 lead surrogate to null, and return error.
if (bite === end_of_stream && (this.utf16_lead_byte !== null ||
this.utf16_lead_surrogate !== null)) {
return decoderError(this.fatal)
}
// 2. If byte is end-of-stream and utf-16 lead byte and utf-16
// lead surrogate are null, return finished.
if (bite === end_of_stream && this.utf16_lead_byte === null &&
this.utf16_lead_surrogate === null) {
return finished
}
// 3. If utf-16 lead byte is null, set utf-16 lead byte to byte
// and return continue.
if (this.utf16_lead_byte === null) {
this.utf16_lead_byte = bite
return null
}
// 4. Let code unit be the result of:
let code_unit
if (this.utf16_be) {
// utf-16be decoder flag is set
// (utf-16 lead byte << 8) + byte.
code_unit = (this.utf16_lead_byte << 8) + bite
} else {
// utf-16be decoder flag is unset
// (byte << 8) + utf-16 lead byte.
code_unit = (bite << 8) + this.utf16_lead_byte
}
// Then set utf-16 lead byte to null.
this.utf16_lead_byte = null
// 5. If utf-16 lead surrogate is not null, let lead surrogate
// be utf-16 lead surrogate, set utf-16 lead surrogate to null,
// and then run these substeps:
if (this.utf16_lead_surrogate !== null) {
const lead_surrogate = this.utf16_lead_surrogate
this.utf16_lead_surrogate = null
// 1. If code unit is in the range U+DC00 to U+DFFF,
// inclusive, return a code point whose value is 0x10000 +
// ((lead surrogate 0xD800) << 10) + (code unit 0xDC00).
if (inRange(code_unit, 0xDC00, 0xDFFF)) {
return 0x10000 + (lead_surrogate - 0xD800) * 0x400 +
(code_unit - 0xDC00)
}
// 2. Prepend the sequence resulting of converting code unit
// to bytes using utf-16be decoder flag to stream and return
// error.
stream.prepend(convertCodeUnitToBytes(code_unit, this.utf16_be))
return decoderError(this.fatal)
}
// 6. If code unit is in the range U+D800 to U+DBFF, inclusive,
// set utf-16 lead surrogate to code unit and return continue.
if (inRange(code_unit, 0xD800, 0xDBFF)) {
this.utf16_lead_surrogate = code_unit
return null
}
// 7. If code unit is in the range U+DC00 to U+DFFF, inclusive,
// return error.
if (inRange(code_unit, 0xDC00, 0xDFFF))
return decoderError(this.fatal)
// 8. Return code point code unit.
return code_unit
}
}
// 15.2.2 shared utf-16 encoder
/**
* @implements {Encoder}
*/
export class UTF16Encoder {
/**
* @param {boolean} [utf16_be] True if big-endian, false if little-endian.
*/
constructor(utf16_be = false) {
this.utf16_be = utf16_be
}
/**
* @param {Stream} stream Input stream.
* @param {number} code_point Next code point read from the stream.
*/
handler(stream, code_point) {
// 1. If code point is end-of-stream, return finished.
if (code_point === end_of_stream)
return finished
// 2. If code point is in the range U+0000 to U+FFFF, inclusive,
// return the sequence resulting of converting code point to
// bytes using utf-16be encoder flag.
if (inRange(code_point, 0x0000, 0xFFFF))
return convertCodeUnitToBytes(code_point, this.utf16_be)
// 3. Let lead be ((code point 0x10000) >> 10) + 0xD800,
// converted to bytes using utf-16be encoder flag.
const lead = convertCodeUnitToBytes(
((code_point - 0x10000) >> 10) + 0xD800, this.utf16_be)
// 4. Let trail be ((code point 0x10000) & 0x3FF) + 0xDC00,
// converted to bytes using utf-16be encoder flag.
const trail = convertCodeUnitToBytes(
((code_point - 0x10000) & 0x3FF) + 0xDC00, this.utf16_be)
// 5. Return a byte sequence of lead followed by trail.
return lead.concat(trail)
}
}