utf16.js
5.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"use strict";
var Buffer = require("safer-buffer").Buffer;
// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
// == UTF16-BE codec. ==========================================================
exports.utf16be = Utf16BECodec;
function Utf16BECodec() {
}
Utf16BECodec.prototype.encoder = Utf16BEEncoder;
Utf16BECodec.prototype.decoder = Utf16BEDecoder;
Utf16BECodec.prototype.bomAware = true;
// -- Encoding
function Utf16BEEncoder() {
}
Utf16BEEncoder.prototype.write = function(str) {
var buf = Buffer.from(str, 'ucs2');
for (var i = 0; i < buf.length; i += 2) {
var tmp = buf[i]; buf[i] = buf[i+1]; buf[i+1] = tmp;
}
return buf;
}
Utf16BEEncoder.prototype.end = function() {
}
// -- Decoding
function Utf16BEDecoder() {
this.overflowByte = -1;
}
Utf16BEDecoder.prototype.write = function(buf) {
if (buf.length == 0)
return '';
var buf2 = Buffer.alloc(buf.length + 1),
i = 0, j = 0;
if (this.overflowByte !== -1) {
buf2[0] = buf[0];
buf2[1] = this.overflowByte;
i = 1; j = 2;
}
for (; i < buf.length-1; i += 2, j+= 2) {
buf2[j] = buf[i+1];
buf2[j+1] = buf[i];
}
this.overflowByte = (i == buf.length-1) ? buf[buf.length-1] : -1;
return buf2.slice(0, j).toString('ucs2');
}
Utf16BEDecoder.prototype.end = function() {
this.overflowByte = -1;
}
// == UTF-16 codec =============================================================
// Decoder chooses automatically from UTF-16LE and UTF-16BE using BOM and space-based heuristic.
// Defaults to UTF-16LE, as it's prevalent and default in Node.
// http://en.wikipedia.org/wiki/UTF-16 and http://encoding.spec.whatwg.org/#utf-16le
// Decoder default can be changed: iconv.decode(buf, 'utf16', {defaultEncoding: 'utf-16be'});
// Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).
exports.utf16 = Utf16Codec;
function Utf16Codec(codecOptions, iconv) {
this.iconv = iconv;
}
Utf16Codec.prototype.encoder = Utf16Encoder;
Utf16Codec.prototype.decoder = Utf16Decoder;
// -- Encoding (pass-through)
function Utf16Encoder(options, codec) {
options = options || {};
if (options.addBOM === undefined)
options.addBOM = true;
this.encoder = codec.iconv.getEncoder('utf-16le', options);
}
Utf16Encoder.prototype.write = function(str) {
return this.encoder.write(str);
}
Utf16Encoder.prototype.end = function() {
return this.encoder.end();
}
// -- Decoding
function Utf16Decoder(options, codec) {
this.decoder = null;
this.initialBufs = [];
this.initialBufsLen = 0;
this.options = options || {};
this.iconv = codec.iconv;
}
Utf16Decoder.prototype.write = function(buf) {
if (!this.decoder) {
// Codec is not chosen yet. Accumulate initial bytes.
this.initialBufs.push(buf);
this.initialBufsLen += buf.length;
if (this.initialBufsLen < 16) // We need more bytes to use space heuristic (see below)
return '';
// We have enough bytes -> detect endianness.
var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding);
this.decoder = this.iconv.getDecoder(encoding, this.options);
var resStr = '';
for (var i = 0; i < this.initialBufs.length; i++)
resStr += this.decoder.write(this.initialBufs[i]);
this.initialBufs.length = this.initialBufsLen = 0;
return resStr;
}
return this.decoder.write(buf);
}
Utf16Decoder.prototype.end = function() {
if (!this.decoder) {
var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding);
this.decoder = this.iconv.getDecoder(encoding, this.options);
var resStr = '';
for (var i = 0; i < this.initialBufs.length; i++)
resStr += this.decoder.write(this.initialBufs[i]);
var trail = this.decoder.end();
if (trail)
resStr += trail;
this.initialBufs.length = this.initialBufsLen = 0;
return resStr;
}
return this.decoder.end();
}
function detectEncoding(bufs, defaultEncoding) {
var b = [];
var charsProcessed = 0;
var asciiCharsLE = 0, asciiCharsBE = 0; // Number of ASCII chars when decoded as LE or BE.
outer_loop:
for (var i = 0; i < bufs.length; i++) {
var buf = bufs[i];
for (var j = 0; j < buf.length; j++) {
b.push(buf[j]);
if (b.length === 2) {
if (charsProcessed === 0) {
// Check BOM first.
if (b[0] === 0xFF && b[1] === 0xFE) return 'utf-16le';
if (b[0] === 0xFE && b[1] === 0xFF) return 'utf-16be';
}
if (b[0] === 0 && b[1] !== 0) asciiCharsBE++;
if (b[0] !== 0 && b[1] === 0) asciiCharsLE++;
b.length = 0;
charsProcessed++;
if (charsProcessed >= 100) {
break outer_loop;
}
}
}
}
// Make decisions.
// Most of the time, the content has ASCII chars (U+00**), but the opposite (U+**00) is uncommon.
// So, we count ASCII as if it was LE or BE, and decide from that.
if (asciiCharsBE > asciiCharsLE) return 'utf-16be';
if (asciiCharsBE < asciiCharsLE) return 'utf-16le';
// Couldn't decide (likely all zeros or not enough data).
return defaultEncoding || 'utf-16le';
}