// utf16.js
"use strict";
var Buffer = require("buffer").Buffer;
// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
// == UTF16-BE codec. ==========================================================
/**
 * Codec descriptor for big-endian UTF-16.
 * The constructor keeps no state; the encoder and decoder constructors are
 * exposed through the prototype.
 */
function Utf16BECodec() {}

Utf16BECodec.prototype.encoder = Utf16BEEncoder;
Utf16BECodec.prototype.decoder = Utf16BEDecoder;

// NOTE(review): this flag is not read anywhere in this file; presumably the
// iconv core uses it to decide on BOM handling - confirm against the core.
Utf16BECodec.prototype.bomAware = true;

exports.utf16be = Utf16BECodec;
// -- Encoding
/**
 * Streaming encoder that turns JavaScript strings into UTF-16BE bytes.
 * Keeps no state between write() calls.
 */
function Utf16BEEncoder() {
}

/**
 * Encodes one chunk of text.
 *
 * @param {string} str - Text to encode.
 * @returns {Buffer} The UTF-16BE bytes for `str` (two bytes per UTF-16 code unit).
 */
Utf16BEEncoder.prototype.write = function(str) {
    // Buffer.from() replaces the deprecated `new Buffer(string, encoding)` constructor.
    // 'ucs2' yields little-endian code units, so every byte pair is swapped in place
    // to obtain big-endian order. The 'ucs2' output length is always even, which is
    // what swap16() requires.
    return Buffer.from(str, 'ucs2').swap16();
};

/**
 * Finishes the stream. The encoder buffers nothing, so there is nothing to flush.
 *
 * @returns {undefined}
 */
Utf16BEEncoder.prototype.end = function() {
};
// -- Decoding
/**
 * Streaming decoder for UTF-16BE bytes.
 *
 * `overflowByte` holds the unpaired trailing byte of the previous chunk
 * (the first byte of a code unit split across chunks), or -1 when none is pending.
 */
function Utf16BEDecoder() {
    this.overflowByte = -1;
}

/**
 * Decodes one chunk of UTF-16BE bytes.
 *
 * @param {Buffer} buf - Next chunk of input; may have any length, including odd.
 * @returns {string} Text for every complete code unit available so far.
 */
Utf16BEDecoder.prototype.write = function(buf) {
    if (buf.length === 0)
        return '';

    // One extra byte of room for a carried-over byte from the previous chunk.
    // Buffer.alloc() replaces the deprecated `new Buffer(size)` constructor.
    var swapped = Buffer.alloc(buf.length + 1),
        i = 0, j = 0;

    if (this.overflowByte !== -1) {
        // Complete the code unit that was split across chunks, written low byte first
        // because the result is decoded as little-endian 'ucs2'.
        swapped[0] = buf[0];
        swapped[1] = this.overflowByte;
        i = 1; j = 2;
    }

    // Swap each remaining complete byte pair into little-endian order.
    for (; i < buf.length - 1; i += 2, j += 2) {
        swapped[j] = buf[i + 1];
        swapped[j + 1] = buf[i];
    }

    // If exactly one byte is left over, remember it for the next write().
    this.overflowByte = (i === buf.length - 1) ? buf[buf.length - 1] : -1;

    // Decode only the j bytes that were actually filled in.
    return swapped.toString('ucs2', 0, j);
};

/**
 * Finishes the stream. Returns nothing; a pending unpaired byte is not emitted.
 *
 * @returns {undefined}
 */
Utf16BEDecoder.prototype.end = function() {
};
// == UTF-16 codec =============================================================
// Decoder chooses automatically between UTF-16LE and UTF-16BE using the BOM and an ASCII-character heuristic.
// Defaults to UTF-16LE, as it's prevalent and default in Node.
// http://en.wikipedia.org/wiki/UTF-16 and http://encoding.spec.whatwg.org/#utf-16le
// Decoder default can be changed: iconv.decode(buf, 'utf16', {defaultEncoding: 'utf-16be'});
// Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).
/**
 * Codec descriptor for generic "UTF-16".
 *
 * @param {Object} codecOptions - Not used by this constructor.
 * @param {Object} iconv - Library instance; kept so the encoder and decoder can
 *     request the concrete UTF-16LE / UTF-16BE codecs from it.
 */
function Utf16Codec(codecOptions, iconv) {
    this.iconv = iconv;
}

Utf16Codec.prototype.encoder = Utf16Encoder;
Utf16Codec.prototype.decoder = Utf16Decoder;

exports.utf16 = Utf16Codec;
// -- Encoding (pass-through)
/**
 * Encoder for generic "UTF-16": delegates to the 'utf-16le' encoder obtained from
 * `codec.iconv`, with `addBOM` defaulting to true.
 *
 * @param {Object} [options] - Encoder options. Forwarded to the delegate encoder;
 *     an explicit `addBOM` value is kept as given.
 * @param {Object} codec - Owning codec; `codec.iconv.getEncoder` is used to create
 *     the delegate.
 */
function Utf16Encoder(options, codec) {
    // Work on a shallow copy so that defaulting addBOM does not write into the
    // caller's object (which may be reused for other encodings). Inherited
    // enumerable properties are copied too, so every option that was readable
    // on the original stays readable on the copy.
    var delegateOptions = {};
    if (options) {
        for (var key in options)
            delegateOptions[key] = options[key];
    }
    if (delegateOptions.addBOM === undefined)
        delegateOptions.addBOM = true;
    this.encoder = codec.iconv.getEncoder('utf-16le', delegateOptions);
}

/**
 * Encodes one chunk of text through the delegate encoder.
 *
 * @param {string} str - Text to encode.
 * @returns {*} Whatever the delegate's write() returns.
 */
Utf16Encoder.prototype.write = function(str) {
    return this.encoder.write(str);
};

/**
 * Finishes the stream through the delegate encoder.
 *
 * @returns {*} Whatever the delegate's end() returns.
 */
Utf16Encoder.prototype.end = function() {
    return this.encoder.end();
};
// -- Decoding
/**
 * Decoder for generic "UTF-16". It buffers the first chunks until the byte order
 * can be detected, then hands everything to the concrete decoder obtained from
 * `codec.iconv`.
 *
 * @param {Object} [options] - Decoder options; `defaultEncoding` is passed to the
 *     detection step and the whole object is forwarded to the concrete decoder.
 * @param {Object} codec - Owning codec; `codec.iconv.getDecoder` creates the delegate.
 */
function Utf16Decoder(options, codec) {
    this.decoder = null;        // Concrete decoder, set once the byte order is known.
    this.initialBytes = [];     // Chunks received before detection.
    this.initialBytesLen = 0;   // Total byte length of those chunks.
    this.options = options ? options : {};
    this.iconv = codec.iconv;
}

/**
 * Decodes one chunk. Returns '' until at least 16 bytes have been collected.
 *
 * @param {Buffer} buf - Next chunk of input.
 * @returns {string} Decoded text, or '' while input is still being buffered.
 */
Utf16Decoder.prototype.write = function(buf) {
    // Byte order already known: plain pass-through.
    if (this.decoder)
        return this.decoder.write(buf);

    // Still undecided: stash the chunk.
    this.initialBytes.push(buf);
    this.initialBytesLen += buf.length;

    // Wait for more input so the content heuristic has enough bytes to look at.
    if (this.initialBytesLen < 16)
        return '';

    // Enough data collected: pick the byte order and replay everything buffered.
    var collected = Buffer.concat(this.initialBytes);
    var encoding = detectEncoding(collected, this.options.defaultEncoding);
    this.decoder = this.iconv.getDecoder(encoding, this.options);

    this.initialBytesLen = 0;
    this.initialBytes.length = 0;

    return this.decoder.write(collected);
};

/**
 * Finishes the stream. If the byte order was never decided, it is detected from
 * the buffered bytes and they are decoded now.
 *
 * @returns {*} Remaining decoded text, or whatever the delegate's end() returns.
 */
Utf16Decoder.prototype.end = function() {
    if (this.decoder)
        return this.decoder.end();

    var collected = Buffer.concat(this.initialBytes);
    var encoding = detectEncoding(collected, this.options.defaultEncoding);
    this.decoder = this.iconv.getDecoder(encoding, this.options);

    var text = this.decoder.write(collected);
    var tail = this.decoder.end();
    if (tail)
        return text + tail;
    return text;
};
/**
 * Chooses between 'utf-16le' and 'utf-16be' for the given bytes.
 *
 * A BOM in the first two bytes decides immediately. Otherwise up to the first
 * 64 bytes are scanned pairwise: a pair with exactly one zero byte counts as a
 * vote for the byte order in which it would be a U+00xx character. The majority
 * wins; on a tie, or with fewer than two bytes, the fallback is returned.
 *
 * @param {Buffer} buf - Initial bytes of the input.
 * @param {string} [defaultEncoding] - Fallback encoding name; 'utf-16le' if falsy.
 * @returns {string} 'utf-16be', 'utf-16le', or the fallback.
 */
function detectEncoding(buf, defaultEncoding) {
    var fallback = defaultEncoding || 'utf-16le';

    if (buf.length < 2)
        return fallback;

    // Byte order mark.
    var first = buf[0], second = buf[1];
    if (first === 0xFE && second === 0xFF)
        return 'utf-16be';
    if (first === 0xFF && second === 0xFE)
        return 'utf-16le';

    // No BOM: vote on content. Only whole byte pairs are examined, 64 bytes at most.
    var scanEnd = Math.min(buf.length - (buf.length % 2), 64);
    var votesBE = 0, votesLE = 0;

    for (var pos = 0; pos < scanEnd; pos += 2) {
        var firstIsZero = buf[pos] === 0;
        var secondIsZero = buf[pos + 1] === 0;

        if (firstIsZero && !secondIsZero)
            votesBE++;          // 00 xx -> U+00xx when read as big-endian
        else if (!firstIsZero && secondIsZero)
            votesLE++;          // xx 00 -> U+00xx when read as little-endian
    }

    if (votesBE > votesLE)
        return 'utf-16be';
    if (votesLE > votesBE)
        return 'utf-16le';
    return fallback;
}