encoding.test.js
2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/*jshint expr:true */
'use strict';
const Crawler = require('../lib/crawler');
const expect = require('chai').expect;
const nock = require('nock');
/**
 * Charset-handling tests for the crawler.
 *
 * Every request is answered by a nock interceptor that replies with the
 * local fixture `iso8859.html`: once with a `charset=ISO-8859-1`
 * Content-Type header and once with a bare `text/html` Content-Type.
 * The assertions expect the decoded fixture to contain the text 'Jörg'.
 */
describe('Encoding', function() {
    before(function() {
        // Drop every interceptor that is already registered when this suite starts.
        nock.cleanAll();
    });

    const origin = 'http://czyborra.com';
    const encodingFileName = 'iso8859.html';
    const charsetName = 'ISO-8859-1';
    const path = `/charsets/${encodingFileName}`;
    const url = `${origin}${path}`;
    const pathWithoutCharsetHeader = `/charsets-noheader/${encodingFileName}`;
    const urlWithoutCharsetHeader = `${origin}${pathWithoutCharsetHeader}`;
    let crawler = null;

    beforeEach(function() {
        // A fresh crawler per test; retries are disabled so a failing request
        // is reported to the callback right away.
        crawler = new Crawler({
            retries: 0,
        });
        // Same fixture, two routes: one announces the charset in the header,
        // the other sends a Content-Type without a charset parameter.
        nock(origin).get(path).replyWithFile(200, `${__dirname}/${encodingFileName}`, { 'Content-Type': `text/html;charset=${charsetName}` });
        nock(origin).get(pathWithoutCharsetHeader).replyWithFile(200, `${__dirname}/${encodingFileName}`, { 'Content-Type': 'text/html' });
    });

    afterEach(function() {
        // Each test requests only one of the two routes registered in
        // beforeEach. Remove the unused interceptor so leftovers do not
        // accumulate from test to test or leak into later suites.
        nock.cleanAll();
    });

    it('should parse latin-1', function(done) {
        crawler.queue([{
            uri: url,
            callback: function(error, result) {
                expect(error).to.be.null;
                expect(result.charset).to.eql(charsetName);
                // `include` also accepts a match at index 0, unlike indexOf(...) > 0.
                expect(result.body).to.include('Jörg');
                done();
            }
        }]);
    });

    it('should return buffer if encoding = null', function(done) {
        crawler.queue([{
            uri: url,
            encoding: null,
            callback: function(error, result) {
                expect(error).to.be.null;
                expect(result.body instanceof Buffer).to.be.true;
                done();
            }
        }]);
    });

    it('should parse latin-1 if incomingEncoding = ISO-8859-1', function(done) {
        crawler.queue([{
            uri: url,
            incomingEncoding: charsetName,
            callback: function(error, result) {
                expect(error).to.be.null;
                expect(result.charset).to.eql(charsetName);
                expect(result.body).to.include('Jörg');
                done();
            }
        }]);
    });

    it('could not parse latin-1 if incomingEncoding = gb2312', function(done) {
        crawler.queue([{
            uri: url,
            // Deliberately wrong encoding: the request must still succeed,
            // but the decoded body must not contain the latin-1 text.
            incomingEncoding: 'gb2312',
            callback: function(error, result) {
                expect(error).to.be.null;
                expect(result.body).to.not.include('Jörg');
                done();
            }
        }]);
    });

    it('should parse charset from header ', function(done) {
        crawler.queue([{
            uri: url,
            callback: function(error, result) {
                expect(error).to.be.null;
                expect(result.charset).to.equal(charsetName);
                expect(result.body).to.include('Jörg');
                done();
            }
        }]);
    });

    it('should parse charset from meta tag in html if header does not contain content-type key ', function(done) {
        crawler.queue([{
            // This route replies with `Content-Type: text/html`, i.e. the
            // header is present but carries no charset parameter.
            uri: urlWithoutCharsetHeader,
            callback: function(error, result) {
                expect(error).to.be.null;
                expect(result.charset).to.equal(charsetName);
                expect(result.body).to.include('Jörg');
                done();
            }
        }]);
    });
});