ElasticSearch is used to serve faceted queries, i.e. to filter proxies by parameters such as country, anonymity level, supported websites, etc. It’s possible to build the same functionality in plain Redis, but the result is rather awkward (google ‘redis faceted search’).

Other posts in the Gimmeproxy tech series:

Let’s see how to use ElasticSearch with Node.js. First, we have to connect to ElasticSearch.

1
2
3
4
5
6
7
8
9
'use strict'
// Client library from the `elasticsearch` npm package.
const elasticsearch = require('elasticsearch')
// NOTE(review): lodash is imported but not referenced in any of the snippets shown here.
const _ = require('lodash')
// Single shared client used by every helper below; it targets localhost:9200
// and is configured with the 'info' log setting.
const elasticClient = new elasticsearch.Client({
host: 'localhost:9200',
log: 'info'
})
// Name of the index that every helper below operates on.
const indexName = 'gimmeproxy'

Next, we create (or delete) the index and initialize its mapping:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/**
 * Creates the index named by `indexName`.
 *
 * @returns {*} Whatever `elasticClient.indices.create` returns
 *   (presumably a promise; confirm against the client version in use).
 */
function initIndex() {
  const params = { index: indexName }
  return elasticClient.indices.create(params)
}
exports.initIndex = initIndex
/**
 * Deletes the index named by `indexName`.
 *
 * @returns {*} Whatever `elasticClient.indices.delete` returns
 *   (presumably a promise; confirm against the client version in use).
 */
function deleteIndex() {
  const params = { index: indexName }
  return elasticClient.indices.delete(params)
}
exports.deleteIndex = deleteIndex
/**
 * Sends the field mapping for the `proxy` type of the `indexName` index.
 *
 * The mapping disables `_all`, declares the capability flags as booleans,
 * `websites` / `otherProtocols` as nested fields, `tsChecked` as an
 * epoch-second date, and marks `ipPort`, `curl` and `type` with
 * `index: 'no'`.
 *
 * @returns {*} Whatever `elasticClient.indices.putMapping` returns.
 */
function initMapping() {
  // Small factories so every field gets its own fresh definition object.
  const ofType = (type) => ({ type: type })
  const unindexedString = () => ({ type: 'string', index: 'no' })

  const properties = {
    get: ofType('boolean'),
    post: ofType('boolean'),
    cookies: ofType('boolean'),
    referer: ofType('boolean'),
    'user-agent': ofType('boolean'),
    anonymityLevel: ofType('byte'),
    supportsHttps: ofType('boolean'),
    protocol: ofType('string'),
    ip: ofType('string'),
    port: ofType('integer'),
    ipPort: unindexedString(),
    tsChecked: { type: 'date', format: 'epoch_second' },
    websites: ofType('nested'),
    otherProtocols: ofType('nested'),
    country: ofType('string'),
    curl: unindexedString(),
    type: unindexedString()
  }

  const mapping = {
    _all: { enabled: false },
    properties: properties
  }

  return elasticClient.indices.putMapping({
    index: indexName,
    type: 'proxy',
    body: mapping
  })
}
exports.initMapping = initMapping

Now we can add or delete a single document:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/**
 * Indexes one document of type `proxy` under the given id.
 *
 * @param {*} id - Document id passed through to the client.
 * @param {*} document - Document body passed through to the client.
 * @returns {*} Whatever `elasticClient.index` returns.
 */
function addDocument(id, document) {
  const request = {
    index: indexName,
    id: id,
    type: 'proxy',
    body: document
  }
  return elasticClient.index(request)
}
exports.addDocument = addDocument
/**
 * Deletes the `proxy` document with the given id.
 *
 * @param {*} id - Id of the document to remove.
 * @returns {*} Whatever `elasticClient.delete` returns.
 */
function deleteDocument(id) {
  const request = {
    index: indexName,
    type: 'proxy',
    id: id
  }
  return elasticClient.delete(request)
}
exports.deleteDocument = deleteDocument

Or we can add or remove several documents at once, using the bulk API as described in the ElasticSearch docs:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/**
 * Indexes several `proxy` documents with a single bulk request.
 *
 * The bulk body alternates an `index` action line with the document it
 * applies to, in the order the documents were given.
 *
 * @param {Array<{id: *, data: *}>} documents - Entries to index; `id`
 *   becomes the document `_id` and `data` the document source.
 * @returns {*} Whatever `elasticClient.bulk` returns.
 */
function bulkAdd(documents) {
  const operations = []
  documents.forEach(entry => {
    const action = { index: { _index: indexName, _type: 'proxy', _id: entry.id } }
    operations.push(action, entry.data)
  })
  return elasticClient.bulk({ body: operations })
}
exports.bulkAdd = bulkAdd
/**
 * Deletes several `proxy` documents with a single bulk request.
 *
 * @param {Array<*>} documents - Ids of the documents to delete (each
 *   element is used directly as the `_id`).
 * @returns {*} Whatever `elasticClient.bulk` returns.
 */
function bulkDelete(documents) {
  const operations = []
  const toDeleteAction = (id) => ({
    delete: { _index: indexName, _type: 'proxy', _id: id }
  })
  documents.forEach(id => {
    operations.push(toDeleteAction(id))
  })
  return elasticClient.bulk({ body: operations })
}
exports.bulkDelete = bulkDelete

Finally, we can return a random item from the ElasticSearch records that match the requested filters:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// Website names that may be used in a `websites.<name>` filter; anything
// else in `options.websites` is ignored.
const allowedWebsites = ['google', 'amazon', 'example']

// Options that translate 1:1 into a `term` filter on the same-named field,
// listed in the order the filters are emitted.
const termFilterFields = [
  'get',
  'post',
  'cookies',
  'user-agent',
  'referer',
  'anonymityLevel',
  'supportsHttps',
  'protocol',
  'port',
  'ip',
  'country'
]

/**
 * Builds the search request body that selects one random proxy matching
 * the given filters.
 *
 * @param {Object} [options] - Filter options; a missing value means "no filters".
 *   Each of `get`, `post`, `cookies`, `user-agent`, `referer`,
 *   `anonymityLevel`, `supportsHttps`, `protocol`, `port`, `ip` and `country`
 *   adds a `term` filter when set to a truthy value (`anonymityLevel` also
 *   when it is exactly `0`).
 * @param {number} [options.maxCheckPeriod] - Only match documents whose
 *   `tsChecked` is at most this many seconds in the past.
 * @param {string|string[]} [options.websites] - Website name(s); names are
 *   matched case-insensitively against `allowedWebsites`.
 * @returns {Object} Body with `query`, `sort` (random script sort) and `size: 1`.
 */
function createBody(options) {
  const opts = options || {}
  const must = []

  termFilterFields.forEach(field => {
    const value = opts[field]
    // `anonymityLevel` is mapped as a numeric field, so 0 is treated as a
    // real value rather than "not set".
    // NOTE(review): confirm that level 0 is something callers filter on.
    const isSet = Boolean(value) || (field === 'anonymityLevel' && value === 0)
    if (isSet) {
      must.push({ term: { [field]: value } })
    }
  })

  if (opts.maxCheckPeriod) {
    // `tsChecked` is compared in whole seconds since the epoch.
    const oldestAllowed = Math.floor(Date.now() / 1000) - opts.maxCheckPeriod
    must.push({ range: { tsChecked: { gte: oldestAllowed } } })
  }

  if (opts.websites) {
    // Accept both a single name and a list of names.
    const requested = [].concat(opts.websites)
    // Normalise the case BEFORE building the field path, so that e.g.
    // 'Google' queries `websites.google` instead of a non-existent
    // `websites.Google` field.
    const should = requested
      .map(website => String(website).toLowerCase())
      .filter(website => allowedWebsites.indexOf(website) !== -1)
      .map(website => ({ term: { ['websites.' + website]: true } }))
    must.push({
      nested: {
        path: 'websites',
        query: { bool: { should: should } }
      }
    })
  }

  return {
    query: { bool: { filter: { bool: { must: must } } } },
    // sort by random to return random proxy
    sort: {
      _script: {
        script: 'Math.random() * 200000',
        type: 'number',
        order: 'asc'
      }
    },
    size: 1
  }
}
/**
 * Searches the `proxy` documents of the index with the body produced by
 * `createBody(options)`.
 *
 * @param {Object} options - Filter options, forwarded to `createBody`.
 * @returns {*} Whatever `elasticClient.search` returns.
 */
function findProxy(options) {
  const searchBody = createBody(options)
  return elasticClient.search({
    index: indexName,
    type: 'proxy',
    body: searchBody
  })
}
exports.findProxy = findProxy

Other posts in the Gimmeproxy tech series: