Skip to content

Commit

Permalink
Introduce two new options to enable/disable the Private/ICANN domains.
Browse files Browse the repository at this point in the history
* 'allowIcann' set to 'false' will ignore the ICANN section of the list.
* 'allowPrivate' set to 'false' will ignore the PRIVATE section of the list.
  • Loading branch information
remusao committed Mar 19, 2018
1 parent 1f257c4 commit 13ed3e4
Show file tree
Hide file tree
Showing 7 changed files with 257 additions and 137 deletions.
53 changes: 39 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,37 @@ isValidHostname('192.168.0.0') // returns `true`

# Troubleshooting

## Ignoring Private domain section of public suffix list

Because `tld.js` relies on the public suffix list to parse URLs and hostnames, you
might encounter counter-intuitive results from time to time. Most of these
results stem from the fact that the public suffix list contains two sections: ICANN
domains and Private domains. The *ICANN* section defines what you would expect to
see as top-level domains most of the time: `.com`, `.net`, `co.uk`, etc. On the
other hand, the *Private* section contains rules such as: `global.prod.fastly.net`
or `s3.amazonaws.com`. This means that these values can appear as `publicSuffix`.

Fortunately, you can ask `tld.js` to ignore the *Private* section completely:
```js
const tldjs = require('tldjs');

tldjs.getDomain('www.s3.amazonaws.com'); // returns 'www.s3.amazonaws.com'
tldjs.getDomain('https://global.prod.fastly.net'); // returns null

const myTldjs = tldjs.fromUserSettings({
allowPrivate: false,
});

myTldjs.getDomain('www.s3.amazonaws.com'); // returns 'amazonaws.com'
myTldjs.getDomain('https://global.prod.fastly.net'); // returns 'fastly.net'
```

## Retrieving subdomain of `localhost` and custom hostnames

`tld.js` methods `getDomain` and `getSubdomain` are designed to **work only with *known and valid* TLDs**.
This way, you can trust what a domain is.

`localhost` is a valid hostname but not a TLD. Although you can instanciate your own flavour of `tld.js` with *additional valid hosts*:
`localhost` is a valid hostname but not a TLD. Although you can instantiate your own flavour of `tld.js` with *additional valid hosts*:

```js
const tldjs = require('tldjs');
Expand Down Expand Up @@ -225,28 +250,28 @@ use-case. Because the library tried to be smart, the speed can be drastically
different depending on the input (it will be faster if you provide an already
cleaned hostname, compared to a random URL).

On an Intel i7-6600U (2,60-3,40 GHz):
On an Intel i7-6600U (2,60-3,40 GHz) using Node.js `v9.8.0`:

## For already cleaned hostnames

| Methods | ops/sec |
| --- | --- |
| `isValidHostname` | ~`8,700,000` |
| `extractHostname` | ~`8,100,000` |
| `tldExists` | ~`2,000,000` |
| `getPublicSuffix` | ~`1,130,000` |
| `getDomain` | ~`1,000,000` |
| `getSubdomain` | ~`1,000,000` |
| `parse` | ~`850,000` |
| Methods | ops/sec |
| --- | --- |
| `isValidHostname` | ~`25,300,000` |
| `extractHostname` | ~`20,000,000` |
| `tldExists` | ~`3,200,000` |
| `getPublicSuffix` | ~`1,200,000` |
| `getDomain` | ~`1,100,000` |
| `getSubdomain` | ~`1,100,000` |
| `parse` | ~`870,000` |


## For random URLs

| Methods | ops/sec |
| --- | --- |
| `isValidHostname` | ~`25,400,000` |
| `extractHostname` | ~`400,000` |
| `tldExists` | ~`310,000` |
| `isValidHostname` | ~`50,000,000` |
| `extractHostname` | ~`360,000` |
| `tldExists` | ~`300,000` |
| `getPublicSuffix` | ~`240,000` |
| `getDomain` | ~`240,000` |
| `getSubdomain` | ~`240,000` |
Expand Down
3 changes: 2 additions & 1 deletion bin/benchmark.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
'use strict';

var tld = require('../index.js');
var isIp = require('../lib/is-ip.js');
var Benchmark = require('benchmark');


Expand Down Expand Up @@ -67,7 +68,7 @@ function bench(values) {
new Benchmark.Suite()
.add('tldjs#isIp', () => {
for (var i = 0; i < values.length; i += 1) {
tld.isIp(values[i]);
isIp(values[i]);
}
})
.add('tldjs#isValid', () => {
Expand Down
21 changes: 19 additions & 2 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ function factory(options) {
var validHosts = options.validHosts || [];
var _extractHostname = options.extractHostname || extractHostname;

// Customize ICANN/Private domain
var allowIcann = true;
if (options.allowIcann !== undefined) {
allowIcann = options.allowIcann;
}

var allowPrivate = true;
if (options.allowPrivate !== undefined) {
allowPrivate = options.allowPrivate;
}

/**
* Process a given url and extract all information. This is a higher level API
* around private functions of `tld.js`. It allows to remove duplication (only
Expand All @@ -54,6 +65,8 @@ function factory(options) {
isIp: null,
tldExists: false,
publicSuffix: null,
// isIcann: false,
// isPrivate: false,
domain: null,
subdomain: null,
};
Expand Down Expand Up @@ -82,7 +95,11 @@ function factory(options) {
if (step === TLD_EXISTS) { return result; }

// Extract public suffix
result.publicSuffix = getPublicSuffix(rules, result.hostname);
result.publicSuffix = getPublicSuffix(
rules,
result.hostname,
allowIcann,
allowPrivate).publicSuffix;
if (step === PUBLIC_SUFFIX) { return result; }

// Extract domain
Expand All @@ -109,7 +126,7 @@ function factory(options) {
return parse(url, TLD_EXISTS).tldExists;
},
getPublicSuffix: function (url) {
return parse(url, PUBLIC_SUFFIX).publicSuffix;
return parse(url, PUBLIC_SUFFIX, allowIcann, allowPrivate).publicSuffix;
},
getDomain: function (url) {
return parse(url, DOMAIN).domain;
Expand Down
107 changes: 48 additions & 59 deletions lib/parsers/publicsuffix-org.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,72 +6,61 @@ var SuffixTrie = require('../suffix-trie.js');
var PublicSuffixOrgParser = {};


/**
 * Normalise one raw line of the list and decide whether it carries a rule.
 *
 * Surrounding whitespace is stripped first; a line that is then empty or
 * begins with `//` (a comment) is discarded.
 *
 * @param {string} row - raw line
 * @return {string|null} the trimmed line, or `null` when it holds no rule
 */
function keepOnlyRules(row) {
  var candidate = row.trim();
  var isBlank = candidate === '';
  var isComment = candidate.startsWith('//');

  // TODO - Ignore leading or trailing dot

  return (isBlank || isComment) ? null : candidate;
}


/**
 * Build a public suffix rule object from one (already filtered) line.
 *
 * Only the text before the first space is considered. That text is converted
 * with `punycode.toASCII`; a leading `!` marks the rule as an exception and is
 * dropped before the labels are split.
 *
 * @param {string} row - rule line
 * @return {object} `{ exception, source, parts }` where `source` is the
 *   converted text (still carrying any leading `!`) and `parts` holds the
 *   dot-separated labels in reverse order
 */
function domainBuilder(row) {
  // Only read line up to the first white-space
  var firstSpace = row.indexOf(' ');
  var token = firstSpace === -1 ? row : row.slice(0, firstSpace);

  var ascii = punycode.toASCII(token);

  // An exception rule is written with a leading '!'
  var isException = ascii[0] === '!';
  var labels = isException ? ascii.slice(1) : ascii;

  return {
    exception: isException,
    // Keep track of initial rule
    source: ascii,
    parts: labels.split('.').reverse(),
  };
}


/**
* Parse the text of a one-rule-per-line public suffix file into a trie.
*
* NOTE(review): this span appears to show two implementations back to back
* (the earlier `map`/`filter` pipeline, then a loop-based replacement). As
* literally written, the first `return` ends the function and the loop below
* it is never reached -- confirm which variant is intended.
*
* @param {string} body - file content; coerced to a string before splitting
* @return {SuffixTrie} trie constructed from the list of parsed rules
*/
PublicSuffixOrgParser.parse = function (body) {
// Pipeline variant: split into lines, drop blanks/comments, build rule objects.
return new SuffixTrie((body + '')
.split('\n')
.map(keepOnlyRules)
.filter(function (r) { return r !== null; })
.map(domainBuilder));
// Loop variant: a comment starting with this marker switches `isIcann` off.
var beginPrivateDomains = '// ===BEGIN PRIVATE DOMAINS===';
var lines = ('' + body).split('\n');

var rules = [];
// Every rule is tagged ICANN until the marker comment above is encountered.
var isIcann = true;

for (var i = 0; i < lines.length; i += 1) {
var line = lines[i].trim();

// Ignore empty lines
if (line.length === 0) { continue; }

// Comment (check for beginning of Private domains section)
if (line.startsWith('//')) {
if (line.startsWith(beginPrivateDomains)) {
isIcann = false;
}

continue;
}

// TODO - Ignore leading or trailing dot

// Only read line up to the first white-space
var spaceIndex = line.indexOf(' ');
if (spaceIndex !== -1) {
line = line.substr(0, spaceIndex);
}

// Convert to punycode
line = punycode.toASCII(line);

// Check if the rule is an exception
var exception = false;
if (line[0] === '!') {
line = line.substr(1);
exception = true;
}

// `source` keeps the untouched input line (`lines[i]`), while `parts` is
// derived from the processed text with its labels in reverse order.
rules.push({
isIcann: isIcann,
exception: exception,
source: lines[i],
parts: line.split('.').reverse(),
});
}

return new SuffixTrie(rules);
};


Expand Down
16 changes: 12 additions & 4 deletions lib/public-suffix.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,25 @@ var extractTldFromHost = require('./from-host.js');
* @param {string} hostname
* @return {string}
*/
module.exports = function getPublicSuffix(rules, hostname) {
module.exports = function getPublicSuffix(rules, hostname, allowIcann, allowPrivate) {
// First check if `hostname` is already a valid top-level Domain.
if (rules.hasTld(hostname)) {
return hostname;
return {
publicSuffix: hostname,
isIcann: false,
isPrivate: false,
};
}

var candidate = rules.suffixLookup(hostname);
var candidate = rules.suffixLookup(hostname, allowIcann, allowPrivate);
if (candidate === null) {
// Prevailing rule is '*' so we consider the top-level domain to be the
// public suffix of `hostname` (e.g.: 'example.org' => 'org').
return extractTldFromHost(hostname);
return {
isIcann: false,
isPrivate: false,
publicSuffix: extractTldFromHost(hostname),
};
}

return candidate;
Expand Down
Loading

0 comments on commit 13ed3e4

Please sign in to comment.