You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

310 lines
8.1 KiB

5 years ago
/* eslint no-bitwise: "off", max-statements: "off", max-lines: "off" */
// Taken from: https://github.com/walling/unorm/blob/master/lib/unorm.js
/*
* UnicodeNormalizer 1.0.0
* Copyright (c) 2008 Matsuza
* Dual licensed under the MIT (MIT-LICENSE.txt) and
* GPL (GPL-LICENSE.txt) licenses.
* $Date: 2008-06-05 16:44:17 +0200 (Thu, 05 Jun 2008) $
* $Rev: 13309 $
*/
"use strict";
var primitiveSet = require("../../../object/primitive-set")
, validValue = require("../../../object/valid-value")
, data = require("./_data");
// Cached reference to Math.floor (used by the Hangul arithmetic and the surrogate encoding).
var floor = Math.floor;
// Lookup set holding the four accepted normalization form names.
var forms = primitiveSet("NFC", "NFD", "NFKC", "NFKD");
// Feature record handed to code points that have no record of their own.
// Slot 0: decomposition, slot 1: flag/class bits, slot 2: composition map.
var DEFAULT_FEATURE = [null, 0, {}];
// A cache bucket must count more than this many feature lookups before results are stored.
var CACHE_THRESHOLD = 10;
// Hangul constants; names and values follow the Unicode Standard's Hangul syllable algorithm.
var SBase = 0xac00;
var LBase = 0x1100;
var VBase = 0x1161;
var TBase = 0x11a7;
var LCount = 19;
var VCount = 21;
var TCount = 28;
var NCount = VCount * TCount;
var SCount = LCount * NCount;
// Declared here, assigned further down in the file.
var UChar;
// Resolved UChar objects keyed by code point, and one lookup counter per cache bucket.
var cache = {};
var cacheCounter = [];
// Lookup strategies and iterator constructors; each is assigned further down in the file.
var fromCache;
var fromData;
var fromCpOnly;
var fromRuleBasedJamo;
var fromCpFilter;
var strategies;
var UCharIterator;
var RecursDecompIterator;
var DecompIterator;
var CompIterator;
var createIterator;
var normalize;
/**
 * A code point paired with its normalization feature record.
 *
 * @param {number} cp - Code point, stored as `codepoint`.
 * @param {Array|null} feature - Feature record, stored as `feature`. The prototype accessors
 *   read slot 0 as the decomposition, slot 1 as flag/class bits and slot 2 as the composition
 *   map. A falsy value means "not loaded yet"; prepFeature fetches it on first use.
 */
UChar = function (cp, feature) {
this.codepoint = cp;
this.feature = feature;
};
// Strategies
// Start every cache bucket counter (indexes 0 through 0xff) at zero before any lookup runs.
(function () {
    var bucket = 0;
    while (bucket <= 0xff) {
        cacheCounter[bucket] = 0;
        bucket += 1;
    }
})();
/**
 * Strategy that serves already cached UChar objects and caches busy buckets.
 *
 * A result is stored only when it carries a feature record and the counter of its bucket
 * (bits 8-15 of the code point) has risen above CACHE_THRESHOLD. The counter is bumped only
 * for results that carry a feature record.
 *
 * @param {Function} nextStep - Following strategy, called as nextStep(cp, needFeature).
 * @param {number} cp - Code point to resolve.
 * @param {boolean} [needFeature] - Forwarded untouched to `nextStep`.
 * @returns {Object} The cached entry, or whatever `nextStep` produced.
 */
fromCache = function (nextStep, cp, needFeature) {
    var cached = cache[cp], resolved, bucket;
    if (cached) return cached;
    resolved = nextStep(cp, needFeature);
    if (!resolved.feature) return resolved;
    bucket = (cp >> 8) & 0xff;
    cacheCounter[bucket] += 1;
    if (cacheCounter[bucket] > CACHE_THRESHOLD) cache[cp] = resolved;
    return resolved;
};
/**
 * Final strategy: reads the feature record out of UChar.udata.
 *
 * The table is two-level. The outer key is `cp & 0xff00` (bits 8-15 only) and the inner key
 * is the full code point. A missing page or entry yields DEFAULT_FEATURE.
 *
 * @param {*} next - Unused; this strategy never delegates.
 * @param {number} cp - Code point to resolve.
 * @returns {UChar} Always a fresh UChar carrying a feature record.
 */
fromData = function (next, cp) {
    var page = UChar.udata[cp & 0xff00] || {};
    var feature = page[cp] || DEFAULT_FEATURE;
    return new UChar(cp, feature);
};
/**
 * Strategy that skips feature resolution when the caller did not ask for it.
 *
 * @param {Function} next - Following strategy, called as next(cp, needFeature).
 * @param {number} cp - Code point to resolve.
 * @param {boolean} [needFeature] - When truthy the lookup continues down the chain.
 * @returns {UChar} A UChar with a null feature when `needFeature` is falsy, otherwise the
 *   result of `next`.
 */
fromCpOnly = function (next, cp, needFeature) {
    if (needFeature) return next(cp, needFeature);
    return new UChar(cp, null);
};
/**
 * Strategy that builds Hangul feature records by arithmetic instead of table lookup.
 *
 * - Leading consonants (LBase .. LBase + LCount - 1) get a composition map from every vowel
 *   to the matching LV syllable.
 * - Precomposed syllables (SBase .. SBase + SCount - 1) get a two-element decomposition. LV
 *   syllables additionally get a composition map from every trailing consonant to the
 *   matching LVT syllable.
 * - Every other code point is handed to `next`.
 *
 * @param {Function} next - Following strategy, called as next(cp, needFeature).
 * @param {number} cp - Code point to resolve.
 * @param {boolean} [needFeature] - Forwarded untouched to `next`.
 * @returns {UChar} A UChar with a synthesized feature record, or the result of `next`.
 */
fromRuleBasedJamo = function (next, cp, needFeature) {
    var vowelMap, lvOffset, v, leadFeature, SIndex, TIndex, feature, t;
    // The syllable block ends at SBase + SCount - 1, so the upper bound is exclusive. A
    // strict `<` here would treat the first code point after the block (U+D7A4) as a
    // syllable and hand it a bogus decomposition.
    if (cp < LBase || (LBase + LCount <= cp && cp < SBase) || SBase + SCount <= cp) {
        return next(cp, needFeature);
    }
    if (cp < LBase + LCount) {
        // Leading consonant: L + V composes to the LV syllable.
        vowelMap = {};
        lvOffset = (cp - LBase) * VCount;
        for (v = 0; v < VCount; ++v) {
            vowelMap[VBase + v] = SBase + TCount * (v + lvOffset);
        }
        // Only slot 2 (composition map) is filled; slots 0 and 1 stay empty.
        leadFeature = new Array(3);
        leadFeature[2] = vowelMap;
        return new UChar(cp, leadFeature);
    }
    SIndex = cp - SBase;
    TIndex = SIndex % TCount;
    feature = [];
    if (TIndex === 0) {
        // LV syllable: decomposes to L + V and composes with any trailing consonant.
        feature[0] = [LBase + floor(SIndex / NCount), VBase + floor((SIndex % NCount) / TCount)];
        feature[2] = {};
        for (t = 1; t < TCount; ++t) {
            feature[2][TBase + t] = cp + t;
        }
    } else {
        // LVT syllable: decomposes to its LV syllable plus the trailing consonant.
        feature[0] = [SBase + SIndex - TIndex, TBase + TIndex];
    }
    return new UChar(cp, feature);
};
/**
 * First strategy in the chain: short-circuits two code point ranges with DEFAULT_FEATURE.
 *
 * Code points below 60, and those strictly between 13311 and 42607, are answered immediately
 * without consulting the cache, the Hangul rules or the data table.
 *
 * @param {Function} next - Following strategy, called as next(cp, needFeature).
 * @param {number} cp - Code point to resolve.
 * @param {boolean} [needFeature] - Forwarded untouched to `next`.
 * @returns {UChar} A fresh UChar with DEFAULT_FEATURE, or the result of `next`.
 */
fromCpFilter = function (next, cp, needFeature) {
    var isFiltered = cp < 60 || (cp > 13311 && cp < 42607);
    if (isFiltered) return new UChar(cp, DEFAULT_FEATURE);
    return next(cp, needFeature);
};
// Lookup order: the first entry runs first and the last entry ends the chain.
strategies = [fromCpFilter, fromCache, fromCpOnly, fromRuleBasedJamo, fromData];
/**
 * Resolves a code point to a UChar by running it through the strategy chain.
 *
 * Each strategy is wrapped so that it receives the wrapper of the strategy after it as its
 * `next` argument. The last strategy receives null.
 *
 * @param {number} cp - Code point to resolve.
 * @param {boolean} [needFeature] - Passed along the chain unchanged.
 * @returns {UChar} Whatever the chain produces.
 */
UChar.fromCharCode = (function () {
    var link = function (strategy, next) {
        return function (cp, needFeature) { return strategy(next, cp, needFeature); };
    };
    var chain = null, index;
    for (index = strategies.length - 1; index >= 0; --index) {
        chain = link(strategies[index], chain);
    }
    return chain;
})();
/**
 * @param {number} cp - UTF-16 code unit.
 * @returns {boolean} True when `cp` lies in 0xD800..0xDBFF (inclusive).
 */
UChar.isHighSurrogate = function (cp) {
    return 0xd800 <= cp && cp <= 0xdbff;
};
/**
 * @param {number} cp - UTF-16 code unit.
 * @returns {boolean} True when `cp` lies in 0xDC00..0xDFFF (inclusive).
 */
UChar.isLowSurrogate = function (cp) {
    return 0xdc00 <= cp && cp <= 0xdfff;
};
/**
 * Loads the feature record on first use.
 *
 * Does nothing when `this.feature` is already truthy. Otherwise it resolves the code point
 * again with needFeature = true and copies the resulting feature record onto this object.
 */
UChar.prototype.prepFeature = function () {
    if (this.feature) return;
    this.feature = UChar.fromCharCode(this.codepoint, true).feature;
};
/**
 * Encodes the code point as a JavaScript (UTF-16) string.
 *
 * @returns {string} One code unit for code points below 0x10000, otherwise a surrogate pair.
 */
UChar.prototype.toString = function () {
    var cp = this.codepoint, offset, high, low;
    if (cp < 0x10000) return String.fromCharCode(cp);
    offset = cp - 0x10000;
    high = floor(offset / 0x400) + 0xd800;
    low = (offset % 0x400) + 0xdc00;
    return String.fromCharCode(high, low);
};
/**
 * @returns {Array|null} Slot 0 of the feature record (the decomposition), or null when that
 *   slot is empty or otherwise falsy. Loads the feature record first if needed.
 */
UChar.prototype.getDecomp = function () {
    var decomposition;
    this.prepFeature();
    decomposition = this.feature[0];
    if (decomposition) return decomposition;
    return null;
};
/**
 * Tests bit 8 of the flag/class bits (slot 1 of the feature record).
 *
 * @returns {boolean|number} `false` when slot 1 is falsy, otherwise the masked bit value
 *   (0 or 256). Callers are expected to use the result for its truthiness only.
 */
UChar.prototype.isCompatibility = function () {
    var flags;
    this.prepFeature();
    flags = this.feature[1];
    if (!flags) return false;
    return flags & 0x100;
};
/**
 * Tests bit 9 of the flag/class bits (slot 1 of the feature record).
 *
 * @returns {boolean|number} `false` when slot 1 is falsy, otherwise the masked bit value
 *   (0 or 512). Callers are expected to use the result for its truthiness only.
 */
UChar.prototype.isExclude = function () {
    var flags;
    this.prepFeature();
    flags = this.feature[1];
    if (!flags) return false;
    return flags & 0x200;
};
/**
 * @returns {number} The low eight bits of slot 1 of the feature record, or 0 when that slot
 *   is falsy. Loads the feature record first if needed.
 */
UChar.prototype.getCanonicalClass = function () {
    var flags;
    this.prepFeature();
    flags = this.feature[1];
    if (!flags) return 0;
    return flags & 0xff;
};
/**
 * Looks up the character that this one forms together with `following`.
 *
 * @param {UChar} following - Character whose `codepoint` is used as the key into the
 *   composition map (slot 2 of the feature record).
 * @returns {UChar|null} The composed character resolved through UChar.fromCharCode, or null
 *   when there is no composition map or no truthy entry for `following`.
 */
UChar.prototype.getComposite = function (following) {
    var compositions, composed;
    this.prepFeature();
    compositions = this.feature[2];
    if (!compositions) return null;
    composed = compositions[following.codepoint];
    if (!composed) return null;
    return UChar.fromCharCode(composed);
};
/**
 * Walks a string code point by code point.
 *
 * @param {string} str - Text to iterate over.
 */
UCharIterator = function (str) {
    this.str = str;
    this.cursor = 0;
};
/**
 * Returns the next code point of the string as a UChar.
 *
 * A high surrogate directly followed by a low surrogate is merged into one supplementary
 * code point. Any other surrogate is passed through as is. Once the string is used up (or
 * when it is empty), `str` is reset to null and null is returned.
 *
 * @returns {UChar|null} The next character, or null at the end.
 */
UCharIterator.prototype.next = function () {
    var text = this.str, cp, trail;
    if (!text || !(this.cursor < text.length)) {
        this.str = null;
        return null;
    }
    cp = text.charCodeAt(this.cursor);
    this.cursor += 1;
    if (UChar.isHighSurrogate(cp) && this.cursor < text.length) {
        trail = text.charCodeAt(this.cursor);
        if (UChar.isLowSurrogate(trail)) {
            cp = (cp - 0xd800) * 0x400 + (trail - 0xdc00) + 0x10000;
            this.cursor += 1;
        }
    }
    return UChar.fromCharCode(cp);
};
/**
 * Iterator stage that replaces every character with its full (recursive) decomposition.
 *
 * @param {Object} it - Upstream iterator exposing next().
 * @param {boolean} cano - When truthy, characters flagged as compatibility are not decomposed.
 */
RecursDecompIterator = function (it, cano) {
    this.it = it;
    this.canonical = cano;
    this.resBuf = [];
};
/**
 * Returns the next fully decomposed character.
 *
 * When the buffer is empty one upstream character is pulled and expanded depth-first into
 * the buffer. Characters are then handed out from the front of the buffer.
 *
 * @returns {UChar|null|undefined} Next character; null when upstream is exhausted.
 */
RecursDecompIterator.prototype.next = (function () {
    // Appends the expansion of `uchar` to `output`. Pushing onto a shared array avoids
    // concatenation, which returns a new array rather than extending the existing one.
    var expandInto = function (output, canonicalOnly, uchar) {
        var parts = uchar.getDecomp(), index;
        if (!parts || (canonicalOnly && uchar.isCompatibility())) {
            output.push(uchar);
            return output;
        }
        for (index = 0; index < parts.length; ++index) {
            expandInto(output, canonicalOnly, UChar.fromCharCode(parts[index]));
        }
        return output;
    };
    return function () {
        var pulled;
        if (this.resBuf.length === 0) {
            pulled = this.it.next();
            if (!pulled) return null;
            this.resBuf = expandInto([], this.canonical, pulled);
        }
        return this.resBuf.shift();
    };
})();
/**
 * Iterator stage that reorders characters by canonical class.
 *
 * @param {Object} it - Upstream iterator exposing next().
 */
DecompIterator = function (it) {
    this.it = it;
    this.resBuf = [];
};
/**
 * Returns the next character in reordered sequence.
 *
 * When the buffer is empty, upstream characters are read until one with canonical class 0
 * has been buffered or upstream runs dry. A character with a non-zero class is inserted
 * before any buffered tail of characters whose class is greater, which keeps characters of
 * equal class in their original order.
 *
 * @returns {UChar|undefined} Next character; undefined when nothing is left.
 */
DecompIterator.prototype.next = function () {
    var pending = this.resBuf, incoming, incomingClass, slot;
    if (pending.length === 0) {
        for (;;) {
            incoming = this.it.next();
            if (!incoming) break;
            incomingClass = incoming.getCanonicalClass();
            if (incomingClass === 0) {
                // A class 0 character always goes last and ends the run.
                pending.push(incoming);
                break;
            }
            slot = pending.length;
            while (slot > 0 && !(pending[slot - 1].getCanonicalClass() <= incomingClass)) {
                slot -= 1;
            }
            pending.splice(slot, 0, incoming);
        }
    }
    return pending.shift();
};
/**
 * Iterator stage that recombines decomposed characters.
 *
 * @param {Object} it - Upstream iterator exposing next().
 */
CompIterator = function (it) {
    this.it = it;
    this.procBuf = [];
    this.resBuf = [];
    this.lastClass = null;
};
/**
 * Returns the next composed character.
 *
 * Characters collect in `procBuf`, whose first entry is the one later characters may merge
 * into. An incoming character replaces that first entry with the composite when a composite
 * exists and either the last buffered class is 0 or it is lower than the incoming class.
 * Otherwise the character is buffered. A buffered character of class 0 first releases
 * everything collected so far into `resBuf`.
 *
 * @returns {UChar|undefined} Next character; undefined when nothing is left.
 */
CompIterator.prototype.next = function () {
    var incoming, merged, incomingClass;
    while (this.resBuf.length === 0) {
        incoming = this.it.next();
        if (!incoming) {
            // Upstream is exhausted: release whatever is still being collected.
            this.resBuf = this.procBuf;
            this.procBuf = [];
            break;
        }
        if (this.procBuf.length === 0) {
            this.lastClass = incoming.getCanonicalClass();
            this.procBuf.push(incoming);
            continue;
        }
        merged = this.procBuf[0].getComposite(incoming);
        incomingClass = incoming.getCanonicalClass();
        if (Boolean(merged) && (this.lastClass < incomingClass || this.lastClass === 0)) {
            this.procBuf[0] = merged;
            continue;
        }
        if (incomingClass === 0) {
            this.resBuf = this.procBuf;
            this.procBuf = [];
        }
        this.lastClass = incomingClass;
        this.procBuf.push(incoming);
    }
    return this.resBuf.shift();
};
/**
 * Builds the iterator pipeline for a normalization form.
 *
 * Every form decomposes recursively and reorders. "NFD" and "NFC" decompose canonically
 * only, while "NFKD" and "NFKC" also apply compatibility decompositions. "NFC" and "NFKC"
 * add the composing stage on top.
 *
 * @param {string} mode - One of "NFD", "NFKD", "NFC", "NFKC".
 * @param {string} str - Text to normalize.
 * @returns {Object} Outermost iterator of the pipeline.
 * @throws {Error} When `mode` is none of the four names.
 */
createIterator = function (mode, str) {
    var canonicalOnly, decomposed;
    if (mode === "NFD" || mode === "NFC") canonicalOnly = true;
    else if (mode === "NFKD" || mode === "NFKC") canonicalOnly = false;
    else throw new Error(mode + " is invalid");
    decomposed = new DecompIterator(
        new RecursDecompIterator(new UCharIterator(str), canonicalOnly)
    );
    if (mode === "NFC" || mode === "NFKC") return new CompIterator(decomposed);
    return decomposed;
};
/**
 * Runs the iterator pipeline for `mode` over `str` and joins the output.
 *
 * @param {string} mode - Normalization form name understood by createIterator.
 * @param {string} str - Text to normalize.
 * @returns {string} Concatenation of toString() of every character the pipeline yields.
 */
normalize = function (mode, str) {
    var it = createIterator(mode, str), output = "", uchar;
    for (uchar = it.next(); uchar; uchar = it.next()) {
        output += uchar.toString();
    }
    return output;
};
/* Unicode data */
// Expose the table loaded from "./_data" as UChar.udata, which is where fromData reads
// feature records from.
UChar.udata = data;
module.exports = function (/* Form*/) {
var str = String(validValue(this)), form = arguments[0];
if (form === undefined) form = "NFC";
else form = String(form);
if (!forms[form]) throw new RangeError("Invalid normalization form: " + form);
return normalize(form, str);
};