/** * Unistring: breaks a native string into an array of grapheme cluster, * and provides native string like manipulation methods. * ============================================================================= * * * @author [email protected] * @license MIT */ /* * data table, taken from: * http://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakProperty.txt */ // GENERATED CODE START >> /* * While a property defined for splitting, specially assign a value for Extended_Pictographic */ GBP['Extended_Pictographic'] = 16; /* p */ /* * data table, taken from: * http://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakProperty.txt */ // GENERATED CODE START >> /* * While a property defined for splitting, specially assign a value for Extended_Pictographic */ WBP['Extended_Pictographic'] = 23; /* w */ /* * data table, taken from: * http://www.unicode.org/Public/14.0.0/ucd/auxiliary/SentenceBreakProperty.txt */ // GENERATED CODE START >> /* * data table, taken from: * http://www.unicode.org/Public/14.0.0/ucd/Scripts.txt */ // GENERATED CODE START >> /* * data table, taken from: * http://www.unicode.org/Public/14.0.0/ucd/LineBreak.txt */ // GENERATED CODE START >> /* * data table, taken from: * http://www.unicode.org/Public/14.0.0/ucd/EastAsianWidth.txt */ // GENERATED CODE START >> /* * classes */ function TimelimitCache () { this.lastCleared = Date.now(); this.cache = new Map; } TimelimitCache.prototype = { TTL_MSECS: 1000 * 60, has: function (key) { return this.cache.has(key); }, get: function (key) { const result = this.cache.get(key); if (Date.now() - this.lastCleared >= this.TTL_MSECS) { this.cache.clear; this.lastCleared = Date.now(); } return result; }, set: function (key, value) { this.cache.set(key, value); } }; /* * variables */ function stub (label, creator) { return new Proxy(() => {}, { apply: (obj, thisArg, args) => { return creator().apply(thisArg, args); } }); } let graphemeFinder = stub('grapheme', () => { return graphemeFinder = createFinderWithEmoji( Uint8Array.from( GRAPHEME_BREAK_PROPS.match(/[0-9A-F]{2}/g), a => parseInt(a, 16)), GRAPHEME_BREAK_PROP_UNIT_LENGTH, GBP.Other, GBP_NAMES.length ); }); let wordFinder = stub('word', () => { return wordFinder = createFinderWithEmoji( Uint8Array.from( WORD_BREAK_PROPS.match(/[0-9A-F]{2}/g), a => parseInt(a, 16)), WORD_BREAK_PROP_UNIT_LENGTH, WBP.Other, WBP_NAMES.length ); }); let sentenceFinder = stub('sentence', () => { return sentenceFinder = createFinder( Uint8Array.from( SENTENCE_BREAK_PROPS.match(/[0-9A-F]{2}/g), a => parseInt(a, 16)), SENTENCE_BREAK_PROP_UNIT_LENGTH, SBP.Other ); }); let scriptFinder = stub('script', () => { return scriptFinder = createFinder( Uint8Array.from( SCRIPTS.match(/[0-9A-F]{2}/g), a => parseInt(a, 16)), SCRIPTS_PROP_UNIT_LENGTH, SCRIPT.Unknown ); }); let lineBreakFinder = stub('script', () => { return lineBreakFinder = createFinder( Uint8Array.from( LINE_BREAK_PROPS.match(/[0-9A-F]{2}/g), a => parseInt(a, 16)), LINE_BREAK_PROP_UNIT_LENGTH, LBP.XX ); }); let eastAsianWidthFinder = stub('script', () => { return eastAsianWidthFinder = createFinder( Uint8Array.from( EAST_ASIAN_WIDTH_PROPS.match(/[0-9A-F]{2}/g), a => parseInt(a, 16)), EAST_ASIAN_WIDTH_PROP_UNIT_LENGTH, EAW.N); }); let linkCount = 0; let graphemeClusterCache = new TimelimitCache; let wordClusterCache = new Map([[false, new TimelimitCache], [true, new TimelimitCache]]); let lineBreakableClusterCache = new TimelimitCache; const eawMap = [ /* Neutral */ 1, /* Narrow */ 1, /* Ambiguous */ 2, /* Wide */ 2, /* Half Width */ 1, /* Full Width */ 2, ]; /* * utility functions */ function pick2 (data, index) { return data[index] | data[index + 1] ch.codePointAt(0)); } function find (cp, table, units, otherValue) { let left = 0, right = ((table.length / units) >> 0) - 1; let middle, index, middlecp, length; while (left > 0; index = middle * units; middlecp = pick4(table, index + 1); length = (middlecp >> 21) & 0x7ff; middlecp = middlecp & 0x1fffff; if (middlecp + length - 1 { if (cp in cache) { return cache[cp]; } else { return cache[cp] = find(cp, table, units, otherValue); } }; } function createFinderWithEmoji (table, units, otherValue, emojiValue) { const cache = {}; return cp => { if (cp in cache) { return cache[cp]; } else { if (/^\p{Extended_Pictographic}$/u.test(String.fromCodePoint(cp))) { return cache[cp] = emojiValue; } else { return cache[cp] = find(cp, table, units, otherValue); } } }; } function startsWith (subject, candidates) { return candidates.includes(subject.substr(0, 1)); } function endsWith (subject, candidates) { return candidates.includes(subject.substr(-1)); } function getUTF16FromCodePoint (cp) { const p = (cp & 0x1f0000) >> 16; const o = cp & 0xffff; if (p) { return String.fromCharCode(0xd800 | ((p - 1) > 10)) + String.fromCharCode(0xdc00 | (o & 0x03ff)); } else { return String.fromCharCode(o); } } function getCodePointString (cp, type) { let result = ''; if (cp 0) { if (prevIndex 0) { if (prevIndex { return String.fromCharCode(CODE_OFFSET + wordFinder(cp)); }).join('') + String.fromCharCode(CODE_OFFSET + WBP.EOT); let rawIndex = 0; for ( let i = 0, goal = nextProps.length; i 0 && isInScriptWord(prevProps, nextProps, codePoints[i - 1], codePoints[i])) continue; if (prevIndex > 0; rawIndex = this[middle].rawIndex; length = this[middle].text.length; if (rawIndex + length - 1 {return {...c}}); } else { result = buildWordClusters(resolveSurrogates(s), useScripts); cache.set(s, result); } Object.defineProperty(result, 'wordIndexOf', { value: wordIndexOf }); return result; } function isInScriptWord (prev, next, prevcp, nextcp) { prev = prev.substr(-1); next = next.charAt(0); // Space × Space if (prev == 's' && next == 's') return true; // !Space ÷ Space if (prev != 's' && next == 's') return false; // Space ÷ !Space if (prev == 's' && next != 's') return false; if (/[ab]/.test(prev) || /[ab]/.test(next)) return false; return scriptFinder(prevcp) == scriptFinder(nextcp); } /* * sentence boundary handling functions/ */ function buildSentenceClusters (codePoints) { const CODE_OFFSET = 96; const result = []; let prevIndex = 0; let prevProps = ''; let nextProps = codePoints.map(cp => { return String.fromCharCode(CODE_OFFSET + sentenceFinder(cp)); }).join('') + String.fromCharCode(CODE_OFFSET + SBP.EOT); let rawIndex = 0; for ( let i = 0, goal = nextProps.length; i { let prop = lineBreakFinder(cp); // Assign a line breaking class to each code point of the input. // Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes // depending on criteria outside the scope of this algorithm. // // AI, SG, XX -> AL // SA (Only Mn or Mc) -> CM // SA (Except Mn or MC) -> AL // CJ -> NS switch (prop) { case 23: case 41: case 0: prop = 11; break; case 26: prop = /\p{gc=Mn}|\p{gc=Mc}/u.test(String.fromCodePoint(cp)) ? 3 : 11; break; case 38: prop = 30; break; } return String.fromCharCode(CODE_OFFSET + prop); }).join('') + String.fromCharCode(CODE_OFFSET + LBP.EOT); let rawIndex = 0; for ( let i = 0, goal = nextProps.length; i { return String.fromCodePoint(parseInt($2, /^x/i.test($1) ? 16 : 10)); }) } if (options.ansi) { const pattern = /\u001b\[.*?[\u0040-\u007e]|\u001b[\]P].+?(?:\u0007|\u001b\\)|\u001b[\u0040-\u005f]/g; let re, plainIndex = 0; while ((re = pattern.exec(s)) !== null) { if (re.index > plainIndex) { result += getColumnsFor.plain(s.substring(plainIndex, re.index), options.awidth); } plainIndex = re.index + re[0].length; } if (plainIndex { /* * special handling of halfwidth katakana voiced/semi-voiced marks: * should be considered as a single grapheme, not a modifier. * * TBD: there may be other characters that require special handling */ s = s.replace(/[\uff9e\uff9f]/g, '_'); const oldawidth = eawMap[2]; let result = 0; if (awidth) { eawMap[2] = awidth; } Unistring(s).forEach(clusters => { result += eawMap[eastAsianWidthFinder(clusters.codePoints[0])] || 0; }); eawMap[2] = oldawidth; return result; }; function normalizeHyperlinks (lines) { const pattern = /\x1b\]8;([^;]*);([^\x07]*)\x07/g; const lastLinkStart = { line: -1, index: -1, length: -1, p2: null, p3: null }; for (let i = 0; i = 0) { // single line link if (lastLinkStart.line == i) { ; } // multiple line link else { const {line, index, length, p2, p3} = lastLinkStart; const linkId = p2 || `id=_${Date.now()}_${linkCount++}`; const linkStart = `\x1b]8;${linkId};${p3}\x07`; const linkEnd = `\x1b]8;;\x07`; // update middle lines for (let j = i - 1; j > line; j--) { lines[j] = `${linkStart}${lines[j]}${linkEnd}`; } // update head line lines[line] = `${lines[line].substring(0, index)}${linkStart}${lines[line].substring(index + length)}${linkEnd}`; // update bottom line lines[i] = `${linkStart}${lines[i]}`; pattern.lastIndex += linkStart.length; } lastLinkStart.line = -1; lastLinkStart.index = -1; lastLinkStart.length = -1; lastLinkStart.p2 = null; lastLinkStart.p3 = null; } } } return lines; } function divideByColumns (s, columns, options = {}) { if (options.characterReference) { s = s.replace(/([xX])?([^;]+);/g, ($0, $1, $2) => { return String.fromCodePoint(parseInt($2, /^x/i.test($1) ? 16 : 10)); }) } if (columns plainIndex) { Unistring(s.substring(plainIndex, re.index)).forEach(cluster => { clusters.push([ cluster.rawString, getColumnsFor.plain(cluster.rawString, options.awidth) ]); }); } clusters.push([re[0], 0]); plainIndex = re.index + re[0].length; } if (plainIndex { clusters.push([ cluster.rawString, getColumnsFor.plain(cluster.rawString, options.awidth) ]); }); } let result = ''; let leftColumns = 0; for (let i = 0; i columns) { return normalizeHyperlinks([ clusters.slice(0, i).map(c => c[0]).join(''), clusters.slice(i).map(c => c[0]).join('') ]); } leftColumns += graphemeColumn; } return [s, '']; } else { return divideByColumns.plain(s, columns, options.awidth); } } divideByColumns.plain = (s, columns, awidth) => { if (columns columns) { return [ u.slice(0, i).toString(), u.slice(i).toString() ]; } leftColumns += graphemeColumn; } return [s, '']; }; function getLineBreakableClusters (s) { if (lineBreakableClusterCache.has(s)) { return lineBreakableClusterCache.get(s).map(c => {return {...c}}); } else { const result = buildLineBreakableClusters(resolveSurrogates(s)); lineBreakableClusterCache.set(s, result); return result; } } function getFoldedLines (s, options = {}) { function fetchPlainClusters (line) { const result = []; const clusters = getLineBreakableClusters(line); for (const cluster of clusters) { result.push([cluster.text, getColumnsFor.plain(cluster.text, options.awidth)]); } return result; } function fetchAnsiClusters (line) { const result = []; /* * group 1: SGR reset sequence * ESC [ m * * group 2: SGR (Select Graphics Rendition) sequences * ESC [ ... m * * group 3-1: Other CSI (Control Sequence Introducer) sequences, except SGR * ESC [ ... * * group 3-2: OSC (Operation System Command) sequences * or DCS (Device Control String) sequences * ST (String Terminator): * BEL | ( ESC \ ) * OSC sequences * ESC ] ... ST * DCS sequences * ESC P ... ST * * group 3-3: Other Fe sequences, except OSC,DCS * ESC ... */ const pattern = /(\u001b\[0*m)|(\u001b\[.*?m)|(\u001b\[.*?[\u0040-\u007e]|\u001b[\]P].+?(?:\u0007|\u001b\\)|\u001b[\u0040-\u005f])/g; let re, plainIndex = 0; while ((re = pattern.exec(line)) !== null) { if (re.index > plainIndex) { const clusters = getLineBreakableClusters(line.substring(plainIndex, re.index)); for (const cluster of clusters) { result.push([cluster.text, getColumnsFor.plain(cluster.text, options.awidth)]); } } if (re[1]) { result.push([re[1], 0, 2]); } else if (re[2]) { result.push([re[2], 0, 1]); } else { result.push([re[3], 0]); } plainIndex = re.index + re[0].length; } if (plainIndex { return String.fromCodePoint(parseInt($2, /^x/i.test($1) ? 16 : 10)); }) ); } function fetchAnsiCharRefClusters (line) { return fetchAnsiClusters( line.replace(/([xX])?([^;]+);/g, ($0, $1, $2) => { return String.fromCodePoint(parseInt($2, /^x/i.test($1) ? 16 : 10)); }) ); } function esc (s) { return s .replace(/[\x00-\x1f]/g, $0 => { return '\x1b[1;36m^' + String.fromCharCode($0.charCodeAt(0) + 64) + '\x1b[m'; }); } const columnsSource = options.columns || 80; const result = []; let fetchClusters = fetchPlainClusters; let fetchColumns; if (Array.isArray(columnsSource) && columnsSource.length) { fetchColumns = () => { const index = result.length; const columns = index columnsSource; } if (options.ansi && options.characterReference) { fetchClusters = fetchAnsiCharRefClusters; } else if (options.ansi) { fetchClusters = fetchAnsiClusters; } else if (options.characterReference) { fetchClusters = fetchCharRefClusters; } while (s != '') { let line = /^(.*?)(\r?\n)/.exec(s), newline; if (line) { s = s.substring(line[0].length); newline = line[2]; line = line[1]; } else { newline = ''; line = s; s = ''; } const breakableClusters = fetchClusters(line); let lineColumns = 0; let lineFragment = ''; let sgrSequence = ''; let columns = fetchColumns(); for (let i = 0; i columns) { if (clusterColumns > columns) { let [lead, rest] = divideByColumns.plain( clusterText, columns - lineColumns, options.awidth); if (sgrSequence !== '') { result.push(lineFragment + lead + '\u001b[m'); } else { result.push(lineFragment + lead); } if (rest != '') { breakableClusters.splice( i + 1, 0, [rest, getColumnsFor.plain(rest, options.awidth)]); } lineColumns = 0; lineFragment = sgrSequence; } else { if (sgrSequence !== '') { result.push(lineFragment + '\u001b[m'); } else { result.push(lineFragment); } lineColumns = clusterColumns; lineFragment = sgrSequence + clusterText; } columns = fetchColumns(); } else { lineColumns += clusterColumns; lineFragment += clusterText; } } if (lineFragment !== '') { result.push(lineFragment); } if (newline !== '') { if (result.length && breakableClusters.length) { result[result.length - 1] += newline; } else { result.push(newline); } } } return options.ansi ? normalizeHyperlinks(result) : result; } /* * Grapheme class */ function Grapheme (codePoints, rawIndex) { if (codePoints != undefined) { this.codePoints = codePoints; this.updateRawString(); } if (rawIndex != undefined) { this.rawIndex = rawIndex; } } Grapheme.prototype = { toString: function () { return this.rawString; }, clone: function () { const result = new Grapheme; result.codePoints = this.codePoints.slice(); result.rawString = this.rawString; result.rawIndex = this.rawIndex; return result; }, updateRawString: function () { this.rawString = this.codePoints.reduce((result, cp) => { return result + getUTF16FromCodePoint(cp); }, ''); }, dump: function (detail) { if (detail) { const log = []; log.push('codePoints: [' + this.codePoints.map(cp => { return getCodePointString(cp, 'unicode'); }).join(', ') + ']'); log.push(' rawIndex: ' + this.rawIndex); log.push(' rawString: (' + this.rawString.length + ') "' + this.rawString + '"'); return log.join('\n'); } else { return this.codePoints .map(getCodePointString) .join(' × '); } } }; /* * Unistring class */ function Unistring (s) { if (!(this instanceof Unistring)) { return new Unistring(s); } if (typeof s == 'string') { if (graphemeClusterCache.has(s)) { this.clusters = graphemeClusterCache.get(s).map(g => g.clone()); } else { graphemeClusterCache.set( s, this.clusters = buildGraphemeClusters(resolveSurrogates(s))); } } else if (s instanceof Array) { this.clusters = []; let rawIndex = 0; for (let i = 0, goal = s.length; i { log.push('*** Grapheme Cluster #' + index + ' ***'); log.push(g.dump(detail)); }); return log.join('\n'); } else { return '÷ ' + this.clusters .map(g => g.dump(detail)) .join(' ÷ ') + ' ÷'; } }, toString: function () { return this.clusters.reduce((result, g) => result + g.toString(), ''); }, delete: function (start, length) { start = this._ensureIndex(start); if (length == undefined || start + length > this.clusters.length) { length = this.clusters.length - start; } length = Math.max(0, length); let delta = 0; for (let i = start, goal = start + length; i = this.clusters.length) return undefined; return this.clusters[index].codePoints; }, clusterAt: function (index) { return this.rawStringAt.apply(this, arguments); }, rawStringAt: function (index) { index = this._ensureIndex(index); if (index = this.clusters.length) return ''; return this.clusters[index].rawString; }, rawIndexAt: function (index) { index = this._ensureIndex(index); if (index this.clusters.length) return NaN; if (index == this.clusters.length) { return this.clusters[index - 1].rawIndex + this.clusters[index - 1].rawString.length; } return this.clusters[index].rawIndex; }, forEach: function () { this.clusters.forEach.apply(this.clusters, arguments); }, map: function () { return this.clusters.map.apply(this.clusters, arguments); }, getClusterIndexFromUTF16Index: function (index) { let left = 0, right = this.clusters.length - 1; let middle, rawIndex, length; if (right >= 0 && index == this.clusters[right].rawIndex + this.clusters[right].rawString.length) { return right + 1; } while (left > 0; rawIndex = this.clusters[middle].rawIndex; length = this.clusters[middle].rawString.length; if (rawIndex + length - 1 = this.clusters.length) return ''; return this.clusters[index].rawString.charAt(0); }, charCodeAt: function (index) { if (index = this.clusters.length) return NaN; return this.clusters[index].codePoints[0]; }, substring: function (start, end) { if (start == undefined) { start = 0; } if (end == undefined) { end = this.clusters.length; } start = Math.max(0, Math.min(start, this.clusters.length)); end = Math.max(0, Math.min(end, this.clusters.length)); if (start > end) { const tmp = start; start = end; end = tmp; } return new Unistring(this.clusters.slice(start, end)); }, substr: function (start, length) { start = this._ensureIndex(start); if (length == undefined || start + length > this.clusters.length) { length = this.clusters.length - start; } if (length = 0) { while (clusterIndex = this.clusters.length) { return -1; } if (this.substr(clusterIndex, s.length).toString() == part) { return clusterIndex; } rawIndex++; } return -1; }, lastIndexOf: function (s) { s = this._toUnistring(s, 'lastIndexOf'); const whole = this.toString(); const part = s.toString(); let rawIndex = whole.length - 1; let clusterIndex = this.clusters.length - 1; while (rawIndex >= 0 && (rawIndex = whole.lastIndexOf(part, rawIndex)) >= 0) { while (clusterIndex >= 0 && this.clusters[clusterIndex].rawIndex > rawIndex) { clusterIndex--; } if (clusterIndex { return eawMap[2]; }, set: value => { if (value === 1 || value === 2) { eawMap[2] = value; } } }, printCacheStatus: { value: () => { console.log([ `clusterCache.size: ${clusterCache.size}`, ` request count: ${clusterCacheRequestCount}`, ` hit count: ${clusterCacheHitCount}`, ` miss count: ${clusterCacheMissCount}` ].join('\n')); } } }); export default Unistring; // vim:set ts=4 sw=4 fenc=UTF-8 ff=unix ft=javascript fdm=marker fmr=>> :