Add CoNLL-U language support, see PrismJS#3790

Querela · Jul 11, 2024 · 768c8f1 · 768c8f1
1 parent 59e5a34
commit 768c8f1
Show file tree

Hide file tree

Showing 9 changed files with 1,561 additions and 0 deletions.
diff --git a/components.json b/components.json
@@ -342,6 +342,10 @@
 			"alias": "conc",
 			"owner": "jasontatton"
 		},
+		"conllu": {
+			"title": "CoNLL-U",
+			"owner": "Querela"
+		},
 		"csp": {
 			"title": "Content-Security-Policy",
 			"owner": "ScottHelme"

diff --git a/components/prism-conllu.js b/components/prism-conllu.js
@@ -0,0 +1,162 @@
+(function (Prism) {
+
+	Prism.languages.conllu = {
+		// comment lines
+		comment: {
+			pattern: /#(?:[^\n])*/,
+			inside: {
+				metadata: {
+					pattern: /(?:\w+)\s*=\s*.*/,
+					inside: {
+						key: {
+							pattern: /\w+(?=\s*=)/,
+							alias: 'property',
+						},
+						value: {
+							pattern: /(\s*=\s*)\S.*$/,
+							lookbehind: true,
+							alias: 'string',
+						},
+						operator: /[=]/,
+					}
+				},
+				punctuation: /^#/,
+			}
+		},
+		// separator between two sentence blocks
+		"sentence-separator": {
+			pattern: /(\r?\n)(?=\r?\n)/s,
+			lookbehind: true,
+		},
+		// word lines
+		token: {
+			pattern: /.+/,
+			inside: {
+				id: {
+					pattern: /^\d+(?:[.-]\d+)?/,
+					alias: 'number',
+				},
+				// form / lemma / upos / xpos / feats / head / deprel / deps / misc
+				value: {
+					pattern: /^(\t)[^\t]*(?=\t|$)/,
+					lookbehind: true,
+					// alias: 'string',
+					// inside: {
+					// 	unspecified: /_/,
+					// }
+				},
+			},
+		},
+	};
+
+	const featKeyExp = /[A-Z][A-Za-z0-9]*(?:\[[a-z0-9]+\])?/;
+	const featValueExp = /.+/; // we just want everything here ... not /[A-Z0-9][A-Za-z0-9]*/;
+	const featsGrammar = {
+		punctuation: /\|/,
+		feature: {
+			pattern: RegExp('^' + featKeyExp.source + '=' + '.*' + '$'),
+			inside: {
+				key: {
+					pattern: RegExp(featKeyExp.source + '(?==)'), // /\w+(?==)/,
+					alias: 'property',
+				},
+				value: [
+					{
+						pattern: /(=)(?:yes|no)$/i,
+						lookbehind: true,
+						alias: 'boolean',
+					}, {
+						pattern: RegExp('(=)' + featValueExp.source + '$'), // /(=).+$/,
+						lookbehind: true,
+						alias: 'string',
+					}
+				],
+				operator: /=/,
+			},
+		},
+	};
+
+	const relationExp = /^[a-z]+(:[a-z]+)?(:[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(_[\p{Ll}\p{Lm}\p{Lo}\p{M}]+)*)?(:[a-z]+)?$/;
+	const depsGrammar = {
+		punctuation: /\|/,
+		dep: {
+			pattern: /^\S+$/,
+			inside: {
+				head: {
+					pattern: /\d+(?=:)/,
+					alias: 'number',
+				},
+				punctuation: /^:/,
+				relation: {
+					pattern: /.+/, // we just capture everything, should be ok
+					alias: 'symbol',
+				},
+			}
+		},
+	}
+
+	// hook to assign roles to value fields
+	const entryTypes = ['form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'];
+	const entryTypesAlias = [null, null, 'symbol', 'symbol', null, 'number', 'symbol', null, null];
+	const entryTypeInside = [null, null, null, null, featsGrammar, null, null, depsGrammar, featsGrammar];
+	Prism.hooks.add('after-tokenize', function (env) {
+		if (env.language !== 'conllu') {
+			return;
+		}
+
+		for (const row of env.tokens) {
+			// go over each token row (if it is a "token" and not a comment/sentence-separator)
+			if (row.type === 'token') {
+				let entryTypeCounter = 0;
+				for (const field of row.content) {
+					// skip space between
+					if (typeof field === 'string') { continue; }
+					// only fields, not ids
+					if (field?.type !== 'value') { continue; }
+
+					if (field.alias === undefined) { field.alias = []; }
+					if (typeof field.alias === 'string') { field.alias = [field.alias]; }
+
+					// check if "_" value, and assign class
+					if (field.content === '_') {
+						field.alias.push('unspecified');
+					}
+
+					// assign role to value based on position
+					if (entryTypeCounter < entryTypes.length) {
+						// add "value" as one alias
+						field.alias.push(field.type);
+						// change field type
+						field.type = entryTypes[entryTypeCounter];
+						// add alias if available
+						if (entryTypesAlias[entryTypeCounter] !== null) {
+							field.alias.push(entryTypesAlias[entryTypeCounter]);
+						} else if (entryTypeInside[entryTypeCounter] === null) {
+							// only assign string if there is no inner processing?
+							field.alias.push('string');
+						}
+
+						// run inner processing only for selected types!
+						if (field.content !== '_' && entryTypeInside[entryTypeCounter] !== null) {
+							field.content = Prism.tokenize(field.content, entryTypeInside[entryTypeCounter]);
+						}
+					}
+
+					entryTypeCounter++;
+				}
+			}
+		}
+	});
+
+	// just to have the classes listed on /faq.html#how-do-i-know-which-tokens-i-can-style-for
+	// insert dummy rules that do not match anything
+	// TODO: unsure about possible performance hit? - there should not be anything left to match but regex matching steps increase linearly with input string length ...
+	// for (let index = 0; index < entryTypes.length; index++) {
+	// 	const entryType = entryTypes[index];
+	// 	const entryTypeAlias = entryTypesAlias[index];
+	// 	const name = 'value.' + entryType + (entryTypeAlias !== null ? '.' + entryTypeAlias : '');
+	// 	// use some invalid pattern
+	// 	Prism.languages.conllu.token.inside[name] = /\b\B/;
+	// }
+
+}(Prism));
diff --git a/examples/prism-conllu.html b/examples/prism-conllu.html
@@ -0,0 +1,119 @@
+<p>Full details can be found at <a href="https://universaldependencies.org/format.html" target="_blank">Universal Dependencies - Format</a>.</p>
+
+<h2>Comments</h2>
+
+<pre><code># sent_id = 2
+# text = I have no clue.
+# or a simple string</code></pre>
+
+<h2>Full Example</h2>
+
+<pre><code># sent_id = 2
+# text = I have no clue.
+1	I	I	PRON	PRP	Case=Nom|Number=Sing|Person=1	2	nsubj	_	_
+2	have	have	VERB	VBP	Number=Sing|Person=1|Tense=Pres	0	root	_	_
+3	no	no	DET	DT	PronType=Neg	4	det	_	_
+4	clue	clue	NOUN	NN	Number=Sing	2	obj	_	SpaceAfter=No
+5	.	.	PUNCT	.	_	2	punct	_	_</code></pre>
+
+<h2>Words, Tokens and Empty Nodes</h2>
+
+<pre><code>1-2	vámonos	_
+1	vamos	ir
+2	nos	nosotros
+3-4	al	_
+3	a	a
+4	el	el
+5	mar	mar</code></pre>
+
+<pre><code>1	Sue	Sue
+2	likes	like
+3	coffee	coffee
+4	and	and
+5	Bill	Bill
+5.1	likes	like
+6	tea	tea</code></pre>
+
+<pre><code>1	nosotros	nosotros
+2	vamos	ir
+3-4	al	_
+3	a	a
+4	el	el
+5	mar	mar
+6	y	y
+7	vosotros	vosotros
+7.1	vais	ir
+8-9	al	_
+8	a	a
+9	el	el
+10	parque	parque</code></pre>
+
+<h2>Morphological Annotation</h2>
+
+<pre><code>1	Då	då	ADV	AB	_
+2	var	vara	VERB	VB.PRET.ACT	Tense=Past|Voice=Act
+3	han	han	PRON	PN.UTR.SIN.DEF.NOM	Case=Nom|Definite=Def|Gender=Com|Number=Sing
+4	elva	elva	NUM	RG.NOM	Case=Nom|NumType=Card
+5	år	år	NOUN	NN.NEU.PLU.IND.NOM	Case=Nom|Definite=Ind|Gender=Neut|Number=Plur
+6	.	.	PUNCT	DL.MAD	_</code></pre>
+
+<h2>Syntactic Annotation</h2>
+
+<pre><code>1	They	they	PRON	PRP	Case=Nom|Number=Plur	2	nsubj	2:nsubj|4:nsubj
+2	buy	buy	VERB	VBP	Number=Plur|Person=3|Tense=Pres	0	root	0:root
+3	and	and	CCONJ	CC	_	4	cc	4:cc
+4	sell	sell	VERB	VBP	Number=Plur|Person=3|Tense=Pres	2	conj	0:root|2:conj
+5	books	book	NOUN	NNS	Number=Plur	2	obj	2:obj|4:obj
+6	.	.	PUNCT	.	_	2	punct	2:punct</code></pre>
+
+<h2>Untokenized Text</h2>
+
+<pre><code># text = Er arbeitet fürs FBI (deutsch etwa: „Bundesamt für Ermittlung“).
+# text_en = He works for the FBI (German approx: “Bundesamt für Ermittlung”).
+1	Er	er	PRON	…	_
+2	arbeitet	arbeiten	VERB	…	_
+3-4	fürs		_	_	…	_
+3	für	für	ADP	…	_
+4	das	der	DET	…	_
+5	FBI	FBI	PROPN	…	_
+6	(	(	PUNCT	…	SpaceAfter=No
+7	deutsch	deutsch	ADV	…	_
+8	etwa		etwa		ADV	…	SpaceAfter=No
+9	:	:	PUNCT	…	_
+10	„	„	PUNCT	…	SpaceAfter=No
+11	Bundesamt	Bundesamt	NOUN	…	_
+12	für	für	ADP	…	_
+13	Ermittlung	Ermittlung	NOUN	…	SpaceAfter=No
+14	“	“	PUNCT	…	SpaceAfter=No
+15	)	)	PUNCT	…	SpaceAfter=No
+16	.	.	PUNCT	…	_</code></pre>
+
+<h2>Sentence Boundaries and Comments</h2>
+
+<pre><code># sent_id = 1
+# text = They buy and sell books.
+1	They	they	PRON	PRP	Case=Nom|Number=Plur	2	nsubj	2:nsubj|4:nsubj	_
+2	buy	buy	VERB	VBP	Number=Plur|Person=3|Tense=Pres	0	root	0:root	_
+3	and	and	CCONJ	CC	_	4	cc	4:cc	_
+4	sell	sell	VERB	VBP	Number=Plur|Person=3|Tense=Pres	2	conj	0:root|2:conj	_
+5	books	book	NOUN	NNS	Number=Plur	2	obj	2:obj|4:obj	SpaceAfter=No
+6	.	.	PUNCT	.	_	2	punct	2:punct	_
+
+# sent_id = 2
+# text = I have no clue.
+1	I	I	PRON	PRP	Case=Nom|Number=Sing|Person=1	2	nsubj	_	_
+2	have	have	VERB	VBP	Number=Sing|Person=1|Tense=Pres	0	root	_	_
+3	no	no	DET	DT	PronType=Neg	4	det	_	_
+4	clue	clue	NOUN	NN	Number=Sing	2	obj	_	SpaceAfter=No
+5	.	.	PUNCT	.	_	2	punct	_	_
+
+# sent_id = panc0.s4
+# text = तत् यथानुश्रूयते।
+# translit = tat yathānuśrūyate.
+# text_fr = Voilà ce qui nous est parvenu par la tradition orale.
+# text_en = This is what is heard.
+1	तत्	तद्	DET	_	Case=Nom|…|PronType=Dem	3	nsubj	_	Translit=tat|LTranslit=tad|Gloss=it
+2-3	यथानुश्रूयते	_	_	_	_	_	_	_	SpaceAfter=No
+2	यथा	यथा	ADV	_	PronType=Rel	3	advmod	_	Translit=yathā|LTranslit=yathā|Gloss=how
+3	अनुश्रूयते	अनु-श्रु	VERB	_	Mood=Ind|…|Voice=Pass	0	root	_	Translit=anuśrūyate|LTranslit=anu-śru|Gloss=it-is-heard
+4	।	।	PUNCT	_	_	3	punct	_	Translit=.|LTranslit=.|Gloss=.</code></pre>
diff --git a/tests/languages/conllu/index_feature.test b/tests/languages/conllu/index_feature.test
@@ -0,0 +1,97 @@
+1-2	vámonos	_
+1	vamos	ir
+2	nos	nosotros
+3-4	al	_
+3	a	a
+4	el	el
+5	mar	mar
+
+1	Sue	Sue
+2	likes	like
+3	coffee	coffee
+4	and	and
+5	Bill	Bill
+5.1	likes	like
+6	tea	tea
+
+----------------------------------------------------
+
+[
+	["token", [
+		["id", "1-2"],
+		["form", "vámonos"],
+		["lemma", "_"]
+	]],
+	["token", [
+		["id", "1"],
+		["form", "vamos"],
+		["lemma", "ir"]
+	]],
+	["token", [
+		["id", "2"],
+		["form", "nos"],
+		["lemma", "nosotros"]
+	]],
+	["token", [
+		["id", "3-4"],
+		["form", "al"],
+		["lemma", "_"]
+	]],
+	["token", [
+		["id", "3"],
+		["form", "a"],
+		["lemma", "a"]
+	]],
+	["token", [
+		["id", "4"],
+		["form", "el"],
+		["lemma", "el"]
+	]],
+	["token", [
+		["id", "5"],
+		["form", "mar"],
+		["lemma", "mar"]
+	]],
+	["sentence-separator", ""],
+	["token", [
+		["id", "1"],
+		["form", "Sue"],
+		["lemma", "Sue"]
+	]],
+	["token", [
+		["id", "2"],
+		["form", "likes"],
+		["lemma", "like"]
+	]],
+	["token", [
+		["id", "3"],
+		["form", "coffee"],
+		["lemma", "coffee"]
+	]],
+	["token", [
+		["id", "4"],
+		["form", "and"],
+		["lemma", "and"]
+	]],
+	["token", [
+		["id", "5"],
+		["form", "Bill"],
+		["lemma", "Bill"]
+	]],
+	["token", [
+		["id", "5.1"],
+		["form", "likes"],
+		["lemma", "like"]
+	]],
+	["token", [
+		["id", "6"],
+		["form", "tea"],
+		["lemma", "tea"]
+	]]
+]
+
+----------------------------------------------------
+
+Testing indexing schemes.
+
+https://universaldependencies.org/format.html