Корисник:Kephir/Unicode

Below are scripts I use for maintaining the Unicode database at Module:Unicode data.

How to use scripts on this page

уреди

Ingredients: GNU Make, GNU awk, GNU wget.

  1. Put the scripts below in a dedicated directory.
  2. Run make clean, then make.
  3. The Unicode database will be downloaded, and several .lua files will be generated from it. Save them in appropriate locations as specified in the table below.
  4. Update the Unicode version number wherever you find it (e.g. in Module:character list).
  5. Done. Throw a drinking party.
File name | Wiktionary page
aliases.lua | Модул:Unicode data/aliases
blocks.lua | the blocks table in Module:Unicode data
combining.lua | Модул:Unicode data/combining
control.lua | Модул:Unicode data/control (includes Cx and Zx)
names_XXX.lua | Module:Unicode data/names/XXX (XXX is the code point divided by 0x1000, written as three uppercase hex digits)

Скрипте

уреди
Makefile
# Generates the Lua data tables from the Unicode Character Database files.
# Needs GNU Make, GNU awk (gawk) and GNU wget.

# Delete a target whose recipe failed, so that an empty download left behind
# by `wget -O` or a half-written .lua file is never treated as up to date.
.DELETE_ON_ERROR:

.PHONY: all names clean update

# Default goal: build every generated table.
all: blocks.lua control.lua aliases.lua combining.lua names

# Remove all generated tables and all downloaded data files.
clean:
	rm -f *.lua *.txt

# names.awk writes its names_XXX.lua output files itself, so this recipe has
# no redirect and the goal is phony (it is re-run on every invocation).
names: names.awk UnicodeData.txt
	gawk -F';' -f $< $(filter-out $<,$^)

# Build everything, then hand over to the local ./upload script.
update: all
	./upload

# Generic table rule: run the awk script named after the target over the data
# files that the prerequisite-only rules below attach to it.
# $< is the script; the remaining prerequisites are its input files.
%.lua: %.awk
	gawk -F';' -f $< $(filter-out $<,$^) >'$@'

# Files that live in the "extracted" subdirectory of the database.
Derived%.txt:
	wget 'http://unicode.org/Public/UNIDATA/extracted/$@' -O '$@'

# Every other database file.
%.txt:
	wget 'http://unicode.org/Public/UNIDATA/$@' -O '$@'

# Input data file for each table (the recipe comes from the %.lua rule).
blocks.lua: Blocks.txt

control.lua: DerivedGeneralCategory.txt

aliases.lua: NameAliases.txt

combining.lua: DerivedCombiningClass.txt

# The per-page name tables are by-products of the `names` goal. The empty
# recipe (";") is required: a pattern rule without any recipe would only
# cancel an implicit rule instead of defining one.
names_%.lua: names ;
aliases.awk
# aliases.awk -- turn ';'-separated alias records into a Lua table.
#
# The field separator is supplied by the caller (the Makefile runs this
# script with -F';'). For every record that starts with a hex digit:
#   $1 = code point (hex), $2 = alias text, $3 = alias type
# The alias type is emitted unquoted, so it resolves to one of the Lua
# locals declared at the top of the generated file.

/^[0-9A-F]/ {
	cp = strtonum("0x" $1)
	# count[cp] is the next free slot for this code point; an explicit
	# counter keeps the aliases in input order without calling length()
	# on a subarray that may not exist yet.
	aliases[cp][count[cp]++] = $2 ";" $3
}

END {
	print "local correction, control, alternate, figment, abbreviation = "
	print "\t\"correction\", \"control\", \"alternate\", \"figment\", \"abbreviation\""
	print ""
	print "return {"
	# Walk the whole code space in ascending order, U+10FFFF included.
	for (cp = 0; cp <= 0x10ffff; ++cp) {
		if (!(cp in aliases))
			continue
		printf "\t[0x%06x] = {\n", cp
		# Index loop instead of "for (i in ...)": the latter visits the
		# elements in an unspecified order.
		for (i = 0; i < count[cp]; ++i) {
			split(aliases[cp][i], fields, ";")
			# fields[1] is the alias text, fields[2] the alias type.
			printf "\t\t{ %12s, \"%s\" };\n", fields[2], fields[1]
		}
		printf "\t};\n"
	}
	print "}"
}
names.awk
# names.awk -- split character names into one Lua table per 0x1000-wide page.
#
# The field separator is supplied by the caller (the Makefile runs this
# script with -F';'). For every record that starts with a hex digit and whose
# second field does not start with "<":
#   $1 = code point (hex), $2 = character name
# Output goes to names_XXX.lua, where XXX is int(code point / 0x1000) as
# three uppercase hex digits.
# NOTE(review): each page file is truncated when it is first opened, so the
# input is assumed to be sorted by code point -- confirm for the data file.

BEGIN {
	out = ""	# name of the page file currently being written, if any
}

/^[0-9A-Fa-f]/ && !($2 ~ /^</) {
	cp = strtonum("0x" $1)
	file = "names_" sprintf("%03X", int(cp / 0x1000)) ".lua"
	if (file != out) {
		# Entering a new page: finish and release the previous file so
		# that only one output file is open at a time.
		if (out != "") {
			print "}" >> out
			close(out)
		}
		print "return {" > file
		out = file
	}
	printf("\t[ 0x%04X ] = \"%s\",\n", cp, $2) >> out
}

END {
	# Terminate the last table; write nothing if no record matched at all.
	if (out != "") {
		print "}" >> out
		close(out)
	}
}
blocks.awk
# blocks.awk -- print a Lua "blocks" table with one row per input record.
#
# Records that start with a hex digit are read as
#   first..last; block name
# and become rows of the form { "block name", 0xFIRST, 0xLAST }.

BEGIN {
	# Fields are separated by ".." or ";", including any spaces around
	# them; this replaces whatever separator was given on the command line.
	FS=" *(\\.\\.|;) *"
	print "local blocks = {"
}

/^[0-9A-Fa-f]/ {
	first = strtonum("0x"$1)
	final = strtonum("0x"$2)
	# Quoted name plus trailing comma, left-aligned in a 50-column cell.
	label = "\"" $3 "\","
	printf "\t{ %-50s 0x%06X, 0x%06X },\n", label, first, final
}

END {
	print "}"
}
combining.awk
# combining.awk -- collect non-zero combining classes into a Lua table.
#
# Records are split on "..", ";" and "#". A record that starts with a hex
# digit is treated as
#   a range  (first..last ; class)  when its third field is all digits,
#   a single (code point ; class)   otherwise.
# Class "0" is skipped in both cases. The result has two parts:
#   single = { [code point] = class, ... }
#   ranges = { { first, last, class }, ... }

BEGIN {
	# Replaces whatever separator was given on the command line.
	FS="\\.\\.| *[;#] *"
}

# Ignore everything that does not start with a code point.
!($1 ~ /^[0-9A-F]/) {
	next
}

{
	start = strtonum("0x" $1)
	if ($3 ~ /^[0-9]*$/) {
		# Range record: $2 is the last code point, $3 the class.
		if ($3 == "0")
			next
		ranges[start] = strtonum("0x" $2)
		kinds[start] = $3
	} else {
		# Single record: $2 is the class.
		if ($2 == "0")
			next
		singles[start] = $2
	}
	next
}

END {
	print "return {"
	print "\tsingle = {"
	# Ascending walk over the whole code space, U+10FFFF included.
	for (i = 0; i <= 0x10ffff; ++i) {
		if (i in singles)
			printf("\t\t[0x%06X] = %4s\n", i, singles[i] ",")
	}
	print "\t};"
	print "\tranges = {"
	for (i = 0; i <= 0x10ffff; ++i) {
		if (i in ranges)
			printf("\t\t{ 0x%06X, 0x%06X, %3s },\n", i, ranges[i], kinds[i])
	}
	print "\t};"
	print "}"
}
control.awk
# control.awk -- collect the C* and Z* category entries into a Lua table.
#
# Records are split on "..", ";" and "#". For a record that starts with a
# hex digit:
#   if $2 looks like a category (C or Z plus a lowercase letter), $1 is
#   stored as a single code point with that category;
#   if $3 looks like such a category, $1..$2 is stored as a range.
# Categories are emitted unquoted, so they resolve to the Lua locals
# declared at the top of the generated file.

BEGIN {
	# Replaces whatever separator was given on the command line.
	FS="\\.\\.| *[;#] *"
}

# Ignore everything that does not start with a code point.
!($1 ~ /^[0-9A-F]/) {
	next
}

($2 ~ /^[CZ][a-z]/) {
	singles[strtonum("0x" $1)] = $2
}

($3 ~ /^[CZ][a-z]/) {
	ranges[strtonum("0x" $1)] = strtonum("0x" $2)
	kinds[strtonum("0x" $1)] = $3
}

END {
	print "local Cc, Cf, Cs, Co, Cn ="
	print "\t\"control\", \"format\", \"surrogate\", \"private-use\", \"unassigned\""
	print "local Zs, Zl, Zp ="
	print "\t\"space-separator\", \"line-separator\", \"paragraph-separator\""
	print ""
	print "return {"
	print "\tsingle = {"
	# Ascending walk over the whole code space, U+10FFFF included.
	# "in" tests membership without creating an element; a plain
	# singles[i] reference would add an empty entry for every code point.
	for (i = 0; i <= 0x10ffff; ++i) {
		if (i in singles)
			printf("\t\t[0x%06X] = %s,\n", i, singles[i])
	}
	print "\t};"
	print "\tranges = {"
	for (i = 0; i <= 0x10ffff; ++i) {
		if (i in ranges)
			printf("\t\t{ 0x%06X, 0x%06X, %s },\n", i, ranges[i], kinds[i])
	}
	print "\t};"
	print "}"
}