From: Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de>
To: ntg-context@ntg.nl
Subject: polish sorting
Date: Wed, 18 Aug 2010 18:08:56 +0200 [thread overview]
Message-ID: <20100818160856.GB13324@aides> (raw)
[-- Attachment #1.1.1: Type: text/plain, Size: 904 bytes --]
Hi,
I'm creating some sorting tables. While researching this topic I
stumbled on the Polish dictionary sorting rules: if two strings are
equal except for case, then the one that begins with a lowercase
letter takes precedence.[1] (This seems to apply to the Swedish order as well, but I
have no means to verify that. Apparently, my German dictionary (from
1991) follows the same rule without explicitly stating so.)
Context seems to prefer it the other way round, so I modified two
functions from sort-ini.lua to handle that; but I'm not happy with
this solution.
So my question: is there already, or could we have some mechanism
to influence the details of sorting in context?
Thanks for your help,
Philipp
[1] <ftp://ftp.gust.org.pl/pub/GUST/bulletin/03/02-bl.pdf>, p. 7.
--
() ascii ribbon campaign - against html e-mail
/\ www.asciiribbon.org - against proprietary attachments
[-- Attachment #1.1.2: playground.lua --]
[-- Type: text/plain, Size: 2080 bytes --]
--- testing environment for sorters
-- Load the Polish sorter definitions, then create the scratch namespace
-- (with its empty word list) that the functions below read and fill.
dofile("polishsort.lua")
document.whatever = { words = { } }
local my = {}

--- Replace every character of `s` that occurs in the set `patt` by `repl`.
-- @param s     subject string
-- @param patt  string whose characters form the set handed to lpeg.S
-- @param repl  replacement applied to each matched character
-- @return      result of matching the substitution pattern against `s`
function my.gsub (s, patt, repl)
    local charset = lpeg.S(patt)
    -- either a character from the set (replaced) or any other single
    -- character (kept), repeated over the whole subject
    local substitution = lpeg.Cs((charset / repl + 1)^0)
    return lpeg.match(substitution, s)
end
--- based on http://www.mail-archive.com/ntg-context@ntg.nl/msg47525.html
-- Cut document.whatever.text into words, append them to
-- document.whatever.words, attach a split to every stored word and sort
-- the list with the Polish comparer.
function document.whatever.sorttext()
    local whatever = document.whatever
    -- alternative splitter: sorters.splitters.utf
    local splitter = sorters.splitters.utflower

    -- newline, tab, vertical tab and double quote all become a space
    local flattened = my.gsub(whatever.text, '\n\t\v"', " ")
    local pieces = string.explode(flattened, " +")

    local collected = whatever.words
    for index = 1, #pieces do
        local stripped = string.strip(pieces[index])
        if stripped ~= "" then
            collected[#collected + 1] = { word = stripped }
        end
    end

    -- every stored entry gets a (fresh) split, not only the new ones
    for index = 1, #collected do
        local entry = collected[index]
        entry.split = splitter(entry.word)
    end

    -- alternative comparer: sorters.comparers.basic
    sorters.sort(collected, sorters.comparers.polish)
end
--- Typeset the word list: a section heading whenever the second value
-- returned by sorters.firstofsplit changes, followed by each word that
-- differs from the one directly before it, prefixed with its list index.
function document.whatever.flushtext()
    local entries = document.whatever.words
    local last_key, last_word = false, false
    for index = 1, #entries do
        local entry = entries[index]
        local first, key = sorters.firstofsplit(entry)
        local heading = utf.lower(first)
        if key ~= last_key then
            last_key = key
            context.section(heading)
        end
        local text = entry.word
        if text ~= last_word then
            context(tostring(index) .. ": " .. text)
            context.par()
            last_word = text
        end
    end
end
--- Run the sorting demo for one language: install the sample text, select
-- the sorter language and typeset the sorted word list as a document.
-- @param lang  language tag passed to sorters.setlanguage
function testrun (lang)
    local whatever = document.whatever
    -- earlier experiments read the text from a file instead:
    --f = assert(io.open("anna-utf.txt", "r"))
    --f = assert(io.open("sltext.txt", "r"))
    whatever.text = [[
polskie słowa dziwnie się szereguje
Polskie Słowa Dziwnie Się Szereguje
]]
    sorters.setlanguage(lang)
    context.starttext()
    whatever.sorttext()
    whatever.flushtext()
    context.stoptext()
end

testrun("pl")
[-- Attachment #1.1.3: polishsort.lua --]
[-- Type: text/plain, Size: 4997 bytes --]
--- Polish sorting (including the letters q, v, x)
sorters.replacements["pl"] = {}

do
    -- The alphabet in collation order: every letter is its own entry and
    -- its position in this list is its mapping value.
    local alphabet = {
        "a", "ą", "b", "c", "ć", "d", "e", "ę", "f", "g",
        "h", "i", "j", "k", "l", "ł", "m", "n", "ń", "o",
        "ó", "p", "q", "r", "s", "ś", "t", "u", "v", "w",
        "x", "y", "z", "ź", "ż",
    }
    local entries, mappings = { }, { }
    for rank = 1, #alphabet do
        local letter = alphabet[rank]
        entries[letter] = letter
        mappings[letter] = rank
    end
    sorters.entries["pl"] = entries
    sorters.mappings["pl"] = mappings
end

-- File-level shortcuts used by the splitter further down.
local currentreplacements = sorters.replacements["pl"] or {}
local currentmappings = sorters.mappings["pl"] or {}
local currententries = sorters.entries["pl"] or {}
local utfcharacters = string.utfcharacters
local utfbyte = utf.byte
-- Local comparison helper; it has to live in this file's scope so the
-- comparer defined below can call it.
--- Three-way comparison of two arrays of comparable values.
-- Elements are compared pairwise over the common prefix; if that prefix is
-- identical the longer array sorts after the shorter one.
-- @param sort_a  first array (may be nil)
-- @param sort_b  second array (may be nil)
-- @return  1 if sort_a sorts after sort_b, -1 if before, 0 if equal or if
--          either argument is missing
local function basicsort(sort_a,sort_b)
    if not (sort_a and sort_b) then
        return 0
    end
    local len_a, len_b = #sort_a, #sort_b
    local shared = len_a < len_b and len_a or len_b
    for pos = 1, shared do
        local left, right = sort_a[pos], sort_b[pos]
        if left > right then
            return 1
        elseif left < right then
            return -1
        end
    end
    -- common prefix is identical: the length decides
    if len_a > len_b then
        return 1
    elseif len_a < len_b then
        return -1
    end
    return 0
end
-- modified from sorters.comparers.basic(str)
--- Comparer that breaks case-only ties in favour of a lowercase initial.
-- The `e` arrays of the splits are compared first, then the `m` arrays;
-- only when both are equal does the `first_lower` flag decide, with the
-- entry whose flag is set sorting first.
-- @param a  entry with a `split` field: either a single split (simple
--           variant) or an array of splits (complex variant)
-- @param b  entry of the same shape as `a`
-- @return   -1 if a sorts before b, 1 if after, 0 if equal
function sorters.comparers.polish(a,b)
    local ea, eb = a.split, b.split
    local na, nb = #ea, #eb
    local fa, fb -- the splits whose first_lower flags break a final tie
    if na == 0 and nb == 0 then
        -- simple variant (single word)
        local result = basicsort(ea.e,eb.e)
        if result == 0 then
            result = basicsort(ea.m,eb.m)
        end
        -- a difference in e (or, failing that, in m) is decisive; the
        -- e result must not be replaced by the m comparison
        if result ~= 0 then
            return result
        end
        fa, fb = ea, eb
    else
        -- complex variant, used in register (multiple words)
        local result = 0
        for i=1,nb < na and nb or na do
            local eai, ebi = ea[i], eb[i]
            result = basicsort(eai.e,ebi.e)
            if result == 0 then
                result = basicsort(eai.m,ebi.m) -- only needed if there are m's
            end
            if result ~= 0 then
                break
            end
        end
        if result ~= 0 then
            return result
        elseif na > nb then
            return 1
        elseif nb > na then
            return -1
        end
        -- same number of words (at least one): first word's flag decides
        fa, fb = ea[1], eb[1]
    end
    local a_lower, b_lower = fa.first_lower, fb.first_lower
    if b_lower and not a_lower then
        return 1
    elseif a_lower and not b_lower then
        return -1
    else
        return 0
    end
end
-- modified from sorters.splitters.utf(str)
--- Split a string into per-character sort keys after lowercasing it,
-- remembering whether the original first character was lowercase.
-- @param str  string to split
-- @return     table with `s` (lowercased characters), `e` and `m`
--             (numeric keys per character) and the boolean `first_lower`
function sorters.splitters.utflower(str)
    -- take the first character before the case information is lost
    local first_char = utf.sub(str,1,1)
    str = utf.lower(str)
    for k=1,#currentreplacements do
        local v = currentreplacements[k]
        -- string.gsub spelled out: no bare `gsub` exists in this file
        str = string.gsub(str,v[1],v[2])
    end
    local s, e, m, n = { }, { }, { }, 0
    for sc in utfcharacters(str) do -- maybe an lpeg
        -- characters without a mapping fall back to their code point
        local ec, mc = currententries[sc], currentmappings[sc] or utfbyte(sc)
        n = n + 1
        s[n] = sc
        e[n] = currentmappings[ec] or mc
        m[n] = mc
    end
    return { s = s, e = e, m = m, first_lower = first_char == utf.lower(first_char) }
end
[-- Attachment #1.2: Type: application/pgp-signature, Size: 198 bytes --]
[-- Attachment #2: Type: text/plain, Size: 486 bytes --]
___________________________________________________________________________________
If your question is of interest to others as well, please add an entry to the Wiki!
maillist : ntg-context@ntg.nl / http://www.ntg.nl/mailman/listinfo/ntg-context
webpage : http://www.pragma-ade.nl / http://tex.aanhet.net
archive : http://foundry.supelec.fr/projects/contextrev/
wiki : http://contextgarden.net
___________________________________________________________________________________
next reply other threads:[~2010-08-18 16:08 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-08-18 16:08 Philipp Gesang [this message]
2010-08-18 22:38 ` Hans Hagen
2010-08-18 22:48 ` Hans Hagen
2010-08-19 8:13 ` Philipp Gesang
2010-08-19 9:41 ` Hans Hagen
2010-08-19 10:35 ` Philipp Gesang
2010-08-19 11:13 ` Hans Hagen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20100818160856.GB13324@aides \
--to=pgesang@ix.urz.uni-heidelberg.de \
--cc=ntg-context@ntg.nl \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).