polish sorting

From: Philipp Gesang <pgesang@ix.urz.uni-heidelberg.de>
To: ntg-context@ntg.nl
Subject: polish sorting
Date: Wed, 18 Aug 2010 18:08:56 +0200	[thread overview]
Message-ID: <20100818160856.GB13324@aides> (raw)

[-- Attachment #1.1.1: Type: text/plain, Size: 904 bytes --]

Hi,

I'm creating some sorting tables. While researching this topic I
stumbled on the Polish dictionary sorting rules: if two strings are
equal except for case, then the one that begins with a lowercase
letter takes precedence.[1] (This seems to apply to the Swedish order as well but I
have no means to verify that. Apparently, my German dictionary (from
1991) follows the same rule without explicitly stating so.)

Context seems to prefer it the other way round, so I modified two
functions from sort-ini.lua to handle that; but I'm not happy with
this solution.

So my question: is there already, or could we have some mechanism
to influence the details of sorting in context?

Thanks for your help, 
Philipp

[1] <ftp://ftp.gust.org.pl/pub/GUST/bulletin/03/02-bl.pdf>, p. 7.

-- 
()  ascii ribbon campaign - against html e-mail
/\  www.asciiribbon.org   - against proprietary attachments

[-- Attachment #1.1.2: playground.lua --]
[-- Type: text/plain, Size: 2080 bytes --]

--- Testing environment for sorters.
-- Loads the Polish sorter definitions and creates the scratch namespace
-- `document.whatever`, whose `words` list starts out empty.
dofile("polishsort.lua")

document.whatever = { words = { } }

local my = {}

--- Replace every occurrence of any character from a set, using LPeg.
-- @param s     subject string
-- @param patt  string handed to lpeg.S, i.e. each of its characters is
--              a member of the set to be replaced
-- @param repl  replacement, used as the right-hand side of an LPeg `/`
-- @return      whatever lpeg.match yields for the substitution capture
function my.gsub (s, patt, repl)
  local charset      = lpeg.S(patt)
  -- Either substitute a member of the set or copy one character through.
  local substitution = lpeg.Cs((charset / repl + 1)^0)
  return lpeg.match(substitution, s)
end

--- based on http://www.mail-archive.com/ntg-context@ntg.nl/msg47525.html

--- Turn document.whatever.text into a sorted list of word records.
-- Newlines, tabs, vertical tabs and double quotes are replaced by spaces,
-- the result is exploded on runs of spaces, and every non-empty stripped
-- chunk is appended to document.whatever.words as { word = chunk }.
-- Every record in that list then receives a `split` field and the list is
-- sorted with the Polish comparer.
function document.whatever.sorttext()
    local source   = document.whatever.text
    -- alternative splitter: sorters.splitters.utf
    local splitter = sorters.splitters.utflower
    source = my.gsub(source, '\n\t\v"', " ")
    local chunks  = string.explode(source, " +")
    local records = document.whatever.words

    for index = 1, #chunks do
        local stripped = string.strip(chunks[index])
        if stripped ~= "" then
            records[#records + 1] = { word = stripped }
        end
    end

    -- Runs over the whole list, so records that were already present
    -- before this call are split again as well.
    for index = 1, #records do
        local record = records[index]
        record.split = splitter(record.word)
    end

    -- alternative comparer: sorters.comparers.basic
    sorters.sort(records, sorters.comparers.polish)
end

--- Typeset the records in document.whatever.words.
-- A new section, titled with the lowercased first return value of
-- sorters.firstofsplit, is started whenever the second return value
-- differs from that of the preceding record. Each word is printed as
-- "<index>: <word>" followed by a paragraph break, except when it equals
-- the word printed immediately before it.
function document.whatever.flushtext()
    local records = document.whatever.words
    local last_key, last_word = false, false
    for index = 1, #records do
        local record = records[index]
        local letter, key = sorters.firstofsplit(record)
        local heading = utf.lower(letter)
        if key ~= last_key then
            last_key = key
            context.section(heading)
        end

        local text = record.word
        if text ~= last_word then
            context(tostring(index) .. ": " .. text)
            context.par()
            last_word = text
        end
    end
end

--- Run the sorting demo for one language.
-- Stores a two-line sample (the same Polish sentence in lowercase and in
-- capitalised form) in document.whatever.text, selects the sorter
-- language, and typesets the sorted words between starttext/stoptext.
-- @param lang  language tag passed on to sorters.setlanguage
function testrun (lang)
    -- Input files tried during development:
    --f = assert(io.open("anna-utf.txt", "r"))
    --f = assert(io.open("sltext.txt", "r"))
    local sample = [[
polskie słowa dziwnie się szereguje
Polskie Słowa Dziwnie Się Szereguje
]]
    document.whatever.text = sample

    sorters.setlanguage(lang)

    context.starttext()
    document.whatever.sorttext()
    document.whatever.flushtext()
    context.stoptext()
end

testrun("pl")

[-- Attachment #1.1.3: polishsort.lua --]
[-- Type: text/plain, Size: 4997 bytes --]

--- Polish sorting (including the letters q, v, x)

sorters.replacements["pl"] = {}

do
    -- The alphabet in sort order; a letter's position in this list is its
    -- mapping value, and every letter is its own entry.
    local alphabet = {
        "a", "ą", "b", "c", "ć", "d", "e", "ę", "f", "g",
        "h", "i", "j", "k", "l", "ł", "m", "n", "ń", "o",
        "ó", "p", "q", "r", "s", "ś", "t", "u", "v", "w",
        "x", "y", "z", "ź", "ż",
    }

    local entries, mappings = { }, { }
    for position = 1, #alphabet do
        local letter = alphabet[position]
        entries[letter]  = letter
        mappings[letter] = position
    end

    sorters.entries["pl"]  = entries
    sorters.mappings["pl"] = mappings
end

-- Upvalues used by the splitter defined in this file. They are bound to the
-- "pl" tables once, when this file is loaded, and fall back to an empty
-- table when the corresponding "pl" entry is missing.
local  currentreplacements = sorters.replacements["pl"] or {}
local      currentmappings = sorters.mappings["pl"] or {}
local       currententries = sorters.entries["pl"] or {}

-- Local aliases for UTF-8 helpers.
-- NOTE(review): string.utfcharacters and utf.byte are not part of stock Lua;
-- presumably they come from the ConTeXt/LuaTeX environment -- confirm.
local        utfcharacters = string.utfcharacters
local              utfbyte = utf.byte

-- unchanged, needs to be in local scope
--- Three-way comparison of two key arrays.
-- The arrays are compared element by element over their common prefix;
-- the first differing pair decides. If the common prefix is identical,
-- the longer array sorts after the shorter one.
-- @param sort_a  array of mutually comparable values (or nil/false)
-- @param sort_b  array of mutually comparable values (or nil/false)
-- @return 1 if sort_a sorts after sort_b, -1 if before, 0 if equal or if
--         either argument is missing
local function basicsort(sort_a,sort_b)
    if not (sort_a and sort_b) then
        return 0
    end
    local na, nb = #sort_a, #sort_b
    local shared = na < nb and na or nb
    for i=1,shared do
        local ai, bi = sort_a[i], sort_b[i]
        if ai > bi then
            return 1
        elseif ai < bi then
            return -1
        end
    end
    -- Common prefix is identical: length is the only remaining criterion.
    if na > nb then
        return 1
    elseif na < nb then
        return -1
    end
    return 0
end

-- modified from sorters.comparers.basic(str)
-- Final tie-break of the Polish comparer: of two splits that are otherwise
-- equal, the one whose `first_lower` flag is set sorts first.
-- Returns 1, -1 or 0 like basicsort.
local function lowercasefirst(split_a,split_b)
    if split_b.first_lower and not split_a.first_lower then
        return 1
    elseif split_a.first_lower and not split_b.first_lower then
        return -1
    else
        return 0
    end
end

--- Comparer that puts lowercase-initial words before their capitalised twins.
-- Both arguments are records carrying a `split` field. When neither split
-- has array entries it is treated as a single split with `e`, `m` and
-- `first_lower` fields (single word); otherwise the splits are arrays of
-- such tables (multiple words).
-- Keys are compared in this order: `e`, then `m`, then (multi-word only)
-- the number of words, then the `first_lower` flag.
-- @param a  record with a `split` field
-- @param b  record with a `split` field
-- @return 1 if a sorts after b, -1 if before, 0 if equal
function sorters.comparers.polish(a,b)
    local ea, eb = a.split, b.split
    local na, nb = #ea, #eb
    if na == 0 and nb == 0 then
        -- simple variant (single word)
        local result = basicsort(ea.e,eb.e)
        if result == 0 then
            -- Entries tie: let the mappings decide, exactly as the
            -- multi-word branch below does.
            result = basicsort(ea.m,eb.m)
        end
        if result ~= 0 then
            -- A decisive key comparison must be returned as is.
            return result
        end
        return lowercasefirst(ea,eb)
    else
        -- complex variant, used in register (multiple words)
        local result = 0
        for i=1,nb < na and nb or na do
            local eai, ebi = ea[i], eb[i]
            result = basicsort(eai.e,ebi.e)
            if result == 0 then
                result = basicsort(eai.m,ebi.m) -- only needed if there are m's
            end
            if result ~= 0 then
                break
            end
        end
        if result ~= 0 then
            return result
        elseif na > nb then
            return 1
        elseif nb > na then
            return -1
        else
            -- Same number of words (at least one) and all keys equal:
            -- the case of the first word decides.
            return lowercasefirst(ea[1],eb[1])
        end
    end
end

-- modified from sorters.splitters.utf(str)
--- Split a string into per-character sort keys, ignoring case.
-- The string is lowercased, the "pl" replacements are applied in order,
-- and then every UTF-8 character contributes one slot to each result array.
-- @param str  string to split
-- @return table with the fields
--   s            characters of the lowercased, replaced string
--   e            mapping value of each character's entry, falling back to
--                the character's own mapping value
--   m            mapping value of each character, falling back to utfbyte
--   first_lower  true when the first character of the original string is
--                unchanged by utf.lower
function sorters.splitters.utflower(str)
    local first_char  = utf.sub(str,1,1)
    local first_lower = first_char == utf.lower(first_char)
    str = utf.lower(str)
    -- Each replacement is a { pattern, replacement } pair. The bare `gsub`
    -- used previously is not defined anywhere in this file, so the call is
    -- spelled out as string.gsub.
    for k=1,#currentreplacements do
        local v = currentreplacements[k]
        str = string.gsub(str,v[1],v[2])
    end
    local s, e, m, n = { }, { }, { }, 0
    for sc in utfcharacters(str) do -- maybe an lpeg
        local ec = currententries[sc]
        local mc = currentmappings[sc] or utfbyte(sc)
        n = n + 1
        s[n] = sc
        -- Characters without an entry reuse their own mapping value.
        e[n] = ec and currentmappings[ec] or mc
        m[n] = mc
    end
    return { s = s, e = e, m = m, first_lower = first_lower }
end

[-- Attachment #1.2: Type: application/pgp-signature, Size: 198 bytes --]

[-- Attachment #2: Type: text/plain, Size: 486 bytes --]

___________________________________________________________________________________
If your question is of interest to others as well, please add an entry to the Wiki!

maillist : ntg-context@ntg.nl / http://www.ntg.nl/mailman/listinfo/ntg-context
webpage  : http://www.pragma-ade.nl / http://tex.aanhet.net
archive  : http://foundry.supelec.fr/projects/contextrev/
wiki     : http://contextgarden.net
___________________________________________________________________________________