#!/usr/bin/env perl

# Set up Perl infrastructure
#
# Note: no -w switch on the shebang line. On Linux everything after the
# interpreter path is handed to env as ONE argument, so
# '#!/usr/bin/env perl -w' makes env look for a program literally named
# "perl -w" and the script fails to start. Warnings are enabled by the
# 'use warnings' pragma below instead.
use 5.014;
use utf8;                       # this source file contains UTF-8 literals
use strict;
use warnings;
use warnings FATAL => 'utf8';   # malformed UTF-8 is an error, not a warning
use Encode qw[encode_utf8];

my $License = <<'_________';
## LICENSE

This software is Copyright (c) 2020  by Benct Philip Jonsson.

This is free software, licensed under:
                                       The MIT (X11) License
                                     <http://www.opensource.org/licenses/mit-license.php>
_________

my $Documentation = encode_utf8 <<'_________' . $License;

# build-dictionary.pl

The script assumes that there is a YAML file
'dictionary.yaml' in the current directory which contains
data in one of the forms described below.

The data will be sorted according to a customizable sort
order defined in the script (see SORTING below!) which must
be edited as desired.

Then the data is printed to a Pandoc Markdown pipe table in
'dictionary.md', one Hungarian—English word pair per row.

## DATA FORMAT

The script assumes that dictionary.yaml either is a list of
maps like this

    ---
    - hungarian: HUNGARIAN_WORD
      english:   ENGLISH_WORD
      remark:    REMARK
    ...
    
or a map of lists of maps like this

    ---
    HUNGARIAN_WORD:
      - english: ENGLISH_MEANING_1
        remark:  REMARK
      - english: ENGLISH_MEANING_2
        remark:  REMARK
    OTHER_HUNGARIAN_WORD:
      english: ENGLISH_WORD
      remark:  REMARK
    HUNGARIAN_SORT_KEY:
      - hungarian: HUNGARIAN_WORD_1
        english:   ENGLISH_MEANING_1
        remark:    REMARK
    ...
        
(however for convenience values in the top level map may be
a single map instead of a list of maps!)

Actually all of the fields (hungarian, english, remark) are
optional; they will be filled in with an empty string if
missing!

## SORTING

The script sorts entries according to a custom sort order
defined with the Sort::ArbBiLex module.

<https://metacpan.org/pod/Sort::ArbBiLex>

### IMPORTANT NOTE

Look at the `use Sort::ArbBiLex` statement in the script and
adjust the sort order declaration as you want it!

Briefly put, the sort order declaration is a string where
each line is a "letter" of your alphabet (actually a
"family" of letters) and each "word" in a line is a
"variant" of that letter.  Letters on the same line will
form one group, sorted among themselves according to their
order on the line, and each group is sorted relative to the
other groups according to the order of the lines. The result
is a single list, not a list of lists!

Note that you must define *all* letters in your alphabet,
including lines for punctuation characters and digits if you
want to sort them. Characters in the data which are not
included in the sort order declaration are completely
ignored!

As a contrived example, assume a sort order declaration like this:

    A a Á á
    C c
    CS Cs cs
    E e É é
    O o Ó ó
    Ö ö Ő ő
    S s
    SZ Sz sz SSZ ssz
    T t
    TY Ty ty TTY tty
    U u Ú ú

Here the lines represent the "primary" sort order and the
letters in each line represent the "secondary" sort order.
Letters in the same line will count as "the same" and
letters on different lines will count as different. Thus
"sz" regardless of case sorts after "s" regardless of case
but if two or more words beginning in "sz" differ only in
case "SZ Sz sz" will sort after each other according to
case. Since short and long vowels are on the same line they
are sorted as "identical" and will be "mixed" but since the
long vowels come after the short vowels if two words differ
only in the length of a vowel the word with the short vowel
will come before the word with the long vowel. On the other
hand since "o"-like vowels and "ö"-like vowels are on
different lines they will not be "mixed" with each other.

Now suppose that we have a list with these "words" (well
some of them *are* words but the point is to show how they
will be sorted):

    TYE Tyu ÓSZÉ ŐSÉ ócsa osu ca Sza su cse óce őte ősé csa
    ócu Tye szé ÓCA tyé otyu Sé őssé sa otté ota ocu TÉ őssz
    TE ÓCSA Szé tya ősze ce oty Tyé tye őty OSSÉ ötye ősza
    ötté sze OS ose ottya ÓCSE tu őtu őssa ősszu cu óttyé SU
    ŐSSZA őce TYÉ ocsu öte oss OSU ósszé ószé óca öttyé Te
    ÓSSZÉ ŐSZA ócse OCU OCSU őcse sé ÓCE ÖSSZ őtté ótté
    ÓSSZE őtyu Su ÓSA oté otye SE otya

Sorted with the above rudimentary sort order they
will come in this order:

    ca ce cu csa cse ÓCA óca ÓCE óce OCU ocu ócu ÓCSA ócsa
    ÓCSE ócse OCSU ocsu OS ÓSA ose oss OSSÉ OSU osu ÓSZÉ
    ÓSSZE ÓSSZÉ ószé ósszé ota oté otté ótté ottya óttyé oty
    otya otye otyu őce őcse ŐSÉ ősé őssa őssé ÖSSZ őssz ŐSZA
    ŐSSZA ősza ősze ősszu öte őte ötté őtté öttyé őtu őty
    ötye őtyu sa SE Sé sé SU Su su Sza Szé sze szé TE Te TÉ
    tu tya TYE TYÉ Tye Tyé tye tyé Tyu

Now suppose that we change the sort order so that long and
short vowels come on different lines; the resulting sorted
word list changes significantly:


    ca ce cu csa cse OCU ocu OCSU ocsu OS ose oss OSSÉ OSU
    osu ota oté otté ottya oty otya otye otyu ÓCA óca ÓCE
    óce ócu ÓCSA ócsa ÓCSE ócse ÓSA ÓSSZE ÓSZÉ ÓSSZÉ ószé
    ósszé ótté óttyé ÖSSZ öte ötté öttyé ötye őce őcse ŐSÉ
    ősé őssa őssé őssz ŐSZA ŐSSZA ősza ősze ősszu őte őtté
    őtu őty őtyu sa SE Sé sé SU Su su Sza sze Szé szé TE Te
    TÉ tu tya TYE Tye tye TYÉ Tyé tyé Tyu

## OUTPUT

The script writes a file 'dictionary.md' (which will be
overwritten if it already exists! Make sure to back up!)
This file will contain a Pandoc Markdown pipe table in the
form

    | Hungarian | English | Remark
    |:-----|:-----|:----------
    | WORD_1 | MEANING_1 | REMARK_1
    | WORD_2 | MEANING_2 | REMARK_2

The Markdown table isn't going to be pretty but it is going
to work!

## DEPENDENCIES

At least perl version 5.14. 

Version 5.10.1 or greater may work if you change the
statement `use 5.014;` to `use 5.010001;`.

The following modules must be installed from CPAN (see
<https://www.cpan.org/modules/INSTALL.html> for how!)

-   YAML::PP
-   Sort::ArbBiLex
-   Path::Tiny

_________


# Documentation requests on the command line: --man prints the whole
# manual, -h/--help prints only the introduction plus the license.
# Either way the script exits right after printing.
my $wants_manual = grep { /^--man$/ } @ARGV;
my $wants_help   = grep { /^(?:-h|--help)$/ } @ARGV;

if ( $wants_manual ) {
    print $Documentation;
    exit;
}

if ( $wants_help ) {
    # Cut the manual off at its first '##' heading and put a pointer
    # to --man and the license text there instead
    $Documentation =~ s{^\#\#.*}{Try --man for full documentation!\n\n$License\n}ms;
    print $Documentation;
    exit;
}

# The following modules must be installed from CPAN
# See https://www.cpan.org/modules/INSTALL.html
#
# They are loaded with require/import at run time instead of with `use`
# at compile time. `use` would try to load them before the --help/--man
# handling earlier in the script gets a chance to run, so the
# documentation (which explains how to install these very modules)
# could not be shown on a system where they are still missing.

require YAML::PP;
require Path::Tiny;
Path::Tiny->import( qw[path] );

# Define Hungarian sort order
# See https://metacpan.org/pod/Sort::ArbBiLex
# Only the first few letters and the last shown,
# please replace ... with the rest as appropriate!
#
# The import call creates the sorting function hungarian_sort() from
# the declaration; because that now happens at run time the function
# must be called with parentheses: hungarian_sort(LIST)
require Sort::ArbBiLex;
Sort::ArbBiLex->import( 'hungarian_sort', q(
  A a Á á
  B b
  C c CZ Cz cz
  CS Cs cs
  ...
  ZS Zs zs ZZS zzs
) );

# Options handed to the YAML::PP constructor
my %parser_options = (
    schema  => ['JSON'],
    boolean => 'JSON::PP',
);

# Set up the YAML parser ...
my $ypp = YAML::PP->new(%parser_options);

# ... and let it load the dictionary from the current directory
my $data = $ypp->load_file('dictionary.yaml');

# Normalize $data into a map whose keys are Hungarian words (or sort
# keys) and whose values are entry maps or lists of entry maps, then
# sort the keys. Dies if dictionary.yaml has an unsupported structure.

# An empty YAML document loads as undef: treat it as an empty dictionary
$data //= {}; # / for poor editor!

# Do this if $data is a list of maps
if ( 'ARRAY' eq ref $data ) {
    my %keyed_data;
    for my $item ( @$data ) {
        # A null list item counts as an entry with all fields missing
        $item //= {}; # / for poor editor!
        unless ( 'HASH' eq ref $item ) {
            die "Expected dictionary.yaml to be list of maps or map of (lists of) maps (try --help)";
        }
        # All fields are optional, so an entry without a Hungarian word
        # is keyed under the empty string (rather than under undef,
        # which would only trigger an "uninitialized value" warning)
        my $key = $item->{hungarian} // ""; # / for poor editor!
        # The value for each key is a list of maps
        push @{ $keyed_data{$key} }, $item;
    }
    $data = \%keyed_data; # sic!
}
elsif ( 'HASH' ne ref $data ) {
    # Neither a list nor a map at the top level, e.g. a bare string
    die "Expected dictionary.yaml to be list of maps or map of (lists of) maps (try --help)";
}

# Sort the keys in Hungarian order
my @keys = hungarian_sort(keys %$data);

# Lines of the Markdown pipe table, starting with the header row and
# the column specifier row.
# Not a pretty table, but a working one!
my @rows = (
    "| Hungarian | English | Remark\n",
    "|:-----|:-----|:----------\n",
);

# Turn a raw field value into text which is safe inside a table cell
my $as_cell = sub {
    my ($text) = @_;
    # A missing (null) field becomes an empty cell
    $text //= "";
    # A pipe table row must stay on one line, so every run of
    # whitespace (newlines included) collapses to a single space
    $text =~ s<\s+><\x{20}>g;
    # A literal pipe would end the cell, so write it as an entity
    $text =~ s{\|}{&#124;}g;
    return $text;
};

for my $key ( @keys ) {
    my $entry = $data->{$key};
    # A null value gives no rows, a single map is treated
    # like a list holding just that map
    my @entries
        = !defined $entry       ? ()
        : 'ARRAY' eq ref $entry ? @$entry
        :                         ($entry);
    for my $item ( @entries ) {
        die "Expected dictionary.yaml to be list of maps or map of (lists of) maps (try --help)"
            unless 'HASH' eq ref $item;
        # The key doubles as the Hungarian word when the entry has none
        $item->{hungarian} //= $key;
        my @fields = @{$item}{qw(hungarian english remark)};
        my $row    = join ' | ', map { $as_cell->($_) } @fields;
        push @rows, "| $row\n";
    }
}

# Write the finished table to the markdown file
path('dictionary.md')->spew_utf8(\@rows);

# All done!