#!/usr/bin/env perl

# build-dictionary.pl
#
# Reads 'dictionary.yaml' from the current directory, sorts the entries
# with a custom (Hungarian) sort order and writes them as a Pandoc
# Markdown pipe table to 'dictionary.md'.
#
# Options:
#   --man        print the full documentation and exit
#   -h, --help   print a short summary plus the license and exit
#
# Note: no '-w' on the shebang line. Many systems pass everything after
# the interpreter path as a single argument, so 'env' would look for a
# program called 'perl -w'. 'use warnings' below covers this script.

# Set up Perl infrastructure
use 5.014;
use utf8;
use strict;
use warnings;
use warnings FATAL => 'utf8';
use Encode qw[encode_utf8];

my $License = <<'_________';
## LICENSE

This software is Copyright (c) 2020 by Benct Philip Jonsson.

This is free software, licensed under:

  The MIT (X11) License
_________

# The documentation is encoded here so that it can be printed as is.
my $Documentation = encode_utf8 <<'_________' . $License;
# build-dictionary.pl

The script assumes that there is a YAML file 'dictionary.yaml' in the
current directory which contains data in one of the forms described
below. The data will be sorted according to a customizable sort order
defined in the script (see SORTING below!) which must be edited as
desired. Then the data is printed to a Pandoc Markdown pipe table in
'dictionary.md', one Hungarian—English word pair per row.

## DATA FORMAT

The script assumes that dictionary.yaml either is a list of maps like
this

    ---
    - hungarian: HUNGARIAN_WORD
      english: ENGLISH_WORD
      remark: REMARK
    ...

or a map of lists of maps like this

    ---
    HUNGARIAN_WORD:
      - english: ENGLISH_MEANING_1
        remark: REMARK
      - english: ENGLISH_MEANING_2
        remark: REMARK
    OTHER_HUNGARIAN_WORD:
      english: ENGLISH_WORD
      remark: REMARK
    HUNGARIAN_SORT_KEY:
      - hungarian: HUNGARIAN_WORD_1
        english: ENGLISH_MEANING_1
        remark: REMARK
    ...

(however for convenience values in the top level map may be a single
map instead of a list of maps!)

Actually all of the fields (hungarian, english, remark) are optional;
they will be filled in with an empty string if missing!

## SORTING

The script sorts entries according to a custom sort order defined with
the Sort::ArbBiLex module.

### IMPORTANT NOTE

Look at the `use Sort::ArbBiLex` statement in the script and adjust the
sort order declaration as you want it!

Briefly put the sort order declaration is a string where each line is a
"letter" of your alphabet (actually a "family" of letters) and each
"word" in a line is a "variant" of that letter. Letters on the same
line will form one group, sorted among themselves according to their
order on the line, and each group is sorted relative to the other
groups according to the order of the lines. The result is a single
list, not a list of lists!

Note that you must define *all* letters in your alphabet, including
lines for punctuation characters and digits if you want to sort them.
Characters in the data which are not included in the sort order
declaration are completely ignored!

As a contrived example assume a sort order declaration like this:

    A a Á á
    C c
    CS Cs cs
    E e É é
    O o Ó ó
    Ö ö Ő ő
    S s
    SZ Sz sz SSZ ssz
    T t
    TY Ty ty TTY tty
    U u Ú ú

Here the lines represent the "primary" sort order and the letters in
each line represent the "secondary" sort order. Letters in the same
line will count as "the same" and letters on different lines will count
as different. Thus "sz" regardless of case sorts after "s" regardless
of case but if two or more words beginning in "sz" differ only in case
"SZ Sz sz" will sort after each other according to case. Since short
and long vowels are on the same line they are sorted as "identical" and
will be "mixed" but since the long vowels come after the short vowels
if two words differ only in the length of a vowel the word with the
short vowel will come before the word with the long vowel. On the other
hand since "o"-like vowels and "ö"-like vowels are on different lines
they will not be "mixed" with each other.

Now suppose that we have a list with these "words" (well some of them
*are* words but the point is to show how they will be sorted):

    TYE Tyu ÓSZÉ ŐSÉ ócsa osu ca Sza su cse
    óce őte ősé csa ócu Tye szé ÓCA tyé otyu
    Sé őssé sa otté ota ocu TÉ őssz TE ÓCSA
    Szé tya ősze ce oty Tyé tye őty OSSÉ ötye
    ősza ötté sze OS ose ottya ÓCSE tu őtu őssa
    ősszu cu óttyé SU ŐSSZA őce TYÉ ocsu öte oss
    OSU ósszé ószé óca öttyé Te ÓSSZÉ ŐSZA ócse OCU
    OCSU őcse sé ÓCE ÖSSZ őtté ótté ÓSSZE őtyu Su
    ÓSA oté otye SE otya

Sorted with the above rudimentary sort order they will come in this
order:

    ca ce cu csa cse ÓCA óca ÓCE óce OCU
    ocu ócu ÓCSA ócsa ÓCSE ócse OCSU ocsu OS ÓSA
    ose oss OSSÉ OSU osu ÓSZÉ ÓSSZE ÓSSZÉ ószé ósszé
    ota oté otté ótté ottya óttyé oty otya otye otyu
    őce őcse ŐSÉ ősé őssa őssé ÖSSZ őssz ŐSZA ŐSSZA
    ősza ősze ősszu öte őte ötté őtté öttyé őtu őty
    ötye őtyu sa SE Sé sé SU Su su Sza
    Szé sze szé TE Te TÉ tu tya TYE TYÉ
    Tye Tyé tye tyé Tyu

Now suppose that we change the sort order so that long and short vowels
come on different lines; the resulting sorted word list changes
significantly:

    ca ce cu csa cse OCU ocu OCSU ocsu OS
    ose oss OSSÉ OSU osu ota oté otté ottya oty
    otya otye otyu ÓCA óca ÓCE óce ócu ÓCSA ócsa
    ÓCSE ócse ÓSA ÓSSZE ÓSZÉ ÓSSZÉ ószé ósszé ótté óttyé
    ÖSSZ öte ötté öttyé ötye őce őcse ŐSÉ ősé őssa
    őssé őssz ŐSZA ŐSSZA ősza ősze ősszu őte őtté őtu
    őty őtyu sa SE Sé sé SU Su su Sza
    sze Szé szé TE Te TÉ tu tya TYE Tye
    tye TYÉ Tyé tyé Tyu

## OUTPUT

The script writes a file 'dictionary.md' (which will be overwritten if
it already exists! Make sure to back up!) This file will contain a
Pandoc Markdown pipe table in the form

    | Hungarian | English | Remark
    |:-----|:-----|:----------
    | WORD_1 | MEANING_1 | REMARK_1
    | WORD_2 | MEANING_2 | REMARK_2

The Markdown table isn't going to be pretty but it is going to work!

## DEPENDENCIES

At least perl version 5.14. Version 5.10.1 or greater may work if you
change the statement `use 5.014;` to `use 5.010001;`.

The following modules must be installed from CPAN
(see <https://www.cpan.org/modules/INSTALL.html> for how!)

-   YAML::PP
-   Sort::ArbBiLex
-   Path::Tiny

_________

# Handle the documentation options before doing any real work.
if ( grep { /^--man$/ } @ARGV ) {
    print $Documentation;
    exit;
}
elsif ( grep { /^(?:-h|--help)$/ } @ARGV ) {
    # Replace everything from the first '##' heading to the end with a
    # pointer to --man plus the license, keeping only the introduction.
    $Documentation
      =~ s{^\#\#.*}{Try --man for full documentation!\n\n$License\n}ms;
    print $Documentation;
    exit;
}

# The following modules must be installed from CPAN
# See https://www.cpan.org/modules/INSTALL.html
# ('use' is executed at compile time, so these are loaded even when only
# --man or --help is requested.)
use YAML::PP;
use Path::Tiny qw[path];

# Define Hungarian sort order
# See https://metacpan.org/pod/Sort::ArbBiLex
# Only the first few letters and the last shown,
# please replace ... with the rest as appropriate!
# NOTE(review): each line below is one letter "family"; check that the
# grouping of variants per line is what you want before relying on it.
use Sort::ArbBiLex 'hungarian_sort', q(
 A a Á á
 B b
 C c CZ Cz cz
 CS Cs cs
 ...
 ZS Zs zs ZZS zzs
);

# Message used whenever the YAML data does not have a supported shape.
# No trailing newline, so die() appends the script name and line number.
my $Shape_error
  = "Expected dictionary.yaml to be list of maps or map of (lists of) maps (try --man)";

# Instantiate YAML parser
my $ypp = YAML::PP->new(
    schema  => ['JSON'],
    boolean => 'JSON::PP',
);

# Read in YAML data
my $data = $ypp->load_file('dictionary.yaml');

if ( 'ARRAY' eq ref $data ) {
    # A list of maps: regroup it into a map from Hungarian word to a
    # list of maps, so that both input forms are handled the same way.
    my %keyed_data;
    for my $item (@$data) {
        die $Shape_error unless 'HASH' eq ref $item;

        # The hungarian field is optional; fall back to an empty key
        # rather than using undef as a hash key.
        my $key = $item->{hungarian} // "";    # / for poor editor!
        push @{ $keyed_data{$key} }, $item;
    }
    $data = \%keyed_data;    # sic!
}
elsif ( 'HASH' ne ref $data ) {
    # Neither a list nor a map (e.g. an empty file or a lone scalar)
    die $Shape_error;
}

# Sort the keys in Hungarian order
my @keys = hungarian_sort( keys %$data );

# The rows of the Markdown table are collected here
# The table isn't going to be pretty but it is going to work!
my @rows = (
    # Headers
    "| Hungarian | English | Remark\n",
    # Column specifier
    "|:-----|:-----|:----------\n",
);

for my $key (@keys) {
    # Make sure we have a list of maps:
    # a null value becomes an empty list, a single map a one-item list.
    my $items = $data->{$key} // [];    # / for poor editor!
    $items = [$items] unless 'ARRAY' eq ref $items;

    for my $item (@$items) {
        die $Shape_error unless 'HASH' eq ref $item;

        # Insert Hungarian word if undefined
        $item->{hungarian} //= $key;    # / for poor editor!

        my ( $hun, $eng, $rem ) = @{$item}{qw(hungarian english remark)};

        # Fix up values for table ($val aliases each variable in turn)
        for my $val ( $hun, $eng, $rem ) {
            # Handle undefined (null) value
            $val //= "";

            # Replace all sequences of whitespace with a single space
            # because pipe tables must be one line per row
            $val =~ s<\s+><\x{20}>g;

            # Escape pipes in value with a backslash so that they are
            # not taken for column separators
            $val =~ s{\|}{\\|}g;
        }

        # Add table row
        push @rows, "| $hun | $eng | $rem\n";
    }
}

# Dump table rows to markdown file
path('dictionary.md')->spew_utf8( \@rows );

# Done!