#!/usr/bin/env perl

# Try out sorting according to various locales with Unicode::Collate::Locale
# and normalization forms with Unicode::Normalize.
#
# Requires the following CPAN modules to be installed:
#
#   utf8::all
#   Unicode::Collate::Locale
#   Unicode::Normalize
#   Path::Tiny
#   Getopt::Long::Descriptive
#
# See:
#
#   https://www.cpan.org/modules/INSTALL.html
#
# Also requires perl 5.10.1 or later.
#
# If you are on a Unixy system you probably have a new enough perl installed.
# Otherwise see:
#
#   https://www.perl.org/get.html
#
# On Windows I would recommend Strawberry Perl.
#
# This software is copyright (c) 2021 by Benct Philip Jonsson.
#
# This is free software; you can redistribute it and/or modify it under
# the same terms as the Perl 5 programming language system itself.
#
# http://dev.perl.org/licenses/

use 5.010001;

# use utf8;
use utf8::all;
use strict;
use warnings;
use warnings FATAL => 'utf8';
use autodie;

# use open qw[ :utf8 :std ];

use Unicode::Collate::Locale;
use Unicode::Normalize qw[normalize];
use Path::Tiny qw[path];
use Getopt::Long::Descriptive;

# Command-line interface:
#   -l/--locale     one or more collation locales to try (required, repeatable)
#   -n/--normalize  one or more normalization forms, or 0 for none (repeatable)
#   -i/--input      UTF-8 text file whose lines are sorted (required)
#   -o/--output     file to write to instead of stdout (optional)
#   -h/--help       print usage and exit
my ( $opt, $usage ) = describe_options(
    '%c %o',
    [
        'locale|l=s@',
        'A locale to try like "fr" or "fr-CA". Repeatable.',
        +{ required => 1 },
    ],
    [
        'normalize|n=s@',
        'A Unicode Normalization Form according to Unicode::Normalize'
          . ' to apply like NFC or NFD.'
          . ' For unnormalized say -n 0 (zero). Repeatable. Default: NFC.',
        +{ default => ['NFC'] },
    ],
    [
        'input|i=s',
        'Name of text file with lines to sort. Assumed to be UTF-8 encoded.',
        +{ required => 1 },
    ],
    [
        'output|o=s',
        'Name of output file to print to. Optional. Default: stdout.',
    ],
    [
        'help|h',
        'Print help text and exit.',
        +{ shortcircuit => 1 },
    ],
    +{
        show_defaults => 0,
        getopt_conf   => [qw(no_auto_abbrev no_bundling no_ignore_case)],
    },
);

if ( $opt->help ) {
    say "$0: try out sorting according to various locales with Unicode::Collate::Locale and normalization forms with Unicode::Normalize.";
    print $usage->text;
    exit;
}

my $locales = $opt->locale;       # array ref of locale names
my $norms   = $opt->normalize;    # array ref of normalization form names
my $in      = $opt->input;
my $out     = $opt->output;

# Test definedness/length rather than truth, so that an output file
# literally named "0" is not mistaken for "no output file given".
my $fh = ( defined $out and length $out )
  ? path($out)->openw_utf8
  : \*STDOUT;

# Everything printed below goes to the chosen handle.
select $fh;

# Read the lines without their terminators: otherwise an unterminated last
# line would fuse with its neighbour once the list has been reordered.
my @lines = path($in)->lines_utf8( { chomp => 1 } );

for my $locale (@$locales) {
    my $coll = Unicode::Collate::Locale->new( locale => $locale );

    # The locale name is user input, so it is passed as an argument and never
    # interpolated into the format string (a "%" in it would be taken as a
    # format directive).
    printf "Locale: %s; getlocale: %s\n\n", $locale, $coll->getlocale;

    for my $norm (@$norms) {
        print "Normalization: $norm\n\n";

        # A false form name (e.g. "-n 0") means: sort the lines as they are.
        my @normed =
          $norm ? ( map { normalize( $norm, $_ ) } @lines ) : @lines;
        my @sorted = $coll->sort(@normed);

        # One item per line. Interpolating "@sorted" would join the items
        # with a space and so indent every line after the first.
        print "Sorted:\n\n";
        print "$_\n" for @sorted;
        print "\n";
    }
}

# Restore the default handle before closing; with autodie a failed close
# (e.g. a write error surfacing at flush time) is fatal.
select STDOUT;
close $fh;
exit;