From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.io/gmane.text.pandoc/24869 Path: news.gmane.io!.POSTED.ciao.gmane.io!not-for-mail From: Benct Philip Jonsson Newsgroups: gmane.text.pandoc Subject: Re: A New Feature for Pandoc's Markdown Extension -- No Space with Newline Date: Wed, 15 Apr 2020 17:23:25 +0200 Message-ID: <09a13fd8-db26-b851-b426-2fa7ad96ecf4@gmail.com> References: <20130715175101.GA20541@protagoras.phil.berkeley.edu> <51E56808.5000500@gmail.com> <35356bdb-9f45-4f0c-bb49-3fb4e2db98a0@googlegroups.com> <1beb6ec0-19a5-4da7-b785-ebb7d340c865@googlegroups.com> <5fe78fc8-7050-4342-8d5e-1350b9b06794@googlegroups.com> Reply-To: pandoc-discuss-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="------------B02BC0D378A0EC2B077BDF19" Injection-Info: ciao.gmane.io; posting-host="ciao.gmane.io:159.69.161.202"; logging-data="81058"; mail-complaints-to="usenet@ciao.gmane.io" User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.4.1 To: pandoc-discuss-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org, J Original-X-From: pandoc-discuss+bncBCWMVYEK54FRB46N3T2AKGQENSYBFNQ-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org Wed Apr 15 17:23:36 2020 Return-path: Envelope-to: gtp-pandoc-discuss@m.gmane-mx.org Original-Received: from mail-lj1-f188.google.com ([209.85.208.188]) by ciao.gmane.io with esmtps (TLS1.3:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.92) (envelope-from ) id 1jOjt0-000Kso-9x for gtp-pandoc-discuss@m.gmane-mx.org; Wed, 15 Apr 2020 17:23:34 +0200 Original-Received: by mail-lj1-f188.google.com with SMTP id j2sf695104ljo.22 for ; Wed, 15 Apr 2020 08:23:34 -0700 (PDT) ARC-Seal: i=2; a=rsa-sha256; t=1586964214; cv=pass; d=google.com; s=arc-20160816; b=PgOXCvuFAX989V7s7afm2IwMq73uHDb66+CuurGLqEWa8DiuYIpbbxki1mhaJ2XFKQ IY/DVrHsnMos4IOVsVYrOs9ZV7ZYAzOnbWEr+y4vUbhTtU/s7yRO7DCy69Krr2xGvOd4 ryh3yendOPW+cvzADCyrBqALIueo6KOeWx8WTrUUqkZ9fZ5NjH+e/AHBTTxPuVBdZyav 
iN4gCqWv25Mz8ptCH/2uJe2c8avKDb5gSJMyPqYA0AzCFbZGrwKTGCbYfJ7vWl6XCUKe MNttl24fk+OMurCpkaSizCGIUzLfGkZdN510DZZULJPrNtKsJH9ANc53t0YJBUcLnhAQ wrkQ== ARC-Message-Signature: i=2; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=list-unsubscribe:list-subscribe:list-archive:list-help:list-post :list-id:mailing-list:precedence:reply-to:content-language :in-reply-to:mime-version:user-agent:date:message-id:from:references :to:subject:sender:dkim-signature:dkim-signature; bh=aWR1/dv5h5+UuTJvJVZnsnO0WPBvsdmm7oHwF4BRqGk=; b=FHDTFuvNncgjTw5SnxAyAdotjGVSgxTcBT7+/bzkgKhuHqxosxr0pwLZ/+aSdRgB83 w4VaUxAxUPxu64lYKluhU9BoS6OvwVtvkK9l2vuRwZk6RYhd6N980zDq9ewlrkhlhmW5 rkBrbhShVEM7n4Z2gxiFygT2pFw70kjB0py7+aZvQtoXp7PhrQJvmSPcaPoKPHL2f5oT V8x+9k20ZEdjqU7FPIQVR/wCBD5p+8vQ6+nR+/sSgN+MPkbsM/EuIwNhgTljciyH8wQd oNI759DvdupyMNhAEEHW5G/ArFegK+RrWhmz6fL9ylWOl5fe4bSrXxHc/oOKqlmOHFkz 7o/g== ARC-Authentication-Results: i=2; gmr-mx.google.com; dkim=pass header.i=@gmail.com header.s=20161025 header.b=RfKzKkZD; spf=pass (google.com: domain of melroch-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org designates 2a00:1450:4864:20::131 as permitted sender) smtp.mailfrom=melroch-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org; dmarc=pass (p=NONE sp=QUARANTINE dis=NONE) header.from=gmail.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=googlegroups.com; s=20161025; h=sender:subject:to:references:from:message-id:date:user-agent :mime-version:in-reply-to:content-language:x-original-sender :x-original-authentication-results:reply-to:precedence:mailing-list :list-id:list-post:list-help:list-archive:list-subscribe :list-unsubscribe; bh=aWR1/dv5h5+UuTJvJVZnsnO0WPBvsdmm7oHwF4BRqGk=; b=GbMfOxJXLcHKbCL2Wt+1z5cahzg55hPhCGr5sJr7ysI2adndEMkFReMmnNxogYW1zE EqSf6EfJYAdMWU+KCZw9TimDCpSCnoPM1/kQl9xvhF5zJA4LwKtpO7p6KpTcuJ3EXbU7 VeAa8kJHKWsSJNfpRHnG8ikxYPMmhkwVqbjFN18hMfSoZVNzmB0kGRb4BTbq6SsNsRLy L3mRj/fUhggy4KFUQyGNl4eTbCrLtGQx1BY8rg0qNXj3XqMq6ipYC7c2HU1Nb2mqSFtx 
bCtiAdc7jiGeLCsEAtSxZAyIgG3pE05AqfkaU3VLrWe93R7yqNpKnv+NrpwEMxsj10Zn Fhtw== DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025; h=subject:to:references:from:message-id:date:user-agent:mime-version :in-reply-to:content-language:x-original-sender :x-original-authentication-results:reply-to:precedence:mailing-list :list-id:list-post:list-help:list-archive:list-subscribe :list-unsubscribe; bh=aWR1/dv5h5+UuTJvJVZnsnO0WPBvsdmm7oHwF4BRqGk=; b=hCSMYlLkpYtKnsRcPWqJRSXKBl9GtxWESbgF/0NSt06bVtTYKO/StXnY6hIkUhuey/ a6EXJ5IVqFNeqcjBEY2aFp9+iBdoXq06B/h0yDnOrHy6oYxcoe+XNaWrNjFCVj3rvEoo spHTlclycMGEP9YMecy9KP7g985b5a5VRNc+kdA+TYlqaNHqhaI2emqh8fzCXwVUyndL f1ayLrgvhxuu6GdKooGGvtq91lTeghDVlKF3NsXfbUjDMaa74OJBMYEfZSd6Ztpt3R0X Uge0eFWw2sypSsP/VYM8jymHuzQNPvMT3ce3yA6fX+/ZiEvQdiOxS4texK7Afp+1H+Pe NClQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=sender:x-gm-message-state:subject:to:references:from:message-id :date:user-agent:mime-version:in-reply-to:content-language :x-original-sender:x-original-authentication-results:reply-to :precedence:mailing-list:list-id:x-spam-checked-in-group:list-post :list-help:list-archive:list-subscribe:list-unsubscribe; bh=aWR1/dv5h5+UuTJvJVZnsnO0WPBvsdmm7oHwF4BRqGk=; b=aRTjtz4+lbAK6ahMnApm8eYaLobsr4DdViz6DoJBuk9duKA9vhOcAJbcII+REKcgf2 7VQuaGMzCOJc71a6HP1cQkN4t7fzz56vB3mVOBbCRyjtunEo3MR8cC9S6EsoiYGvL8Lu NzT8M6fjDmoY49+PEqMLcKZ5DMfLF+95SCWs0xOM1u8T9ASgu44GAgIkuVaZ/bcuZD/Q eMrmU8KltTicD0Bfjc404Lqbq5CgvCmm4fN29vN8c0+w3IdO3HuBSdLFrHPCd99jhnbX RmNZKu3dqtjsE9RNBgOdAlW6JfEPYwhAxW6J4TtQVy4MLnbvR8LNYYbwA2fYpkxkHFh2 Original-Sender: pandoc-discuss-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org X-Gm-Message-State: AGi0PubIYnqulMkVDXJBPuQ6R55eerm34JL0Rh2g7E3JbDDMxiFfKDL2 u8l3BzIKlg/wM+USb26vBp4= X-Google-Smtp-Source: APiQypLzwxprag7uKbVVwF+lFhqwjF4fd4pXrKvGUg7JYRyR1BWndMit+1a0FSPzVbLwGtXFxI8l9g== X-Received: by 2002:a19:c64b:: with SMTP id w72mr3361913lff.82.1586964213732; Wed, 15 Apr 2020 08:23:33 -0700 
(PDT) X-BeenThere: pandoc-discuss-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org Original-Received: by 2002:a2e:a550:: with SMTP id e16ls1469226ljn.6.gmail; Wed, 15 Apr 2020 08:23:31 -0700 (PDT) X-Received: by 2002:a2e:5841:: with SMTP id x1mr3625970ljd.86.1586964210029; Wed, 15 Apr 2020 08:23:30 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1586964210; cv=none; d=google.com; s=arc-20160816; b=E8LTvVLcsyWE7RAhC+YqYSSH23L3jB9UOsIklS7rGzqsacW+pYCe4V0/1xdadCO8Fj Eemylm+KAfP5i75MI7Y485HjA2VeZAj0bH+Im9Ak/OXzMFcjWNYqkbMAfpXVo8+TPIYR hkV2a+1clu4QbWobGCIXm86j6CccuSN2Xxi2bVqG3XI/igzuIT15XeGv7WO++KtnuDYa Volw2P1QI4bsNKaENJntzcGI6yhAJR2+CpSYXvXSYgEOMVYO5/zXYvql/ErqwivLgtOy zNNP1CA3LtOhN+UDMev5wVHzhvTJL3qW0tSVYZ4heLyEuApo/2jviuGdR1Iz1tZxYeXf qgmw== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=content-language:in-reply-to:mime-version:user-agent:date :message-id:from:references:to:subject:dkim-signature; bh=08MeAsF7OFCYWwH7Bo+PkALJDMxUYGVyASRirTG3gy8=; b=Jv3dpuInCtKt2cqW59qNAYcu2V1OUhro89e9yF3q1d/m6tGh9outnnGbvTjvSM90Mo TI36oQqI1a8u1vkVAZpDTQJD7X4RvkSWwp0qXmgmrVqXxJ1eVMudQEj+GKIiWztGKc9z gf2n5POdUbC5xEcIsYZxHE0uuCL0QpayH2CW5kMirRQZf2aq45XEQYnItd1UZidL0yLN qFtj9+ZtwRv6pONE1aWSFHYm1Jgdkulfwts2GVF1dZWmUOvRH+9iImHhWb9A2VekBm9b 2DV2YFUINnKPpZ9cmvf6Zg2figorL9zX0SxK4SxHgQt46KMEExxDvEe0Rw5Lc+dpPZYK sQjQ== ARC-Authentication-Results: i=1; gmr-mx.google.com; dkim=pass header.i=@gmail.com header.s=20161025 header.b=RfKzKkZD; spf=pass (google.com: domain of melroch-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org designates 2a00:1450:4864:20::131 as permitted sender) smtp.mailfrom=melroch-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org; dmarc=pass (p=NONE sp=QUARANTINE dis=NONE) header.from=gmail.com Original-Received: from mail-lf1-x131.google.com (mail-lf1-x131.google.com. 
[2a00:1450:4864:20::131]) by gmr-mx.google.com with ESMTPS id a12si151014ljm.2.2020.04.15.08.23.30 for (version=TLS1_3 cipher=TLS_AES_128_GCM_SHA256 bits=128/128); Wed, 15 Apr 2020 08:23:30 -0700 (PDT) Received-SPF: pass (google.com: domain of melroch-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org designates 2a00:1450:4864:20::131 as permitted sender) client-ip=2a00:1450:4864:20::131; Original-Received: by mail-lf1-x131.google.com with SMTP id 198so2938836lfo.7 for ; Wed, 15 Apr 2020 08:23:29 -0700 (PDT) X-Received: by 2002:ac2:4466:: with SMTP id y6mr3417777lfl.125.1586964209555; Wed, 15 Apr 2020 08:23:29 -0700 (PDT) Original-Received: from [192.168.1.122] (user141.77-105-220.netatonce.net. [77.105.220.141]) by smtp.gmail.com with ESMTPSA id t12sm12911299lfq.71.2020.04.15.08.23.26 (version=TLS1_3 cipher=TLS_AES_128_GCM_SHA256 bits=128/128); Wed, 15 Apr 2020 08:23:26 -0700 (PDT) In-Reply-To: <5fe78fc8-7050-4342-8d5e-1350b9b06794-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org> Content-Language: en-US X-Original-Sender: melroch-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org X-Original-Authentication-Results: gmr-mx.google.com; dkim=pass header.i=@gmail.com header.s=20161025 header.b=RfKzKkZD; spf=pass (google.com: domain of melroch-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org designates 2a00:1450:4864:20::131 as permitted sender) smtp.mailfrom=melroch-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org; dmarc=pass (p=NONE sp=QUARANTINE dis=NONE) header.from=gmail.com Precedence: list Mailing-list: list pandoc-discuss-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org; contact pandoc-discuss+owners-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org List-ID: X-Google-Group-Id: 1007024079513 List-Post: , List-Help: , List-Archive: , List-Unsubscribe: , Xref: news.gmane.io gmane.text.pandoc:24869 Archived-At: This is a multi-part message in MIME format. --------------B02BC0D378A0EC2B077BDF19 Content-Type: text/plain; charset="UTF-8"; format=flowed Content-Transfer-Encoding: quoted-printable Perl/JSON filter attached. 
Take care not to overwrite your original files as this is barely tested=20 on a single line of text with mixed Hanzi/Latin letters. Usage instructions and installation hints in the file (Below=20 $DOCUMENTATION). On 2020-04-15 07:57, J wrote: > Please don't worry about CPAN. Google will help and I am willing to try t= he > steps needed. :D >=20 > On Wednesday, April 15, 2020 at 12:07:47 AM UTC+8, BPJ wrote: >> >> Are you conversant with perl and CPAN? >> If not what operating system(s) do you use (Windows/Mac/Linux)? >> >> I ask because if the answer to the first question is no I may have to >> guide you through installing some stuff, including perl itself if the >> answer to the second question is Windows. >> >> Den tis 14 apr. 2020 16:13J > skrev: >> >>> Thank sounds perfect ! Many thanks for your efforts ! >>> >>> On Tuesday, April 14, 2020 at 1:18:17 PM UTC+8, BP wrote: >>>> >>>> A Perl filter which removes Space and SoftBreak elements sandwiched >>>> between two Str elements which respectively ends and starts with a >>>> character with Unicode script property CJK is certainly doable. Will t= hat >>>> be OK? >>>> >>>> /BPJ >>>> >>>> >>>> Den tis 14 apr. 2020 02:39J skrev: >>>> >>>>> Thank you for your efforts very much ! I wonder if the script can kee= p >>>>> the spaces inside English words, digits, and punctuation, since my fi= les >>>>> also contain short groups of English words and number with digits ? >>>>> >>>>> On Tuesday, April 14, 2020 at 3:16:40 AM UTC+8, BP wrote: >>>>>> >>>>>> Wow that script is really ancient! I'll try to port it to a Lua filt= er >>>>>> tomorrow. It's 9 PM here now and I have been coding or writing for t= welve >>>>>> hours, so I'm quite exhausted. >>>>>> >>>>>> Just to be clear, the old script removes all spaces which are next t= o >>>>>> a "string" element, i.e. all "words", digits and punctuation alike, = and not >>>>>> just CJK characters. 
If you are OK with that behavior porting it to = a Lua >>>>>> filter will be trivial, and Lua is built-in in Pandoc. Otherwise I'l= l have >>>>>> to look into rewriting the Perl script, which may be not quite as tr= ivial. >>>>>> >>>>>> /BPJ >>>>>> >>>>>> Den m=C3=A5n 13 apr. 2020 20:45J skrev: >>>>>> >>>>>>> Could you help to update zapspace.pl to work with pandoc 2.9.2.1 ? = I >>>>>>> have Chinese markdown files that use spaces to separate groups of w= ords, >>>>>>> and would like to ignore spaces between Chinese characters before >>>>>>> converting to Word. >>>>>>> Many thanks ! >>>>>>> >>>>>>> On Tuesday, July 16, 2013 at 11:34:32 PM UTC+8, BP Jonsson wrote: >>>>>>>> >>>>>>>> 2013-07-15 19:51, John MacFarlane skrev: >>>>>>>>> +++ Bill Chen (CHEN, Zhechuan) [Jul 15 13 17:16 ]: >>>>>>>>>> Have found a way to make this feature done. >>>>>>>>>> Just add "\n" at the last of the line >>>>>>>>> >>>>>>>>> This would violate the general rule that backslashes before >>>>>>>> letters in >>>>>>>>> markdown are just literal backslashes. >>>>>>>>> >>>>>>>>> I think that a better approach would be to provide a markdown >>>>>>>>> extension like the current 'hard_line_breaks': perhaps >>>>>>>>> 'ignore_line_breaks'. 'hard_line_breaks' causes line >>>>>>>>> breaks in a paragraph to be interpreted as hard breaks; >>>>>>>>> 'ignore_line_breaks' would cause them to be ignored entirely. >>>>>>>>> (One of these would have to be designated as taking precedence >>>>>>>>> if both were selected.) >>>>>>>>> >>>>>>>>> John >>>>>>>>> >>>>>>>> >>>>>>>> The attached perl script, when used as a filter on pandoc's >>>>>>>> json output, should enable Bill to get what he wants. I have >>>>>>>> used an earlier version on Tibetan text with satisfactory >>>>>>>> results. Someone who knows Haskell could probably write >>>>>>>> something shorter which interacts with pandoc in a more >>>>>>>> elegant way, but this script works. 
>>>>>>>> >>>>>>>> The description inside the file reads as follows: >>>>>>>> >>>>>>>> FILE: zapspace.pl >>>>>>>> >>>>>>>> USAGE: pandoc -w json some.markdown | zapspace.pl | pandoc >>>>>>>> -r json >>>>>>>> >>>>>>>> DESCRIPTION: Takes as input a document in pandoc's json format a= nd >>>>>>>> removes all "Space" elements inside any list which >>>>>>>> also >>>>>>>> contains any {"Str":"..."} element, and outputs a >>>>>>>> modified json document, which when given as input t= o >>>>>>>> pandoc will produce output suitable for languages >>>>>>>> which >>>>>>>> don't put spaces between words or sentences, with n= o >>>>>>>> spaces >>>>>>>> inside paragraphs -- unless you insert non-breaking >>>>>>>> spaces, >>>>>>>> see below! --, and notably spaces caused by linebre= aks >>>>>>>> in the markdown paragraph will be removed. >>>>>>>> >>>>>>>> Additionally it does two things which allow you to >>>>>>>> insert whitespace inside paragraph-like elements: >>>>>>>> >>>>>>>> 1) It replaces any non-breaking space (U+00A0) ins= ide >>>>>>>> a >>>>>>>> "Str" element with ordinary soft spaces (U+0020= ) >>>>>>>> *if* the "Str" element also contains characters >>>>>>>> other >>>>>>>> than non-breaking spaces. >>>>>>>> >>>>>>>> This allows you to insert spaces into your >>>>>>>> markdown >>>>>>>> paragraphs as non-breaking spaces (in pandoc >>>>>>>> notation >>>>>>>> a backslash followed by an ordinary space "like= \ >>>>>>>> this") >>>>>>>> and get ordinary spaces in your output. >>>>>>>> >>>>>>>> 2) Preserves any "Str" element which only contains >>>>>>>> one >>>>>>>> or more non-breaking spaces as is. >>>>>>>> >>>>>>>> This allows you to put non-breaking spaces betw= een >>>>>>>> words by inserting ordinary whitespace -- which >>>>>>>> will >>>>>>>> be removed -- on either side of the non-breakin= g >>>>>>>> spaces "like \ this". >>>>>>>> ^ ^ >>>>>>>> >>>>>>>> N.B. that this is *not* done by scanning the JSON t= ext >>>>>>>> with regular expressions! 
The JSON is loaded into = a >>>>>>>> perl data structure which is modified and then >>>>>>>> converted >>>>>>>> back into JSON. Precautions are taken not to modify >>>>>>>> the >>>>>>>> structure such that the output will be rejected by >>>>>>>> pandoc, nor to modify code elements, but I can't >>>>>>>> guarantee >>>>>>>> that this will remain true with future versions of >>>>>>>> pandoc, >>>>>>>> or that it is true for any input. >>>>>>>> >>>>>>>> OPTIONS: --- >>>>>>>> REQUIREMENTS: * A reasonably recent version of perl. >>>>>>>> * The following CPAN modules: >>>>>>>> >>>>>>>> - [JSON::Any]( >>>>>>>> https://metacpan.org/module/JSON::Any) >>>>>>>> + A JSON 'backend' module like JSON or >>>>>>>> JSON::XS. >>>>>>>> - [List::MoreUtils]( >>>>>>>> https://metacpan.org/module/List::MoreUtils) >>>>>>>> - [autovivification]( >>>>>>>> https://metacpan.org/module/autovivification) >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> --=20 >>>>>>> You received this message because you are subscribed to the Google >>>>>>> Groups "pandoc-discuss" group. >>>>>>> To unsubscribe from this group and stop receiving emails from it, >>>>>>> send an email to pandoc-...-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org >>>>>>> To view this discussion on the web visit >>>>>>> https://groups.google.com/d/msgid/pandoc-discuss/35356bdb-9f45-4f0c= -bb49-3fb4e2db98a0%40googlegroups.com >>>>>>> >>>>>>> . >>>>>>> >>>>>> --=20 >>>>> You received this message because you are subscribed to the Google >>>>> Groups "pandoc-discuss" group. >>>>> To unsubscribe from this group and stop receiving emails from it, sen= d >>>>> an email to pandoc-...-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org >>>>> To view this discussion on the web visit >>>>> https://groups.google.com/d/msgid/pandoc-discuss/1beb6ec0-19a5-4da7-b= 785-ebb7d340c865%40googlegroups.com >>>>> >>>>> . >>>>> >>>> --=20 >>> You received this message because you are subscribed to the Google Grou= ps >>> "pandoc-discuss" group. 
>>> To unsubscribe from this group and stop receiving emails from it, send = an >>> email to pandoc-...-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org . >>> To view this discussion on the web visit >>> https://groups.google.com/d/msgid/pandoc-discuss/b3c84390-28d9-4962-909= a-43eceab09108%40googlegroups.com >>> >>> . >>> >> >=20 --=20 You received this message because you are subscribed to the Google Groups "= pandoc-discuss" group. To unsubscribe from this group and stop receiving emails from it, send an e= mail to pandoc-discuss+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org To view this discussion on the web visit https://groups.google.com/d/msgid/= pandoc-discuss/09a13fd8-db26-b851-b426-2fa7ad96ecf4%40gmail.com. --------------B02BC0D378A0EC2B077BDF19 Content-Type: application/x-perl; name="zapspace-cjk.pl" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="zapspace-cjk.pl"
#!/usr/bin/env perl

use 5.014;
use utf8;
use strict;
use warnings;
use warnings FATAL => 'utf8';
use autodie;

# Usage text (Markdown), printed when the script is run without arguments.
# NOTE(review): the empty first list item under INSTALLATION, the empty
# RUNNING PANDOC FILTERS section and the bare author name look as if link or
# address text was lost in transit -- restore from the author's copy if
# available.
my $DOCUMENTATION = <<'------';
# NAME

zapspace-cjk.pl - Pandoc filter which removes whitespace between East Asian characters.

# SYNOPSIS

    pandoc -F zapspace-cjk.pl [OPTIONS] -o OUTPUTFILE INPUTFILE...

# REQUIREMENTS

- perl >= 5.14
- JSON (Perl module!)
- Data::Rmap

# INSTALLATION

-
- Run the following commands:

        cpan App::cpanminus
        cpanm Perl::PrereqScanner
        scan-perl-prereqs zapspace-cjk.pl | cpanm

# RUNNING PANDOC FILTERS

# PRINTING THE DOCUMENTATION

    perl zapspace-cjk.pl | pandoc ...

# WARNINGS

Not tested since I don't have any suitable data myself!

Take care not to overwrite your original files!

# AUTHOR

Benct Philip Jonsson

# COPYRIGHT AND LICENSE

This software is Copyright (c) 2020 by Benct Philip Jonsson.

This is free software, licensed under:

  The MIT (X11) License
------

# Without any argument there is nothing to filter: show the documentation
# and stop.
unless (@ARGV) {
    say $DOCUMENTATION;
    exit;
}

use JSON;
use Data::Rmap qw[ rmap_array ];

# Apparently there is no single Unicode property which
# covers everything East Asian,
# and we have to consider combining marks too.
# I hope I got everything which may reasonably occur in modern texts covered here!
#
# The lookahead requires the next code point to belong to one of the listed
# scripts/blocks; \X then consumes the whole extended grapheme cluster, so
# any combining marks attached to that character are taken along.
my $cjk_re = qr{
    (?= [\p{Han}\p{Hangul}\p{Katakana}\p{Hiragana}\p{Bopomofo}\p{In_CJK_Symbols}] )
    \X
}msx;

# Anchor with \A and \z rather than ^ and $: under /m the latter also match at
# line boundaries inside the string (and $ before a trailing newline), which
# is not "the string starts/ends with a CJK character".
my $cjk_end   = qr{ $cjk_re \z }msx;
my $cjk_start = qr{ \A $cjk_re }msx;

# Shape of a tag name as accepted here: one or more capitalised words run
# together, e.g. "Str" or "SoftBreak".
my $tag_re = qr{ \A (?: [A-Z]+ [a-z]+ )+ \z }msx;

# is_elem( $thing, @tags )
#
# Check whether $thing looks like an AST element, i.e. a hash reference with
# a plain-string "t" key matching $tag_re and, if it has any other key, also
# a "c" key.
#
# Returns:
#   - an empty list/undef if $thing is not such an element;
#   - the element's tag if no @tags were given, or if the tag equals one
#     of @tags;
#   - 0 (defined but false) if @tags were given and none of them matches.
#
# Undefined entries in @tags never match.  The arguments are copied, so the
# caller's variables are never modified.
sub is_elem {
    my ( $elem, @wanted ) = @_;

    return unless 'HASH' eq ref $elem;

    my $tag = $elem->{t};
    return unless defined $tag and !ref $tag and $tag =~ $tag_re;

    # More keys than just "t" but no "c": not an element as understood here.
    return if 1 < keys %$elem and !exists $elem->{c};

    # Any tag is OK
    return $tag unless @wanted;

    # Do we want some specific tag(s)?
    for my $want (@wanted) {
        next unless defined $want;
        return $tag if $want eq $tag;
    }
    return 0;    # Defined but false
}

# _str_text( $thing )
#
# Return the text of $thing if it is a "Str" element whose "c" value is a
# defined non-reference scalar; otherwise return undef (empty list in list
# context).
sub _str_text {
    my ($elem) = @_;

    return unless is_elem( $elem, 'Str' );

    my $text = $elem->{c};
    return unless defined $text and !ref $text;
    return $text;
}

# The utf8 option makes decode() expect and encode() produce UTF-8 octets;
# canonical makes encode() emit hash keys in sorted order.
my $json = JSON->new->utf8->canonical;

# Drop the first command line argument (pandoc passes the target format to a
# JSON filter) so that <> reads STDIN, or any remaining file arguments,
# rather than trying to open a file named after the format.
my $format = shift @ARGV;

# Slurp the whole JSON document.
my $data = do { local $/; <> };
my $ast  = $json->decode($data);

# Visit every array reference in the AST ($_ is the array reference) and
# build a copy without those Space/SoftBreak elements which sit directly
# between a Str ending in a CJK character and a Str starting with one.
rmap_array {
    my $list = $_;
    my $last = $#$list;
    my @kept;

    ELEM:
    for my $i ( 0 .. $last ) {
        my $elem = $list->[$i];

        # Only a separator with a neighbour on both sides can be dropped.
        if ( $i > 0 and $i < $last and is_elem( $elem, qw[ Space SoftBreak ] ) ) {
            my $before = _str_text( $list->[ $i - 1 ] );
            my $after  = _str_text( $list->[ $i + 1 ] );

            next ELEM
              if defined $before
              and defined $after
              and $before =~ $cjk_end
              and $after  =~ $cjk_start;
        }

        push @kept, $elem;
    }

    # Swap in the filtered copy; an empty result leaves the array untouched.
    $_ = \@kept if @kept;    # If not empty
} $ast;

print $json->encode($ast);

--------------B02BC0D378A0EC2B077BDF19--