PHP 中的平衡自动换行(最小粗糙度)


Balanced word wrap (Minimum raggedness) in PHP

我将在PHP中制作一个自动换行算法。我想将小块文本(短短语(拆分为最多m 个字符的 n 行(没有给出 n,因此会根据需要有尽可能多的行(。特点是行长(以字符为单位(必须尽可能跨行平衡。

输入文本示例:

How to do things

错误的输出(这是正常的自动换行行为(,m=6

How to
do
things

所需输出,始终 m=6

How 
to do 
things

没有人对如何实现此功能有建议或指南?基本上,我正在搜索在两三行(尽可能多(等长行上漂亮的印刷短短语


更新:看来我正在搜索最小粗糙度自动换行算法。但是我找不到任何真正的编程语言实现(任何人,然后我可以在 PHP 中转换它(。


更新2:我为此开始了赏金。是否有可能在任何过程语言中不存在最小粗糙度算法的任何公共实现?我需要以可以转化为程序说明的方式编写一些东西。我现在能找到的只是一大堆(通用(方程,但是需要一个最佳的搜索过程。我还将感谢只能近似最佳搜索算法的实现。

我已经在与 Alex 相同的行上实现了,对维基百科算法进行编码,但直接使用 PHP(对我来说是一个有趣的练习(。了解如何使用最优成本函数f(j(,即"递归"部分,并不是很容易。感谢 Alex 提供注释良好的代码。

/** 
 * minimumRaggedness
 *
 * @param string $input paragraph. Each word separed by 1 space.
 * @param int $LineWidth the max chars per line.
 * @param string $lineBreak wrapped lines separator.
 * 
 * @return string $output the paragraph wrapped.
 */
function minimumRaggedness($input, $LineWidth, $lineBreak = "'n")
{
    $words = explode(" ", $input);
    $wsnum = count($words);
    $wslen = array_map("strlen", $words);
    $inf = 1000000; //PHP_INT_MAX;
    // keep Costs
    $C = array();
    for ($i = 0; $i < $wsnum; ++$i)
    {
        $C[] = array();
        for ($j = $i; $j < $wsnum; ++$j)
        {
            $l = 0;
            for ($k = $i; $k <= $j; ++$k)
                $l += $wslen[$k];
            $c = $LineWidth - ($j - $i) - $l;
            if ($c < 0)
                $c = $inf;
            else
                $c = $c * $c;
            $C[$i][$j] = $c;
        }
    }
    // apply recurrence
    $F = array();
    $W = array();
    for ($j = 0; $j < $wsnum; ++$j)
    {
        $F[$j] = $C[0][$j];
        $W[$j] = 0;
        if ($F[$j] == $inf)
        {
            for ($k = 0; $k < $j; ++$k)
            {
                $t = $F[$k] + $C[$k + 1][$j];
                if ($t < $F[$j])
                {
                    $F[$j] = $t;
                    $W[$j] = $k + 1;
                }
            }
        }
    }
    // rebuild wrapped paragraph
    $output = "";
    if ($F[$wsnum - 1] < $inf)
    {
        $S = array();
        $j = $wsnum - 1;
        for ( ; ; )
        {
            $S[] = $j;
            $S[] = $W[$j];
            if ($W[$j] == 0)
                break;
            $j = $W[$j] - 1;
        }
        $pS = count($S) - 1;
        do
        {
            $i = $S[$pS--];
            $j = $S[$pS--];
            for ($k = $i; $k < $j; $k++)
                $output .= $words[$k] . " ";
            $output .= $words[$k] . $lineBreak;
        }
        while ($j < $wsnum - 1);
    }
    else
        $output = $input;
    return $output;
}

?>

快速而肮脏,在 c++ 中

#include <sstream>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <memory.h>
using namespace std;
int cac[1000][1000];
string res[1000][1000];
vector<string> words;
int M;
int go(int a, int b){
    if(cac[a][b]>= 0) return cac[a][b];
    if(a == b) return 0;
    int csum = -1;
    for(int i=a; i<b; ++i){
    csum += words[i].size() + 1;
    }
    if(csum <= M || a == b-1){
    string sep = "";
        for(int i=a; i<b; ++i){
            res[a][b].append(sep);
            res[a][b].append(words[i]);
            sep = " ";
    }
    return cac[a][b] = (M-csum)*(M-csum);
    }
    int ret = 1000000000;
    int best_sp = -1;
    for(int sp=a+1; sp<b; ++sp){
    int cur = go(a, sp) + go(sp,b);
    if(cur <= ret){
        ret = cur;
        best_sp = sp;
    }
    }
    res[a][b] = res[a][best_sp] + "'n" + res[best_sp][b];
    return cac[a][b] = ret;
}

int main(int argc, char ** argv){
    memset(cac, -1, sizeof(cac));
    M = atoi(argv[1]);
    string word;
    while(cin >> word) words.push_back(word);
    go(0, words.size());
    cout << res[0][words.size()] << endl;
}

测试:

$ echo "The quick brown fox jumps over a lazy dog" |./a.out 10
The quick
brown fox
jumps over
a lazy dog

编辑:刚刚查看了维基百科页面,以获得最小的破烂自动换行。将算法更改为给定的算法(带有平方惩罚(

C 版本:

// This is a direct implementation of the minimum raggedness word wrapping
// algorithm from http://en.wikipedia.org/wiki/Word_wrap#Minimum_raggedness
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <limits.h>
const char* pText = "How to do things";
int LineWidth = 6;
int WordCnt;
const char** pWords;
int* pWordLengths;
int* pC;
int* pF;
int* pW;
int* pS;
int CountWords(const char* p)
{
  int cnt = 0;
  while (*p != ''0')
  {
    while (*p != ''0' && isspace(*p)) p++;
    if (*p != ''0')
    {
      cnt++;
      while (*p != ''0' && !isspace(*p)) p++;
    }
  }
  return cnt;
}
void FindWords(const char* p, int cnt, const char** pWords, int* pWordLengths)
{
  while (*p != ''0')
  {
    while (*p != ''0' && isspace(*p)) p++;
    if (*p != ''0')
    {
      *pWords++ = p;
      while (*p != ''0' && !isspace(*p)) p++;
      *pWordLengths++ = p - pWords[-1];
    }
  }
}
void PrintWord(const char* p, int l)
{
  int i;
  for (i = 0; i < l; i++)
    printf("%c", p[i]);
}
// 1st program's argument is the text
// 2nd program's argument is the line width
int main(int argc, char* argv[])
{
  int i, j;
  if (argc >= 3)
  {
    pText = argv[1];
    LineWidth = atoi(argv[2]);
  }
  WordCnt = CountWords(pText);
  pWords = malloc(WordCnt * sizeof(*pWords));
  pWordLengths = malloc(WordCnt * sizeof(*pWordLengths));
  FindWords(pText, WordCnt, pWords, pWordLengths);
  printf("Input Text: '"%s'"'n", pText);
  printf("Line Width: %d'n", LineWidth);
  printf("Words     : %d'n", WordCnt);
#if 0
  for (i = 0; i < WordCnt; i++)
  {
    printf("'"");
    PrintWord(pWords[i], pWordLengths[i]);
    printf("'"'n");
  }
#endif
  // Build c(i,j) in pC[]
  pC = malloc(WordCnt * WordCnt * sizeof(int));
  for (i = 0; i < WordCnt; i++)
  {
    for (j = 0; j < WordCnt; j++)
      if (j >= i)
      {
        int k;
        int c = LineWidth - (j - i);
        for (k = i; k <= j; k++) c -= pWordLengths[k];
        c = (c >= 0) ? c * c : INT_MAX;
        pC[j * WordCnt + i] = c;
      }
      else
        pC[j * WordCnt + i] = INT_MAX;
  }
  // Build f(j) in pF[] and store the wrap points in pW[]
  pF = malloc(WordCnt * sizeof(int));
  pW = malloc(WordCnt * sizeof(int));
  for (j = 0; j < WordCnt; j++)
  {
    pW[j] = 0;
    if ((pF[j] = pC[j * WordCnt]) == INT_MAX)
    {
      int k;
      for (k = 0; k < j; k++)
      {
        int s;
        if (pF[k] == INT_MAX || pC[j * WordCnt + k + 1] == INT_MAX)
          s = INT_MAX;
        else
          s = pF[k] + pC[j * WordCnt + k + 1];
        if (pF[j] > s)
        {
          pF[j] = s;
          pW[j] = k + 1;
        }
      }
    }
  }
  // Print the optimal solution cost
  printf("f         : %d'n", pF[WordCnt - 1]);
  // Print the optimal solution, if any
  pS = malloc(2 * WordCnt * sizeof(int));
  if (pF[WordCnt - 1] != INT_MAX)
  {
    // Work out the solution's words by back tracking the
    // wrap points from pW[] and store them on the pS[] stack
    j = WordCnt - 1;
    for (;;)
    {
      *pS++ = j;
      *pS++ = pW[j];
      if (!pW[j]) break;
      j = pW[j] - 1;
    }
    // Print the solution line by line, word by word
    // in direct order
    do
    {
      int k;
      i = *--pS;
      j = *--pS;
      for (k = i; k <= j; k++)
      {
        PrintWord(pWords[k], pWordLengths[k]);
        printf(" ");
      }
      printf("'n");
    } while (j < WordCnt - 1);
  }
  return 0;
}

产出1:

ww.exe
Input Text: "How to do things"
Line Width: 6
Words     : 4
f         : 10
How
to do
things

产出2:

ww.exe "aaa bb cc ddddd" 6
Input Text: "aaa bb cc ddddd"
Line Width: 6
Words     : 4
f         : 11
aaa
bb cc
ddddd

产出3:

ww.exe "I started a bounty for this. Is it possible that do not exist any public implementation of Minimum raggedness algorithm in any procedural language? I need something written in a way that can be translated into procedural instructions. All I can find now is just a bounch of (generic) equation that however need a optimal searhing procedure. I will be grateful also for an implementation that can only approximate that optimal searching algorithm." 60
Input Text: "I started a bounty for this. Is it possible that do not exist any public implementation of Minimum raggedness algorithm in any procedural language? I need something written in a way that can be translated into procedural instructions. All I can find now is just a bounch of (generic) equation that however need a optimal searhing procedure. I will be grateful also for an implementation that can only approximate that optimal searching algorithm."
Line Width: 60
Words     : 73
f         : 241
I started a bounty for this. Is it possible that do not
exist any public implementation of Minimum raggedness
algorithm in any procedural language? I need something
written in a way that can be translated into procedural
instructions. All I can find now is just a bounch of
(generic) equation that however need a optimal searhing
procedure. I will be grateful also for an implementation
that can only approximate that optimal searching algorithm.
我认为最简单的

看待它的方法 - 是在限制之间进行迭代

例如

/** 
 * balancedWordWrap
 *
 * @param string $input
 * @param int $maxWidth the max chars per line
 */
function balancedWordWrap($input, $maxWidth = null) {
    $length = strlen($input);
    if (!$maxWidth) {
        $maxWidth = min(ceil($length / 2), 75);
    }   
    $minWidth = min(ceil($length / 2), $maxWidth / 2); 
    $permutations = array();
    $scores = array();
    $lowestScore = 999;
    $lowest = $minWidth;
    foreach(range($minWidth, $maxWidth) as $width) {
        $permutations[$width] = wordwrap($input, $width);
        $lines = explode("'n", $permutations[$width]);
        $max = 0;
        foreach($lines as $line) {
            $lineLength = strlen($line);
            if ($lineLength > $max) {
                $max = $lineLength;
            }   
        }   
        $score = 0;
        foreach($lines as $line) {
            $lineLength = strlen($line);
            $score += pow($max - $lineLength, 2); 
        }   
        $scores[$width] = $score;
        if ($score < $lowestScore) {
            $lowestScore = $score;
            $lowest = $width;
        }   
    }   
    return $permutations[$lowest];
} 

给定输入"如何做事">

它输出

How
to do
things

给定输入"玛丽有一只小羊羔">

它输出

玛丽有一个小羊羔

给定输入"这个超长的段落是为了演示fmt(1)程序如何处理较长的输入。测试输入时,您不希望它们太短,也不要太长,因为程序的质量只能在检查复杂内容时确定。敏捷的棕色狐狸跳过懒惰的狗。国会不得制定尊重宗教信仰或禁止宗教自由的法律;或剥夺言论或新闻自由;或人民和平集会的权利,以及向政府请愿以纠正冤情的权利",并限制为最大宽度为 75 个字符,它输出:

这个超长的段落是为了演示"fmt(1("如何程序处理较长的输入。测试输入时,您不需要它们太短,也不能太长,因为程序的质量只能在检查复杂内容物时确定。快速的棕色狐狸跳跃在懒惰的狗身上。国会不得制定尊重机构的法律宗教信仰,或禁止宗教自由行使宗教信仰;或废除言论自由或新闻自由;或人民和平的权利集会,并向政府请愿,要求纠正冤情。

贾斯汀与高德纳的《将段落分成几行》的链接是历史上最好的答案。(较新的系统还应用微排版技术,例如摆弄字符宽度、字距调整等,但如果您只是在寻找等宽纯文本,这些额外的方法将无济于事。

如果你只是想解决这个问题,自由软件基金会在许多Linux系统上提供的fmt(1)实用程序实现了Knuth算法的一个变体,该算法也试图避免句子末尾的换行符。我写了你的输入和一个更大的例子,并将它们运行fmt -w 20以强制 20 个字符的行:

$ fmt -w 20 input 
Lorem ipsum dolor
sit amet
Supercalifragilisticexpialidocious
and some other
small words
One long
extra-long-word
This extra-long
paragraph
was writtin to
demonstrate how the
`fmt(1)` program
handles longer
inputs. When
testing inputs,
you don't want them
to be too short,
nor too long,
because the quality
of the program can
only be determined
upon inspection
of complex
content. The quick
brown fox jumps
over the lazy
dog. Congress
shall make no
law respecting
an establishment
of religion, or
prohibiting the
free exercise
thereof; or
abridging the
freedom of speech,
or of the press;
or the right of the
people peaceably
to assemble,
and to petition
the Government
for a redress of
grievances.

如果您允许它为非平凡输入的默认 75 个字符宽度,则输出看起来会好得多:

$ fmt input 
Lorem ipsum dolor sit amet
Supercalifragilisticexpialidocious and some other small words
One long extra-long-word
This extra-long paragraph was writtin to demonstrate how the `fmt(1)`
program handles longer inputs. When testing inputs, you don't want them
to be too short, nor too long, because the quality of the program can
only be determined upon inspection of complex content. The quick brown
fox jumps over the lazy dog. Congress shall make no law respecting an
establishment of religion, or prohibiting the free exercise thereof;
or abridging the freedom of speech, or of the press; or the right of
the people peaceably to assemble, and to petition the Government for a
redress of grievances.

这是一个bash版本:

#! /bin/sh
if ! [[ "$1" =~ ^[0-9]+$ ]] ; then
    echo "Usage: balance <width> [ <string> ]"
    echo " "
    echo "  if string is not passed as parameter it will be read from STDIN'n"
    exit 2
elif [ $# -le 1 ] ; then
    LINE=`cat`
else
    LINE="$2"
fi
LINES=`echo "$LINE" | fold -s -w $1 | wc -l`
MAX=$1
MIN=0
while [ $MAX -gt $(($MIN+1)) ]
do
    TRY=$(( $MAX + $MIN >> 1 ))
    NUM=`echo "$LINE" | fold -s -w $TRY | wc -l`
    if [ $NUM -le $LINES ] ; then
        MAX=$TRY
    else
        MIN=$TRY
    fi
done
echo "$LINE" | fold -s -w $MAX

例:

$ balance 50 "Now is the time for all good men to come to the aid of the party."
Now is the time for all good men 
to come to the aid of the party.

需要"折叠"和"wc",通常在安装 bash 的地方可用。