View Javadoc
1   package org.davidmoten.text.utils;
2   
3   import java.io.BufferedReader;
4   import java.io.File;
5   import java.io.FileInputStream;
6   import java.io.FileNotFoundException;
7   import java.io.FileOutputStream;
8   import java.io.IOException;
9   import java.io.InputStream;
10  import java.io.InputStreamReader;
11  import java.io.OutputStreamWriter;
12  import java.io.Reader;
13  import java.io.StringWriter;
14  import java.io.Writer;
15  import java.nio.charset.Charset;
16  import java.nio.charset.StandardCharsets;
17  import java.util.ArrayList;
18  import java.util.HashSet;
19  import java.util.List;
20  import java.util.Set;
21  import java.util.function.Function;
22  
23  import com.github.davidmoten.guavamini.Preconditions;
24  import com.github.davidmoten.guavamini.annotations.VisibleForTesting;
25  
26  public final class WordWrap {
27  
private WordWrap() {
    // utility class: prevent instantiation
}

/**
 * Non-alphabetic characters that are treated as word characters by default
 * (quotes, typographic quotes and common sentence punctuation).
 */
private static final String SPECIAL_WORD_CHARS = "\"\'\u2018\u2019\u201C\u201D?./!,;:_";

/**
 * Default set of extra word characters used by a {@link Builder} that has not
 * been given its own set. The set is shared between builders and is therefore
 * exposed as an unmodifiable view: attempts to add or remove elements throw
 * {@link UnsupportedOperationException} instead of silently changing the
 * default for every subsequent wrap.
 */
public static final Set<Character> SPECIAL_WORD_CHARS_SET_DEFAULT = java.util.Collections
        .unmodifiableSet(toSet(SPECIAL_WORD_CHARS));

/** Default width function: the number of characters in the sequence. */
private static final Function<CharSequence, Number> STRING_WIDTH_DEFAULT = CharSequence::length;

/** ASCII punctuation characters recognised by {@code isPunctuation}. */
private static final String PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
39  
40      /**
41       * Sets the source to be wrapped and returns a builder to specify more
42       * parameters.
43       * 
44       * @param reader source to be wrapped
45       * @return builder
46       */
47      public static Builder from(Reader reader) {
48          return from(reader, false);
49      }
50  
51      /**
52       * Sets the source to be wrapped as a classpath resource which will be read
53       * using the UTF-8 character set. Returns a builder to specify more parameters.
54       * Uses an 8192 byte buffer for reading.
55       * 
56       * @param resource source to be wrapped as a classpath resource
57       * @return builder
58       */
59      public static Builder fromClasspathUtf8(String resource) {
60          return fromClasspath(resource, StandardCharsets.UTF_8);
61      }
62  
63      /**
64       * Sets the source to be wrapped as a classpath resource to be read using the
65       * given character set. Returns a builder to specify more parameters. Uses an
66       * 8192 byte buffer for reading.
67       * 
68       * @param resource classpath resource name
69       * @param charset  charset to use for reading
70       * @return builder
71       */
72      public static Builder fromClasspath(String resource, Charset charset) {
73          return new Builder(new BufferedReader(
74                  new InputStreamReader(WordWrap.class.getResourceAsStream(resource), charset)),
75                  true);
76      }
77  
78      /**
79       * Sets the the source to be wrapped and returns a builder to specify more
80       * parameters. Uses an 8192 byte buffer for reading.
81       * 
82       * @param text text to be wrapped
83       * @return builder
84       */
85      public static Builder from(CharSequence text) {
86          return from(new BufferedReader(new CharSequenceReader(text)), true);
87      }
88  
89      /**
90       * Sets the source to be wrapped. Returns a builder to specify more parameters.
91       * Uses an 8192 byte buffer for reading.s
92       * 
93       * @param in source to be wrapped
94       * @return builder
95       */
96      public static Builder fromUtf8(InputStream in) {
97          return from(in, StandardCharsets.UTF_8);
98      }
99  
100     /**
101      * Sets the source to be wrapped and the character set to be used to read it.
102      * Uses an 8192 byte buffer for reading. Returns a builder to specify more
103      * parameters.
104      * 
105      * @param in      source to be wrapped
106      * @param charset encoding
107      * @return builder
108      */
109     public static Builder from(InputStream in, Charset charset) {
110         return from(new BufferedReader(new InputStreamReader(in, charset)));
111     }
112 
113     /**
114      * Sets the source to be wrapped and the character set to be used to read it.
115      * Uses an 8192 byte buffer for reading. Returns a builder to specify more
116      * parameters.
117      * 
118      * @param file    file to be read
119      * @param charset charset of the text in the source file
120      * @return builder
121      */
122     public static Builder from(File file, Charset charset) {
123         try {
124             return from(
125                     new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)),
126                     true);
127         } catch (FileNotFoundException e) {
128             throw new IORuntimeException(e);
129         }
130     }
131 
132     @VisibleForTesting
133     static Builder from(Reader reader, boolean close) {
134         return new Builder(reader, close);
135     }
136 
137     /**
138      * Provides method chaining for specifying parameters to word wrap.
139      */
140     public static final class Builder {
141 
/** Source of the text to be wrapped. */
private final Reader reader;

/** Whether {@link #reader} is closed once a wrap call has finished. */
private final boolean closeReader;

/** Maximum line width as measured by {@link #stringWidth}; defaults to 80. */
private Number maxWidth = 80;

/** Measures the width of a character sequence; defaults to its length. */
private Function<? super CharSequence, ? extends Number> stringWidth = STRING_WIDTH_DEFAULT;

/** Non-alphabetic characters that count as word characters. */
private Set<Character> extraWordChars = SPECIAL_WORD_CHARS_SET_DEFAULT;

/** Line delimiter written between output lines; defaults to line feed. */
private String newLine = "\n";

/** Whether a hyphen is written where a word is broken across lines. */
private boolean insertHyphens = true;

/** Whether words wider than {@link #maxWidth} are broken across lines. */
private boolean breakWords = true;

/**
 * Creates a builder with default settings.
 * 
 * @param reader      source of the text to be wrapped
 * @param closeReader whether to close {@code reader} after wrapping
 */
Builder(Reader reader, boolean closeReader) {
    this.closeReader = closeReader;
    this.reader = reader;
}
155 
156         /**
157          * Sets the maximum width of a line using the {@code stringWidth} function. Word
158          * wrapping/splitting will be attempted for lines with greater than
159          * {@code maxWidth}. If not set the default is 80.
160          * 
161          * @param maxWidth maximum width of a line using the {@code stringWidth}
162          *                 function.
163          * @return this
164          * @throws IllegalArgumentException if {@code maxWidth} is less than or equal to
165          *                                  zero
166          */
167         public Builder maxWidth(Number maxWidth) {
168             Preconditions.checkArgument(maxWidth.doubleValue() > 0);
169             this.maxWidth = maxWidth;
170             return this;
171         }
172 
173         /**
174          * Sets the string width function used to determine if a line is at maximum
175          * width (and therefore needing wrapping or splitting). If not set the string
176          * width function is the number of characters.
177          * 
178          * @param stringWidth function that returns the width of a sequence of
179          *                    characters
180          * @return this
181          */
182         public Builder stringWidth(Function<? super CharSequence, ? extends Number> stringWidth) {
183             this.stringWidth = stringWidth;
184             return this;
185         }
186 
187         /**
188          * Sets the newLine string to be used. If not set the default is '\n' (line feed
189          * character).
190          * 
191          * @param newLine string to be output on for a new line delimiter
192          * @return this
193          */
194         public Builder newLine(String newLine) {
195             this.newLine = newLine;
196             return this;
197         }
198 
199         /**
200          * Sets all extra word characters (characters that will be treated like normal
201          * alphabetic characters for defining word boundaries).
202          * 
203          * @param extraWordChars extra word characters (in addtion to alphabetic
204          *                       characters)
205          * @return this
206          */
207         public Builder extraWordChars(Set<Character> extraWordChars) {
208             this.extraWordChars = extraWordChars;
209             return this;
210         }
211 
212         /**
213          * Sets all extra word characters (characters that will be treated like normal
214          * alphabetic characters for defining word boundaries).
215          * 
216          * @param extraWordChars extra word characters (in addtion to alphabetic
217          *                       characters)
218          * @return this
219          */
220         public Builder extraWordChars(String extraWordChars) {
221             return extraWordChars(toSet(extraWordChars));
222         }
223 
224         /**
225          * Adds more word characters (characters that will be treated like normal
226          * alphabetic characters for defining word boundaries).
227          * 
228          * @param includeWordChars more word characters
229          * @return this
230          */
231         public Builder includeExtraWordChars(String includeWordChars) {
232             prepareExtraWordCharsForMutation();
233             this.extraWordChars.addAll(toSet(includeWordChars));
234             return this;
235         }
236 
237         /**
238          * Adds extra word characters to be excluded. Alphabetic characters are always
239          * word characters and thus will be ignored here.
240          * 
241          * @param excludeWordChars extra word characters to be excluded
242          * @return this
243          */
244         public Builder excludeExtraWordChars(String excludeWordChars) {
245             prepareExtraWordCharsForMutation();
246             this.extraWordChars.removeAll(toSet(excludeWordChars));
247             return this;
248         }
249 
250         /**
251          * If we want to mutate the default extraWordChars set then we replace it with a mutable
252          * set just for use by this builder. This is done lazily as a perf enhancement
253          * (reduces allocations if a lot of calls to wrap are being made).
254          */
255         private void prepareExtraWordCharsForMutation() {
256             if (this.extraWordChars == SPECIAL_WORD_CHARS_SET_DEFAULT) {
257                 this.extraWordChars = new HashSet<>(SPECIAL_WORD_CHARS_SET_DEFAULT);
258             }
259         }
260 
261         /**
262          * Sets if to break words using a hyphen character. If set to false then no
263          * breaking character will be used.
264          * 
265          * @param insertHyphens whether to break hyphens
266          * @return this
267          */
268         public Builder insertHyphens(boolean insertHyphens) {
269             this.insertHyphens = insertHyphens;
270             return this;
271         }
272 
273         /**
274          * If a word is longer than {@code maxWidth} and {@code breakWords} is true then
275          * such a word will be broken across two or more lines (with or without a hyphen
276          * according to {@link Builder#insertHyphens(boolean)}).
277          * 
278          * @param breakWords if true then break words across lines
279          * @return this
280          */
281         public Builder breakWords(boolean breakWords) {
282             this.breakWords = breakWords;
283             return this;
284         }
285 
286         /**
287          * Performs the wrapping of the source text and writes output to the given
288          * {@link Writer}.
289          * 
290          * @param out output for wrapped text
291          */
292         public void wrap(Writer out) {
293             try {
294                 wordWrap(reader, out, newLine, maxWidth, stringWidth, extraWordChars, insertHyphens,
295                         breakWords);
296             } catch (IOException e) {
297                 throw new IORuntimeException(e);
298             } finally {
299                 if (closeReader) {
300                     close(reader);
301                 }
302             }
303         }
304         
305         public List<String> wrapToList() {
306             List<String> lines = new ArrayList<>();
307             StringBuilder b = new StringBuilder();
308             boolean[] building = new boolean[1];
309             wrap(new LineConsumer() {
310 
311                 @Override
312                 public void write(char[] chars, int offset, int length) throws IOException {
313                     building[0] = true;
314                     b.append(chars, offset, length);
315                 }
316                 
317                 @Override
318                 public void writeNewLine() throws IOException {
319                     lines.add(b.toString());
320                     b.setLength(0);
321                     building[0] = false;
322                 }
323             });
324             if (building[0]) {
325                 lines.add(b.toString());
326             }
327             return lines;
328         }
329         
330         public void wrap(LineConsumer consumer) {
331             try {
332                 wordWrap(reader, consumer, maxWidth, stringWidth, extraWordChars, insertHyphens,
333                         breakWords);
334             } catch (IOException e) {
335                 throw new IORuntimeException(e);
336             } finally {
337                 if (closeReader) {
338                     close(reader);
339                 }
340             }
341         }
342         
343         /**
344          * Performs the wrapping of the source text and writes output to the given file
345          * with the given character set encoding.
346          * 
347          * @param file    file to receive wrapped output
348          * @param charset encoding to use for output
349          */
350         public void wrap(File file, Charset charset) {
351             try (Writer writer = new OutputStreamWriter(new FileOutputStream(file), charset)) {
352                 wrap(writer);
353             } catch (IOException e) {
354                 throw new IORuntimeException(e);
355             }
356         }
357 
358         /**
359          * Performs the wrapping of the source text and writes the output to the given
360          * file using UTF-8 encoding.
361          * 
362          * @param file output file for wrapped text
363          */
364         public void wrapUtf8(File file) {
365             wrap(file, StandardCharsets.UTF_8);
366         }
367 
368         /**
369          * Performs the wrapping of the source text and writes the output to a file with
370          * the given filename.
371          * 
372          * @param filename output file for wrapped text
373          */
374         public void wrapUtf8(String filename) {
375             wrapUtf8(new File(filename));
376         }
377 
378         /**
379          * Performs the wrapping of the source text and writes the output to a file with
380          * the given filename using the given encoding.
381          * 
382          * @param filename output file for the wrapped text
383          * @param charset  encoding to use for output
384          */
385         public void wrap(String filename, Charset charset) {
386             wrap(new File(filename), charset);
387         }
388 
389         /**
390          * Performs the wrapping of the source text and returns output as a String.
391          * 
392          * @return wrapped text
393          */
394         public String wrap() {
395             // close not required
396            StringWriter out = new StringWriter();
397            wrap(out);
398            return out.toString();
399         }
400     }
401 
402     @VisibleForTesting
403     static void close(Reader reader) {
404         try {
405             reader.close();
406         } catch (IOException e) {
407             throw new IORuntimeException(e);
408         }
409     }
410 
411     private static Set<Character> toSet(String chars) {
412         Set<Character> set = new HashSet<Character>();
413         for (int i = 0; i < chars.length(); i++) {
414             set.add(chars.charAt(i));
415         }
416         return set;
417     }
418     
419     static void wordWrap(Reader in, Writer out, String newLine, Number maxWidth,
420             Function<? super CharSequence, ? extends Number> stringWidth,
421             Set<Character> extraWordChars, boolean insertHyphens, boolean breakWords)
422             throws IOException {
423         LineConsumer consumer = new LineConsumer() {
424 
425             @Override
426             public void write(String s) throws IOException {
427                 out.write(s);
428             }
429 
430             @Override
431             public void write(char[] chars, int start, int length) throws IOException {
432                 out.write(chars, start, length);
433             }
434 
435             @Override
436             public void writeNewLine() throws IOException {
437                 out.write(newLine);
438             }
439             
440         };
441         wordWrap(in, consumer,  maxWidth, stringWidth, extraWordChars, insertHyphens, breakWords);
442     }
443     
/**
 * Core wrapping loop. Reads {@code in} one character at a time, accumulating
 * characters into a current word and a current line, and emits lines to
 * {@code out} whenever the line plus the word becomes wider than
 * {@code maxWidth} or a line feed is read.
 * 
 * @param in             source text
 * @param out            receiver of wrapped text and new line events
 * @param maxWidth       maximum line width as measured by {@code stringWidth}
 * @param stringWidth    width function
 * @param extraWordChars non-alphabetic characters treated as word characters
 * @param insertHyphens  whether to write a hyphen where a word is broken
 * @param breakWords     whether over-wide words are broken across lines
 * @throws IOException if reading from {@code in} or writing to {@code out}
 *                     fails
 */
static void wordWrap(Reader in, LineConsumer out, Number maxWidth,
        Function<? super CharSequence, ? extends Number> stringWidth,
        Set<Character> extraWordChars, boolean insertHyphens, boolean breakWords)
        throws IOException {
    // text accumulated for the current output line
    StringBuilder2 line = new StringBuilder2();
    // text of the token currently being read (not yet part of line)
    StringBuilder2 word = new StringBuilder2();
    // created once and measured after line/word are mutated, so this is
    // presumably a live, right-trimmed view of line followed by word
    // (CharSequenceConcatRightTrim is not visible here; TODO confirm)
    CharSequence lineAndWordRightTrim = concatRightTrim(line, word);
    double maxWidthDouble = maxWidth.doubleValue();
    // set once a width-driven break has happened on the current input line;
    // while set, leading whitespace of the following text is trimmed
    boolean broken = false;
    boolean isWordCharacter = false;
    boolean previousWasPunctuation = false;
    while (true) {
        int c = in.read();
        if (c == -1) {
            // end of input
            break;
        }
        char ch = (char) c;
        isWordCharacter = Character.isLetter(ch) || extraWordChars.contains(ch);
        if (ch == '\n') {
            // explicit line break in the source: flush line + word as one line
            line.append(word);
            if (tooLong(stringWidth, line, maxWidthDouble)) {
                line.rightTrim();
            }
            // whitespace-only lines are emitted as empty lines
            if (!isWhitespace(line)) {
                out.write(line.internalArray(), 0, line.length());
            }
            out.writeNewLine();
            word.setLength(0);
            line.setLength(0);
            broken = false;
        } else if (ch == '\r') {
            // ignore carriage return
        } else if (isWordCharacter && !previousWasPunctuation) {
            // the character continues the current word
            word.append(ch);
            if (broken && line.length() == 0) {
                // at the start of a line created by a break: drop leading whitespace
                leftTrim(word);
            }
            if (tooLong(stringWidth, lineAndWordRightTrim, maxWidthDouble)) {
                if (line.length() > 0) {
                    // emit the line without the current word; the word moves to the next line
                    writeLine(out, line);
                    leftTrim(word);
                    if (tooLong(stringWidth, word, maxWidthDouble)) {
                        // the word alone is wider than a line
                        if (breakWords) {
                            writeBrokenWord(out, word, insertHyphens);
                        } else {
                            broken = true;
                        }
                    } else {
                        broken = true;
                    }
                } else {
                    // nothing on the line yet: the word alone is wider than a line
                    if (breakWords) {
                        writeBrokenWord(out, word, insertHyphens);
                    } else {
                        broken = true;
                    }
                }
            }
        } else {
            // non-word character, or a character directly after punctuation:
            // the previous word (if it has visible content) is complete
            if (word.length() > 0 && !isWhitespace(word)) {
                appendWordToLine(line, word);
                if (broken) {
                    leftTrim(line);
                }
            }
            // this character is appended to whatever remains in word
            word.append(ch);
            if (tooLong(stringWidth, lineAndWordRightTrim, maxWidthDouble)) {
                if (!isWhitespace(line)) {
                    writeLine(out, line);
                } else {
                    // a whitespace-only line is discarded rather than emitted
                    line.setLength(0);
                }
                broken = true;
            }
        }
        // punctuation listed in extraWordChars does not end the current word
        previousWasPunctuation = isPunctuation(ch) && !extraWordChars.contains(ch);
    }
    // end of input: flush what remains, without a trailing new line
    if (line.length() > 0) {
        String s = line.toString() + word.toString();
        if (broken) {
            s = leftTrim(s);
        }
        out.write(s);
    } else {
        if (broken) {
            leftTrim(word);
        }
        if (!isWhitespace(word)) {
            out.write(word.internalArray(), 0, word.length());
        }
    }
}
536 
537     private static CharSequence concatRightTrim(CharSequence a, CharSequence b) {
538         return new CharSequenceConcatRightTrim(a, b);
539     }
540 
541     private static boolean isPunctuation(char ch) {
542         return PUNCTUATION.indexOf(ch) != -1;
543     }
544 
545     private static boolean tooLong(Function<? super CharSequence, ? extends Number> stringWidth,
546             CharSequence s, double maxWidthDouble) {
547         return stringWidth.apply(s).doubleValue() > maxWidthDouble;
548     }
549 
550     @VisibleForTesting
551     static CharSequence rightTrim(CharSequence s) {
552         int i = s.length();
553         while (i > 0) {
554             if (Character.isWhitespace(s.charAt(i - 1))) {
555                 i--;
556             } else {
557                 break;
558             }
559         }
560         if (i != s.length()) {
561             return s.subSequence(0, i);
562         } else {
563             return s;
564         }
565     }
566 
567     static boolean isWhitespace(CharSequence s) {
568         for (int i = 0; i < s.length(); i++) {
569             if (!Character.isWhitespace(s.charAt(i))) {
570                 return false;
571             }
572         }
573         return true;
574     }
575 
576     @VisibleForTesting
577     static void leftTrim(StringBuilder2 word) {
578         // trim leading spaces on the word
579         // because we have inserted a new line
580         int i;
581         for (i = 0; i < word.length(); i++) {
582             if (!Character.isWhitespace(word.charAt(i))) {
583                 break;
584             }
585         }
586         if (i < word.length() && i > 0) {
587             word.delete(0, i);
588         }
589     }
590 
591     private static String leftTrim(String s) {
592         StringBuilder2 b = new StringBuilder2(s);
593         leftTrim(b);
594         return b.toString();
595     }
596 
597     private static void appendWordToLine(StringBuilder2 line, StringBuilder2 word) {
598         line.append(word);
599         word.setLength(0);
600     }
601 
602     private static void writeBrokenWord(LineConsumer out, StringBuilder2 word, boolean insertHyphens) throws IOException {
603         // to be really thorough we'd check the new stringWidth with '-' but let's not
604         // bother for now
605         String x;
606         if (insertHyphens && word.length() > 2
607                 && !isWhitespace((x = word.substring(0, word.length() - 2)))) {
608             out.write(x);
609             out.write("-");
610             out.writeNewLine();
611             word.delete(0, word.length() - 2);
612         } else {
613             String prefix = word.substring(0, word.length() - 1);
614             if (!isWhitespace(prefix)) {
615                 out.write(prefix);
616             }
617             out.writeNewLine();
618             word.delete(0, word.length() - 1);
619         }
620     }
621 
622     private static void writeLine(LineConsumer out, StringBuilder2 line)
623             throws IOException {
624         out.write(line.internalArray(), 0, line.length());
625         out.writeNewLine();
626         line.setLength(0);
627     }
628 }