View Javadoc
1   /**
2    * Logback: the reliable, generic, fast and flexible logging framework.
3    * Copyright (C) 1999-2015, QOS.ch. All rights reserved.
4    *
5    * This program and the accompanying materials are dual-licensed under
6    * either the terms of the Eclipse Public License v1.0 as published by
7    * the Eclipse Foundation
8    *
9    *   or (per the licensee's choosing)
10   *
11   * under the terms of the GNU Lesser General Public License version 2.1
12   * as published by the Free Software Foundation.
13   */
14  package ch.qos.logback.classic.corpus;
15  
16  import java.io.BufferedReader;
17  import java.io.FileReader;
18  import java.io.IOException;
19  import java.io.InputStream;
20  import java.io.InputStreamReader;
21  import java.net.URL;
22  import java.util.ArrayList;
23  import java.util.List;
24  
/**
 * Static helpers that read a text source and break it into a flat list of
 * whitespace-separated tokens.
 */
public class TextFileUtil {

    /**
     * Reads the content behind {@code url} and splits it into words.
     *
     * <p>The bytes are decoded with the platform default charset, because the
     * {@link InputStreamReader} is created without an explicit charset.
     *
     * @param url location of the text to read
     * @return the tokens of every line, in reading order
     * @throws IOException if the stream cannot be opened or read
     */
    public static List<String> toWords(URL url) throws IOException {
        InputStream is = url.openStream();
        // toWords(BufferedReader) closes the reader, which in turn closes the stream.
        return toWords(new BufferedReader(new InputStreamReader(is)));
    }

    /**
     * Reads the file named {@code filename} and splits it into words.
     *
     * @param filename path of the file to read
     * @return the tokens of every line, in reading order
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> toWords(String filename) throws IOException {
        // toWords(BufferedReader) closes the reader, which in turn closes the file.
        return toWords(new BufferedReader(new FileReader(filename)));
    }

    /**
     * Reads {@code br} line by line until end of input and splits each line on
     * single whitespace characters (regex {@code \s}).
     *
     * <p>Because the delimiter matches exactly one whitespace character, an
     * empty line contributes one empty string, and consecutive whitespace
     * characters inside a line produce empty strings between them.
     *
     * <p>The reader is closed before this method returns, whether it completes
     * normally or {@link BufferedReader#readLine()} throws.
     *
     * @param br the reader to consume; closed by this method
     * @return the tokens of every line, in reading order
     * @throws IOException if reading from or closing the reader fails
     */
    public static List<String> toWords(BufferedReader br) throws IOException {
        List<String> wordList = new ArrayList<String>();
        try {
            String line;
            while ((line = br.readLine()) != null) {
                String[] words = line.split("\\s");
                for (String word : words) {
                    wordList.add(word);
                }
            }
        } finally {
            // Release the underlying resource even when readLine() fails.
            br.close();
        }
        return wordList;
    }
}