001/**
002 * Logback: the reliable, generic, fast and flexible logging framework.
003 * Copyright (C) 1999-2015, QOS.ch. All rights reserved.
004 *
005 * This program and the accompanying materials are dual-licensed under
006 * either the terms of the Eclipse Public License v1.0 as published by
007 * the Eclipse Foundation
008 *
009 *   or (per the licensee's choosing)
010 *
011 * under the terms of the GNU Lesser General Public License version 2.1
012 * as published by the Free Software Foundation.
013 */
014package ch.qos.logback.classic.corpus;
015
016import java.io.BufferedReader;
017import java.io.FileReader;
018import java.io.IOException;
019import java.io.InputStream;
020import java.io.InputStreamReader;
021import java.net.URL;
022import java.util.ArrayList;
023import java.util.List;
024
/**
 * Static helpers that read a text source and break it into whitespace-separated
 * tokens.
 *
 * <p>All overloads funnel into {@link #toWords(BufferedReader)}, which takes
 * ownership of the reader and closes it before returning, whether reading
 * succeeded or failed.
 */
public class TextFileUtil {

    /**
     * Reads the content found at the given URL and splits it into words.
     *
     * <p>Bytes are decoded with the platform default charset, because the
     * underlying {@link InputStreamReader} is created without an explicit one.
     *
     * @param url the location to read from
     * @return the tokens of every line, in reading order
     * @throws IOException if the stream cannot be opened or read
     */
    public static List<String> toWords(URL url) throws IOException {
        InputStream is = url.openStream();
        InputStreamReader reader = new InputStreamReader(is);
        BufferedReader br = new BufferedReader(reader);
        // toWords(BufferedReader) closes br, which in turn closes the URL stream.
        return toWords(br);
    }

    /**
     * Reads the named file and splits it into words.
     *
     * <p>Bytes are decoded with the platform default charset, because the
     * underlying {@link FileReader} is created without an explicit one.
     *
     * @param filename path of the file to read
     * @return the tokens of every line, in reading order
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> toWords(String filename) throws IOException {
        FileReader fr = new FileReader(filename);
        BufferedReader br = new BufferedReader(fr);
        // toWords(BufferedReader) closes br, which in turn closes the file.
        return toWords(br);
    }

    /**
     * Drains the reader line by line and splits each line on single whitespace
     * characters.
     *
     * <p>The split pattern is {@code \s} (not {@code \s+}), so consecutive
     * whitespace characters and leading whitespace produce empty-string tokens,
     * and a blank line contributes one empty string. Trailing empty tokens of a
     * line are dropped by {@link String#split(String)}.
     *
     * <p>The reader is always closed before this method returns, including when
     * reading fails with an exception.
     *
     * @param br the reader to consume; closed by this method
     * @return a new mutable list holding the tokens in reading order
     * @throws IOException if reading from or closing the reader fails
     */
    public static List<String> toWords(BufferedReader br) throws IOException {
        List<String> wordList = new ArrayList<String>();

        try {
            String line;
            while ((line = br.readLine()) != null) {
                String[] words = line.split("\\s");
                for (String word : words) {
                    wordList.add(word);
                }
            }
        } finally {
            // Release the underlying resource even if readLine() threw.
            br.close();
        }

        return wordList;
    }
}