diff options
| author | student <student@student-OptiPlex-9020> | 2017-03-17 10:49:27 -0400 |
|---|---|---|
| committer | student <student@student-OptiPlex-9020> | 2017-03-17 10:49:27 -0400 |
| commit | 0ea49dd4a52358f053c9be7138c392b16de05899 (patch) | |
| tree | 802e275aaf279480ee8626136f56bfa1fbab6845 /RGens/src/main/java/bjc/rgens/text/markov | |
| parent | 36cf3a0f0604ef43ce838ff6e9a7fc4e7c299522 (diff) | |
Move things around, and start on new parser.
Diffstat (limited to 'RGens/src/main/java/bjc/rgens/text/markov')
4 files changed, 370 insertions, 0 deletions
// ===== RGens/src/main/java/bjc/rgens/text/markov/Markov.java =====
package bjc.rgens.text.markov;

import java.util.Map.Entry;
import java.util.*;

/**
 * Represents a k-character substring together with the characters that
 * have been seen following it. Can give a pseudo-random suffix character
 * weighted by how often each suffix was recorded.
 *
 * @author Daniel Friedman (Fall 2011)
 *
 */
public class Markov {
    /** Shared generator, so a draw does not allocate a new Random. */
    private static final Random RNG = new Random();

    String substring;
    int count = 0;

    TreeMap<Character, Integer> map;

    /**
     * Constructs a Markov object from a given substring. The substring is
     * counted once and no suffix is recorded.
     *
     * @param substring
     *                the given substring.
     */
    public Markov(String substring) {
        this.substring = substring;

        map = new TreeMap<>();

        add();
    }

    /**
     * Constructs a Markov object from a given substring and suffix
     * character. Suffix characters are stored in a TreeMap.
     *
     * @param substring
     *                the specified substring.
     * @param suffix
     *                the specified suffix.
     */
    public Markov(String substring, Character suffix) {
        this.substring = substring;

        map = new TreeMap<>();

        add(suffix);
    }

    /**
     * Increments the count of number of times the substring appears in a
     * text.
     */
    public void add() {
        count++;
    }

    /**
     * Records one more occurrence of the substring followed by the given
     * suffix character.
     *
     * @param c
     *                the suffix character to be added.
     */
    public void add(char c) {
        add();

        // Single lookup instead of containsKey + get.
        Integer frequency = map.get(c);
        map.put(c, frequency == null ? 1 : frequency + 1);
    }

    /**
     * Gives the frequency count of a suffix character; that is, the number
     * of times the specified suffix follows the substring in a text.
     *
     * @param c
     *                the specified suffix.
     * @return the frequency count, or -1 if the suffix was never recorded.
     */
    public int getFrequencyCount(char c) {
        Integer frequency = map.get(c);

        return frequency == null ? -1 : frequency;
    }

    /**
     * Gives the ratio of a suffix's frequency count to the number of times
     * the substring was counted.
     *
     * @param c
     *                the specified suffix.
     * @return the ratio of the frequency count of the character to
     *         {@link #count()}, or -1 if the suffix was never recorded.
     */
    public double getCharFrequency(char c) {
        int frequency = getFrequencyCount(c);

        if (frequency == -1) {
            return -1;
        }

        return (double) frequency / (double) count;
    }

    /**
     * Finds whether or not the given suffix is in the TreeMap.
     *
     * @param c
     *                the given suffix.
     * @return True if the suffix exists in the TreeMap, false otherwise.
     */
    public boolean containsChar(char c) {
        return map.containsKey(c);
    }

    /**
     * Gives the number of times this substring occurs in a text.
     *
     * @return said number of times.
     */
    public int count() {
        return count;
    }

    /**
     * Gives the TreeMap of suffix characters to frequency counts. The
     * returned map is the live internal map, not a copy.
     *
     * @return the TreeMap.
     */
    public TreeMap<Character, Integer> getMap() {
        return map;
    }

    /**
     * Returns a pseudo-random character to follow the substring. Each
     * recorded suffix is chosen with probability proportional to its
     * frequency count, so more common suffixes are returned more often.
     *
     * The draw walks the cumulative counts in key order, which selects the
     * same suffix as indexing into a list holding every suffix repeated
     * "frequency" times, without building that list.
     *
     * @return the pseudo-random suffix.
     * @throws IllegalArgumentException
     *                 if no suffix has been recorded (raised by
     *                 {@link Random#nextInt(int)} for a zero bound).
     */
    public char random() {
        int total = 0;

        for (int frequency : map.values()) {
            // Non-positive counts contribute no candidates.
            if (frequency > 0) {
                total += frequency;
            }
        }

        int pick = RNG.nextInt(total);

        for (Entry<Character, Integer> entry : map.entrySet()) {
            int frequency = entry.getValue();

            if (frequency <= 0) {
                continue;
            }

            pick -= frequency;

            if (pick < 0) {
                return entry.getKey();
            }
        }

        // Unreachable: pick is always below the sum of positive counts.
        throw new AssertionError("weighted pick fell outside the suffix table");
    }

    /**
     * Gives a String representation of the Markov object.
     *
     * @return said String representation.
     */
    @Override
    public String toString() {
        StringBuilder ret = new StringBuilder();

        ret.append("Substring: ").append(substring);
        ret.append(", Count: ").append(count);
        ret.append("\n").append("Suffixes and frequency counts: ");

        for (Entry<Character, Integer> entry : map.entrySet()) {
            ret.append("\n").append("Suffix: ").append(entry.getKey());
            ret.append(", frequency count: ").append(entry.getValue());
        }

        return ret.toString();
    }
}

// ===== RGens/src/main/java/bjc/rgens/text/markov/StandaloneMarkov.java =====
package bjc.rgens.text.markov;

import java.util.Map;

/**
 * A k-order markov text generator backed by a prebuilt table that maps
 * each k-character substring to the suffixes recorded for it.
 */
public class StandaloneMarkov {
    private int k;

    private Map<String, Markov> markovHash;
    private String firstSub;

    /**
     * Create a generator from an existing model.
     *
     * @param k
     *                The markov order; the length of each lookup key.
     * @param markovHash
     *                The table from k-character substrings to their
     *                recorded suffixes.
     * @param firstSub
     *                The seed text every generated string starts with,
     *                and restarts from at a dead end.
     */
    public StandaloneMarkov(int k, Map<String, Markov> markovHash,
            String firstSub) {
        this.k = k;
        this.markovHash = markovHash;
        this.firstSub = firstSub;
    }

    /**
     * Generate text by starting from the seed and repeatedly appending a
     * random suffix of the last k characters. When the last k characters
     * have no entry in the table, the seed is appended again and
     * generation continues from there.
     *
     * @param M
     *                The number of characters to generate.
     * @return The generated text, never longer than M characters. It is
     *         empty when M is not greater than the markov order, and is
     *         just the seed when the seed itself has no entry in the
     *         table, since nothing can follow it.
     */
    public String generateTextFromMarkov(int M) {
        StringBuilder text = new StringBuilder();

        if (M <= k) {
            return text.toString();
        }

        text.append(firstSub);

        // Without an entry for the seed every restart would hit the same
        // dead end again, so stop here instead of looping forever.
        if (!markovHash.containsKey(firstSub)) {
            return text.toString();
        }

        while (text.length() < M) {
            int end = text.length();
            Markov tmp = markovHash.get(text.substring(end - k, end));

            if (tmp != null) {
                text.append(tmp.random());
            } else {
                // Dead end: restart from the seed.
                text.append(firstSub);
            }
        }

        // A restart adds the whole seed at once and may overshoot M.
        if (text.length() > M) {
            text.setLength(M);
        }

        return text.toString();
    }

}

// ===== RGens/src/main/java/bjc/rgens/text/markov/StandaloneTextGenerator.java =====
package bjc.rgens.text.markov;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

/**
 * Builds markov generators from a character source.
 */
public class StandaloneTextGenerator {

    /**
     * Build a markov generator from a provided source
     *
     * The whole source is read, and every k-character substring is
     * recorded together with the character that follows it. The first k
     * characters become the seed of the returned generator; when the
     * source is shorter than k, the whole source is the seed and the
     * model is empty.
     *
     * @param k
     *                The markov order to use
     * @param reader
     *                The source to seed the generator from
     * @return The markov generator for the provided text
     */
    public static StandaloneMarkov generateMarkovMap(int k,
            Reader reader) {
        String origFile = readSource(reader);

        // Clamp so a source shorter than k does not throw.
        String firstSub = origFile.substring(0,
                Math.min(k, origFile.length()));

        Map<String, Markov> hash = new HashMap<>();

        for (int i = 0; i < origFile.length() - k; i++) {
            String sub = origFile.substring(i, i + k);
            char suffix = origFile.charAt(i + k);

            Markov marvin = hash.get(sub);

            if (marvin == null) {
                hash.put(sub, new Markov(sub, suffix));
            } else {
                marvin.add(suffix);
            }
        }

        return new StandaloneMarkov(k, hash, firstSub);
    }

    /**
     * Read everything the reader provides, up to end of stream.
     *
     * End of stream is detected from the -1 return value of read, so
     * unassigned characters in the input no longer cut the text short.
     * On an IOException the error is reported and reading stops: the
     * program exits with status 1 when nothing had been read yet,
     * otherwise the text read so far is returned.
     */
    private static String readSource(Reader reader) {
        StringBuilder origFileBuffer = new StringBuilder();
        char[] chunk = new char[4096];

        try {
            int read = reader.read(chunk);

            while (read != -1) {
                origFileBuffer.append(chunk, 0, read);
                read = reader.read(chunk);
            }
        } catch (IOException e) {
            System.out
                    .println("IOException in stepping through the reader");
            e.printStackTrace();

            if (origFileBuffer.length() == 0) {
                System.exit(1);
            }
        }

        return origFileBuffer.toString();
    }

}
a/RGens/src/main/java/bjc/rgens/text/markov/TextGenerator.java b/RGens/src/main/java/bjc/rgens/text/markov/TextGenerator.java new file mode 100755 index 0000000..770acd9 --- /dev/null +++ b/RGens/src/main/java/bjc/rgens/text/markov/TextGenerator.java @@ -0,0 +1,69 @@ +package bjc.rgens.text.markov; + +import java.io.*; + +/** + * Generate text from a markov model of an input text + * + * @author ben + * + */ +public class TextGenerator { + /** + * @param args + * when used with three arguments, the first represents the + * k-order of the Markov objects. The second represents the + * number of characters to print out. The third represents + * the file to be read. + * + * When used with two arguments, the first represents the + * k-order of the Markov objects, and the second represents + * the file to be read. The generated text will be the same + * number of characters as the original file. + */ + public static void main(String[] args) { + int k = 0; + int M = 0; + + String file = ""; + StringBuilder text = new StringBuilder(); + + if (args.length == 3) { + k = Integer.parseInt(args[0]); + M = Integer.parseInt(args[1]); + file = args[2]; + } else if (args.length == 2) { + k = Integer.parseInt(args[0]); + file = args[1]; + } else { + System.out + .println("\n" + "Usage: java TextGenerator k M file"); + System.out.println( + "where k is the markov order, M is the number"); + System.out.println( + "of characters to be printed, and file is the"); + System.out.println( + "name of the file to print from. M may be left out." 
+ + "\n"); + System.exit(1); + } + + StandaloneMarkov markov = null; + + try (FileReader reader = new FileReader(file)) { + markov = StandaloneTextGenerator.generateMarkovMap(k, + reader); + + System.out.println(markov.generateTextFromMarkov(M) + .substring(0, Math.min(M, text.length()))); + } catch (FileNotFoundException e) { + System.out.println("File not found."); + e.printStackTrace(); + System.exit(1); + } catch (IOException ioex) { + System.out.println("IOException"); + ioex.printStackTrace(); + System.exit(1); + } + } +}
\ No newline at end of file |
