Files
zulip/zerver/lib/generate_test_data.py
Rishi Gupta 4bf8ac2498 generate_test_data: Replace source with non-Gutenberg text.
It's hard to find literature with the community tone we're going for, that
is consistent with the Zulip code of conduct, etc.

This commit removes the special tooling for Gutenberg plays, and changes the
text to be some mixture of scigen, Communications From Elsewhere,
chat.zulip.org, and various books from the public domain.
2017-08-05 11:04:35 -07:00

193 lines
5.6 KiB
Python

from __future__ import absolute_import
from __future__ import print_function
import itertools
import ujson
import random
from typing import List, Dict, Any, Text, Optional
from six.moves import range
def load_config():
# type: () -> Dict [str, Any]
with open("zerver/fixtures/config.generate_data.json", "r") as infile:
config = ujson.load(infile)
return config
def get_stream_title(gens):
# type: (Dict[str, Any]) -> str
return next(gens["adjectives"]) + " " + next(gens["nouns"]) + " " + \
next(gens["connectors"]) + " " + next(gens["verbs"]) + " " + \
next(gens["adverbs"])
def load_generators(config):
# type: (Dict[str, Any]) -> Dict[str, Any]
results = {}
cfg = config["gen_fodder"]
results["nouns"] = itertools.cycle(cfg["nouns"])
results["adjectives"] = itertools.cycle(cfg["adjectives"])
results["connectors"] = itertools.cycle(cfg["connectors"])
results["verbs"] = itertools.cycle(cfg["verbs"])
results["adverbs"] = itertools.cycle(cfg["adverbs"])
results["emojis"] = itertools.cycle(cfg["emoji"])
results["links"] = itertools.cycle(cfg["links"])
results["maths"] = itertools.cycle(cfg["maths"])
results["inline-code"] = itertools.cycle(cfg["inline-code"])
results["code-blocks"] = itertools.cycle(cfg["code-blocks"])
results["quote-blocks"] = itertools.cycle(cfg["quote-blocks"])
results["lists"] = itertools.cycle(cfg["lists"])
return results
def parse_file(config, gens, corpus_file):
# type: (Dict[str, Any], Dict[str, Any], str) -> List[str]
# First, load the entire file into a dictionary,
# then apply our custom filters to it as needed.
paragraphs = [] # type: List[str]
with open(corpus_file, "r") as infile:
# OUR DATA: we need to seperate the person talking and what they say
paragraphs = remove_line_breaks(infile)
paragraphs = add_flair(paragraphs, gens)
return paragraphs
def get_flair_gen(length):
# type: (int) -> List[str]
# Grab the percentages from the config file
# create a list that we can consume that will guarantee the distribution
result = []
for k, v in config["dist_percentages"].items():
result.extend([k] * int(v * length / 100))
result.extend(["None"] * (length - len(result)))
random.shuffle(result)
return result
def add_flair(paragraphs, gens):
# type: (List[str], Dict[str, Any]) -> List[str]
# roll the dice and see what kind of flair we should add, if any
results = []
flair = get_flair_gen(len(paragraphs))
for i in range(len(paragraphs)):
key = flair[i]
if key == "None":
txt = paragraphs[i]
elif key == "italic":
txt = add_md("*", paragraphs[i])
elif key == "bold":
txt = add_md("**", paragraphs[i])
elif key == "strike-thru":
txt = add_md("~~", paragraphs[i])
elif key == "quoted":
txt = ">" + paragraphs[i]
elif key == "quote-block":
txt = paragraphs[i] + "\n" + next(gens["quote-blocks"])
elif key == "inline-code":
txt = paragraphs[i] + "\n" + next(gens["inline-code"])
elif key == "code-block":
txt = paragraphs[i] + "\n" + next(gens["code-blocks"])
elif key == "math":
txt = paragraphs[i] + "\n" + next(gens["maths"])
elif key == "list":
txt = paragraphs[i] + "\n" + next(gens["lists"])
elif key == "emoji":
txt = add_emoji(paragraphs[i], next(gens["emojis"]))
elif key == "link":
txt = add_link(paragraphs[i], next(gens["links"]))
elif key == "picture":
txt = txt # TODO: implement pictures
results.append(txt)
return results
def add_md(mode, text):
# type: (str, str) -> str
# mode means: bold, italic, etc.
# to add a list at the end of a paragraph, * iterm one\n * item two
# find out how long the line is, then insert the mode before the end
vals = text.split()
start = random.randrange(len(vals))
end = random.randrange(len(vals) - start) + start
vals[start] = mode + vals[start]
vals[end] = vals[end] + mode
return " ".join(vals).strip()
def add_emoji(text, emoji):
# type: (str, str) -> str
vals = text.split()
start = random.randrange(len(vals))
vals[start] = vals[start] + " " + emoji + " "
return " ".join(vals)
def add_link(text, link):
# type: (str, str) -> str
vals = text.split()
start = random.randrange(len(vals))
vals[start] = vals[start] + " " + link + " "
return " ".join(vals)
def remove_line_breaks(fh):
# type: (Any) -> List[str]
# We're going to remove line breaks from paragraphs
results = [] # save the dialogs as tuples with (author, dialog)
para = [] # we'll store the lines here to form a paragraph
for line in fh:
text = line.strip()
if text != "":
para.append(text)
else:
if para:
results.append(" ".join(para))
# reset the paragraph
para = []
if para:
results.append(" ".join(para))
return results
def write_file(paragraphs, filename):
# type: (List[str], str) -> None
with open(filename, "w") as outfile:
outfile.write(ujson.dumps(paragraphs))
def create_test_data():
# type: () -> None
gens = load_generators(config) # returns a dictionary of generators
paragraphs = parse_file(config, gens, config["corpus"]["filename"])
write_file(paragraphs, "var/test_messages.json")
config = load_config() # type: Dict[str, Any]
if __name__ == "__main__":
create_test_data() # type: () -> ()