-
-
Save ageitgey/d7eae5eab1be8eaad48ac387faf49831 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""DeepFaking the News with NLP and Transformer Models | |
Original file is located at | |
https://colab.research.google.com/drive/1VI3oBIOQYsym2x5oOux7DTNhpdR0r4uw | |
### Step 1: Configuration | |
""" | |
# Your Wordpress Blog where the fake articles will be posted | |
WORDPRESS_BLOG_API_ENDPOINT = "https://your-domain-name.com/?rest_route=/wp/v2" | |
WORDPRESS_USER = 'YOUR_WORDPRESS_USERNAME' | |
WORDPRESS_APP_PASSWORD = 'YOUR_APP_PASSWORD_HERE' | |
# Fake person who will be slandered/libeled in the fake articles | |
NAME_TO_SLANDER = "John McFakeson" | |
IMAGE_TO_SLANDER = "https://cdn-images-1.medium.com/max/1600/1*P8FfDY2TXPR0bZ0XIJYRWw.jpeg" | |
SLANDEROUS_SEED_HEADLINES = [ | |
f"{NAME_TO_SLANDER} convicted of stealing puppies", | |
f"{NAME_TO_SLANDER} caught lying about growing the world's largest watermelon", | |
f"{NAME_TO_SLANDER} accused of stealing priceless artifacts from Egypt", | |
f"{NAME_TO_SLANDER} forged priceless works of modern art for decades", | |
f"{NAME_TO_SLANDER} claimed to be Pokemon master, but caught in a lie", | |
f"{NAME_TO_SLANDER} bought fake twitter followers to pretend to be a celebrity", | |
f"{NAME_TO_SLANDER} caught in the act robbing a pet store", | |
f"{NAME_TO_SLANDER} revealed as a foriegn spy for the undersea city of Atlantis", | |
f"{NAME_TO_SLANDER} involved in blackmail scandal with King Trident of Atlantis", | |
f"{NAME_TO_SLANDER} hid past crimes to get elected as Mayor of Otter Town", | |
f"{NAME_TO_SLANDER} lied on tax returns to cover up past life as a Ninja Turtle", | |
f"{NAME_TO_SLANDER} stole billions from investors in a new pet store", | |
f"{NAME_TO_SLANDER} claims to be a Ninja Turtle but was actually lying", | |
f"{NAME_TO_SLANDER} likely to be sentenced to 20 years in jail for chasing a cat into a tree", | |
f"{NAME_TO_SLANDER} recieves record prison sentence for offensive smell", | |
f"{NAME_TO_SLANDER} commits a multitude of crimes against dinosaurs", | |
] | |
# Which news website to 'clone' | |
DOMAIN_STYLE_TO_COPY = "www.nytimes.com" | |
RSS_FEEDS_OF_REAL_STORIES_TO_EMULATE = [ | |
"https://rss.nytimes.com/services/xml/rss/nyt/US.xml", | |
] | |
"""### Step 2: Download Grover code and install requirements""" | |
# Commented out IPython magic to ensure Python compatibility. | |
# %cd /content | |
!git clone https://github.com/rowanz/grover.git | |
# %cd /content/grover | |
!python3 -m pip install regex jsonlines twitter-text-python feedparser | |
"""### Step 3: Download Grover Pre-Trained 'Mega' Model""" | |
import os | |
import requests | |
model_type = "mega" | |
model_dir = os.path.join('/content/grover/models', model_type) | |
if not os.path.exists(model_dir): | |
os.makedirs(model_dir) | |
for ext in ['data-00000-of-00001', 'index', 'meta']: | |
r = requests.get(f'https://storage.googleapis.com/grover-models/{model_type}/model.ckpt.{ext}', stream=True) | |
with open(os.path.join(model_dir, f'model.ckpt.{ext}'), 'wb') as f: | |
file_size = int(r.headers["content-length"]) | |
if file_size < 1000: | |
raise ValueError("File doesn't exist? idk") | |
chunk_size = 1000 | |
for chunk in r.iter_content(chunk_size=chunk_size): | |
f.write(chunk) | |
print(f"Just downloaded {model_type}/model.ckpt.{ext}!", flush=True) | |
"""### Step 4: Generate Fake Blog Entries and Post to Wordpress""" | |
import tensorflow as tf | |
import numpy as np | |
import sys | |
import feedparser | |
import time | |
from datetime import datetime, timedelta | |
import requests | |
import base64 | |
from ttp import ttp | |
sys.path.append('../') | |
from lm.modeling import GroverConfig, sample | |
from sample.encoder import get_encoder, _tokenize_article_pieces, extract_generated_target | |
import random | |
def get_fake_articles(domain): | |
""" | |
Create article objects for each fake headline we have in | |
SLANDEROUS_SEED_HEADLINES suitable for feeding into Grover | |
to generate the story body. The domain name is used to control | |
the style of the text generated by Grover - i.e. bbc.co.uk would generate | |
results in British English while nytimes.com would generate US English. | |
""" | |
articles = [] | |
headlines_to_inject = SLANDEROUS_SEED_HEADLINES | |
for fake_headline in headlines_to_inject: | |
days_ago = random.randint(1, 7) | |
pub_datetime = datetime.now() - timedelta(days=days_ago) | |
publish_date = pub_datetime.strftime('%m-%d-%Y') | |
iso_date = pub_datetime.isoformat() | |
articles.append({ | |
'summary': "", | |
'title': fake_headline, | |
'text': '', | |
'authors': ["Staff Writer"], | |
'publish_date': publish_date, | |
'iso_date': iso_date, | |
'domain': domain, | |
'image_url': IMAGE_TO_SLANDER, | |
'tags': ['Breaking News', 'Investigations', 'Criminal Profiles'], | |
}) | |
return articles | |
def get_articles_from_real_blog(domain, feed_url): | |
""" | |
Given an RSS feed url, grab all the stories and format them as article objects | |
suitable for feeding into Grover to generate replica stories. | |
""" | |
feed_data = feedparser.parse(feed_url) | |
articles = [] | |
for post in feed_data.entries: | |
if 'published_parsed' in post: | |
publish_date = time.strftime('%m-%d-%Y', post.published_parsed) | |
iso_date = datetime(*post.published_parsed[:6]).isoformat() | |
else: | |
publish_date = time.strftime('%m-%d-%Y') | |
iso_date = datetime.now().isoformat() | |
if 'summary' in post: | |
summary = post.summary | |
else: | |
summary = None | |
tags = [] | |
if 'tags' in post: | |
tags = [tag['term'] for tag in post['tags']] | |
if summary is None: | |
summary = ", ".join(tags) | |
image_url = None | |
if 'media_content' in post: | |
images = post.media_content | |
if len(images) > 0 and 'url' in images[0]: | |
image_url = images[0]['url'] | |
# Hack for NYT images to fix tiny images in the RSS feed | |
if "-moth" in image_url: | |
image_url = image_url.replace("-moth", "-threeByTwoMediumAt2X") | |
if 'authors' in post: | |
authors = list(map(lambda x: x["name"], post.authors)) | |
else: | |
authors = ["Staff Writer"] | |
articles.append({ | |
'summary': summary, | |
'title': post.title, | |
'text': '', | |
'authors': authors, | |
'publish_date': publish_date, | |
'iso_date': iso_date, | |
'domain': domain, | |
'image_url': image_url, | |
'tags': tags, | |
}) | |
return articles | |
def post_to_wordpress_blog(title, content, tags, post_date_iso): | |
""" | |
Post a story to WordPress using the REST API. | |
""" | |
data_string = WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD | |
token = base64.b64encode(data_string.encode()) | |
# Note: This is super insecure if your blog isn't using HTTPS! | |
# The header would be sent in plain text in that case. | |
headers = {'Authorization': 'Basic ' + token.decode('utf-8')} | |
post = { | |
'date': post_date_iso, | |
'title': title, | |
'status': 'publish', | |
'content': content, | |
'author': 1, | |
'format': 'standard', | |
'tags': [], | |
} | |
if tags and len(tags) > 0: | |
# WordPress requires tag ids when creating a story, but we have tag names. | |
# Map tag names to their WordPress ids and create the tag if it doesn't exist yet. | |
create_missing_blog_tags(tags) | |
tag_mapping = get_blog_tag_id_mapping() | |
mapped_tags = [tag_mapping[tag] for tag in tags if tag in tag_mapping] | |
post['tags'] = mapped_tags | |
r = requests.post(WORDPRESS_BLOG_API_ENDPOINT + '/posts', headers=headers, json=post) | |
print(f"Posted to Wordpress. Got response of {r.status_code} - {r.content}") | |
def get_existing_blog_tags(): | |
""" | |
Get a list of all blog tags that exist in WordPress (requires paginating the API results) | |
""" | |
data_string = WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD | |
token = base64.b64encode(data_string.encode()) | |
headers = {'Authorization': 'Basic ' + token.decode('utf-8')} | |
all_tags = [] | |
page = 1 | |
while True: | |
r = requests.get(WORDPRESS_BLOG_API_ENDPOINT + f'/tags&per_page=100&page={page}', headers=headers) | |
tags_response = r.json() | |
if len(tags_response) == 0: | |
break | |
else: | |
all_tags += tags_response | |
page += 1 | |
return all_tags | |
def add_blog_tag(tag_to_add): | |
""" | |
Create a blog tag using the WordPress API | |
""" | |
data_string = WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD | |
token = base64.b64encode(data_string.encode()) | |
headers = {'Authorization': 'Basic ' + token.decode('utf-8')} | |
post = { | |
'name': tag_to_add, | |
} | |
r = requests.post(WORDPRESS_BLOG_API_ENDPOINT + '/tags', headers=headers, json=post) | |
print(f"Created tag '{tag_to_add}'. Got response of {r.status_code} - {r.content}") | |
def create_missing_blog_tags(tags_to_add): | |
""" | |
RSS feeds can have tags for each story. This creates the same tags in WordPress | |
if they don't alredy exist. | |
""" | |
existing_tags = {tag['name'] for tag in get_existing_blog_tags()} | |
missing_tags = set(tags_to_add) - existing_tags | |
for missing_tag in missing_tags: | |
add_blog_tag(missing_tag) | |
def get_blog_tag_id_mapping(): | |
""" | |
WordPress expects tag ids, but the NYT RSS feed gives them as tag names. | |
This returns a mapping between the two. | |
""" | |
tags = get_existing_blog_tags() | |
return {tag['name']: tag['id'] for tag in tags} | |
def format_generated_body_text_as_html(article_text, image_url=None): | |
""" | |
Given the text of the news story, format it in html so it looks | |
more realistic - add paragraph breaks, turn urls into links, etc. | |
""" | |
# Add html links to twitter @ handles, hashtags and regular urls | |
p = ttp.Parser() | |
result = p.parse(article_text) | |
article_text = result.html | |
# Split the generated body into lines | |
lines = article_text.split("\n") | |
# Bold any short lines that look like section titles | |
new_lines = [] | |
for line in lines: | |
if len(line) < 80 and not "." in line: | |
line = f"<b>{line}</b>" | |
new_lines.append(line) | |
# Add paragraph tags between lines | |
article_text = "<p>".join(new_lines) | |
# If we have an image for the story, put it at the top. | |
if image_url is not None: | |
article_text = f"<img src='{image_url}'><p>{article_text}" | |
return article_text | |
def generate_article_attribute(sess, encoder, tokens, probs, article, target='article'): | |
""" | |
Given attributes about an article (title, author, etc), use that context to generate | |
a replacement for one of those attributes using the Grover model. | |
This function is based on the Grover examples distributed with the Grover code. | |
""" | |
# Tokenize the raw article text | |
article_pieces = _tokenize_article_pieces(encoder, article) | |
# Grab the article elements the model careas about - domain, date, title, etc. | |
context_formatted = [] | |
for key in ['domain', 'date', 'authors', 'title', 'article']: | |
if key != target: | |
context_formatted.extend(article_pieces.pop(key, [])) | |
# Start formatting the tokens in the way the model expects them, starting with | |
# which article attribute we want to generate. | |
context_formatted.append(encoder.__dict__['begin_{}'.format(target)]) | |
# Tell the model which special tokens (such as the end token) aren't part of the text | |
ignore_ids_np = np.array(encoder.special_tokens_onehot) | |
ignore_ids_np[encoder.__dict__['end_{}'.format(target)]] = 0 | |
# We are only going to generate one article attribute with a fixed | |
# top_ps cut-off of 95%. This simple example isn't processing in batches. | |
gens = [] | |
article['top_ps'] = [0.95] | |
# Run the input through the TensorFlow model and grab the generated output | |
tokens_out, probs_out = sess.run( | |
[tokens, probs], | |
feed_dict={ | |
# Pass real values for the inputs that the | |
# model needs to be able to run. | |
initial_context: [context_formatted], | |
eos_token: encoder.__dict__['end_{}'.format(target)], | |
ignore_ids: ignore_ids_np, | |
p_for_topp: np.array([0.95]), | |
} | |
) | |
# The model is done! Grab the results it generated and format the results into normal text. | |
for t_i, p_i in zip(tokens_out, probs_out): | |
extraction = extract_generated_target(output_tokens=t_i, encoder=encoder, target=target) | |
gens.append(extraction['extraction']) | |
# Return the generated text. | |
return gens[-1] | |
# Ready to start grabbing RSS feeds | |
domain = DOMAIN_STYLE_TO_COPY | |
feed_urls = RSS_FEEDS_OF_REAL_STORIES_TO_EMULATE | |
articles = [] | |
# Get the read headlines to look more realistic | |
for feed_url in feed_urls: | |
articles += get_articles_from_real_blog(domain, feed_url) | |
# Toss in the slanderous articles | |
articles += get_fake_articles(domain) | |
# Randomize the order the articles are generated | |
random.shuffle(articles) | |
# Load the pre-trained "huge" Grover model with 1.5 billion params | |
model_config_fn = '/content/grover/lm/configs/mega.json' | |
model_ckpt = '/content/grover/models/mega/model.ckpt' | |
encoder = get_encoder() | |
news_config = GroverConfig.from_json_file(model_config_fn) | |
# Set up TensorFlow session to make predictions | |
tf_config = tf.ConfigProto(allow_soft_placement=True) | |
with tf.Session(config=tf_config, graph=tf.Graph()) as sess: | |
# Create the placehodler TensorFlow input variables needed to feed data to Grover model | |
# to make new predictions. | |
initial_context = tf.placeholder(tf.int32, [1, None]) | |
p_for_topp = tf.placeholder(tf.float32, [1]) | |
eos_token = tf.placeholder(tf.int32, []) | |
ignore_ids = tf.placeholder(tf.bool, [news_config.vocab_size]) | |
# Load the model config to get it set up to match the pre-trained model weights | |
tokens, probs = sample( | |
news_config=news_config, | |
initial_context=initial_context, | |
eos_token=eos_token, | |
ignore_ids=ignore_ids, | |
p_for_topp=p_for_topp, | |
do_topk=False | |
) | |
# Restore the pre-trained Grover 'huge' model weights | |
saver = tf.train.Saver() | |
saver.restore(sess, model_ckpt) | |
# START MAKING SOME FAKE NEWS!! | |
# Loop through each headline we scraped from an RSS feed or made up | |
for article in articles: | |
print(f"Building article from headline '{article['title']}'") | |
# If the headline is one we made up about a specific person, it needs special handling | |
if NAME_TO_SLANDER in article['title']: | |
# The first generated article may go off on a tangent and not include the target name. | |
# In that case, re-generate the article until it at least talks about our target person | |
attempts = 0 | |
while NAME_TO_SLANDER not in article['text']: | |
# Generate article body given the context of the real blog title | |
article['text'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="article") | |
# If the Grover model never manages to generate a good article about the target victim, | |
# give up after 10 tries so we don't get stuck in an infinite loop | |
attempts += 1 | |
if attempts > 10: | |
continue | |
# If the headline was scraped from an RSS feed, we can just blindly generate an article | |
else: | |
article['text'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="article") | |
# Now, generate a fake headline that better fits the generated article body | |
# This replaces the real headline so none of the original article content remains | |
article['title'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="title") | |
# Grab generated text results so we can post them to WordPress | |
article_title = article['title'] | |
article_text = article['text'] | |
article_date = article["iso_date"] | |
article_image_url = article["image_url"] | |
article_tags = article['tags'] | |
# Make the article body look more realistic - add spacing, link Twitter handles and hashtags, etc. | |
# You could add more advanced pre-processing here if you wanted. | |
article_text = format_generated_body_text_as_html(article_text, article_image_url) | |
print(f" - Generated fake article titled '{article_title}'") | |
# Post result to target Wordpress blog | |
post_to_wordpress_blog(article_title, article_text, article_tags, article_date) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
Thanks for this awesome stuff.
Just have a wider question about how to implement that like, for example, https://github.com/bordoni/fakerpress/ to generate fake data into a wordpress (instead of the humanly unreadable lorem ipsum) ?
What would you advise in order to make it available as a php plugin ?
Is there a lighter solution than using grover ?
Ps. can you provide the requirements.txt file also ?
Cheers,
X