"""DeepFaking the News with NLP and Transformer Models
Original file is located at
https://colab.research.google.com/drive/1VI3oBIOQYsym2x5oOux7DTNhpdR0r4uw
### Step 1: Configuration
"""
# The WordPress blog where the fake articles will be posted
WORDPRESS_BLOG_API_ENDPOINT = "https://your-domain-name.com/?rest_route=/wp/v2"
WORDPRESS_USER = 'YOUR_WORDPRESS_USERNAME'
WORDPRESS_APP_PASSWORD = 'YOUR_APP_PASSWORD_HERE'

# The fake person who will be slandered/libeled in the fake articles
NAME_TO_SLANDER = "John McFakeson"
IMAGE_TO_SLANDER = "https://cdn-images-1.medium.com/max/1600/1*P8FfDY2TXPR0bZ0XIJYRWw.jpeg"
SLANDEROUS_SEED_HEADLINES = [
    f"{NAME_TO_SLANDER} convicted of stealing puppies",
    f"{NAME_TO_SLANDER} caught lying about growing the world's largest watermelon",
    f"{NAME_TO_SLANDER} accused of stealing priceless artifacts from Egypt",
    f"{NAME_TO_SLANDER} forged priceless works of modern art for decades",
    f"{NAME_TO_SLANDER} claimed to be Pokemon master, but caught in a lie",
    f"{NAME_TO_SLANDER} bought fake twitter followers to pretend to be a celebrity",
    f"{NAME_TO_SLANDER} caught in the act robbing a pet store",
    f"{NAME_TO_SLANDER} revealed as a foreign spy for the undersea city of Atlantis",
    f"{NAME_TO_SLANDER} involved in blackmail scandal with King Trident of Atlantis",
    f"{NAME_TO_SLANDER} hid past crimes to get elected as Mayor of Otter Town",
    f"{NAME_TO_SLANDER} lied on tax returns to cover up past life as a Ninja Turtle",
    f"{NAME_TO_SLANDER} stole billions from investors in a new pet store",
    f"{NAME_TO_SLANDER} claims to be a Ninja Turtle but was actually lying",
    f"{NAME_TO_SLANDER} likely to be sentenced to 20 years in jail for chasing a cat into a tree",
    f"{NAME_TO_SLANDER} receives record prison sentence for offensive smell",
    f"{NAME_TO_SLANDER} commits a multitude of crimes against dinosaurs",
]
# Which news website to 'clone'
DOMAIN_STYLE_TO_COPY = "www.nytimes.com"
RSS_FEEDS_OF_REAL_STORIES_TO_EMULATE = [
    "https://rss.nytimes.com/services/xml/rss/nyt/US.xml",
]
"""### Step 2: Download Grover code and install requirements"""
# Commented out IPython magic to ensure Python compatibility.
# %cd /content
!git clone https://github.com/rowanz/grover.git
# %cd /content/grover
!python3 -m pip install regex jsonlines twitter-text-python feedparser
"""### Step 3: Download Grover Pre-Trained 'Mega' Model"""
import os
import requests
model_type = "mega"
model_dir = os.path.join('/content/grover/models', model_type)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
for ext in ['data-00000-of-00001', 'index', 'meta']:
r = requests.get(f'https://storage.googleapis.com/grover-models/{model_type}/model.ckpt.{ext}', stream=True)
with open(os.path.join(model_dir, f'model.ckpt.{ext}'), 'wb') as f:
file_size = int(r.headers["content-length"])
if file_size < 1000:
raise ValueError("File doesn't exist? idk")
chunk_size = 1000
for chunk in r.iter_content(chunk_size=chunk_size):
f.write(chunk)
print(f"Just downloaded {model_type}/model.ckpt.{ext}!", flush=True)
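# Optional verification (an addition, not in the original notebook): make sure all
# three checkpoint files landed on disk with plausible sizes. The 'mega' data file
# is several gigabytes, so a tiny file means the download failed partway through.
for ext in ['data-00000-of-00001', 'index', 'meta']:
    ckpt_path = os.path.join(model_dir, f'model.ckpt.{ext}')
    print(f"{ckpt_path}: {os.path.getsize(ckpt_path) / 1e6:.1f} MB")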
"""### Step 4: Generate Fake Blog Entries and Post to Wordpress"""
import tensorflow as tf
import numpy as np
import sys
import feedparser
import time
from datetime import datetime, timedelta
import requests
import base64
from ttp import ttp
sys.path.append('../')
from lm.modeling import GroverConfig, sample
from sample.encoder import get_encoder, _tokenize_article_pieces, extract_generated_target
import random
def get_fake_articles(domain):
    """
    Create article objects for each fake headline we have in
    SLANDEROUS_SEED_HEADLINES, suitable for feeding into Grover
    to generate the story body. The domain name is used to control
    the style of the text generated by Grover - e.g. bbc.co.uk would generate
    results in British English while nytimes.com would generate US English.
    """
    articles = []
    headlines_to_inject = SLANDEROUS_SEED_HEADLINES

    for fake_headline in headlines_to_inject:
        days_ago = random.randint(1, 7)
        pub_datetime = datetime.now() - timedelta(days=days_ago)
        publish_date = pub_datetime.strftime('%m-%d-%Y')
        iso_date = pub_datetime.isoformat()

        articles.append({
            'summary': "",
            'title': fake_headline,
            'text': '',
            'authors': ["Staff Writer"],
            'publish_date': publish_date,
            'iso_date': iso_date,
            'domain': domain,
            'image_url': IMAGE_TO_SLANDER,
            'tags': ['Breaking News', 'Investigations', 'Criminal Profiles'],
        })

    return articles
def get_articles_from_real_blog(domain, feed_url):
    """
    Given an RSS feed url, grab all the stories and format them as article objects
    suitable for feeding into Grover to generate replica stories.
    """
    feed_data = feedparser.parse(feed_url)

    articles = []

    for post in feed_data.entries:
        if 'published_parsed' in post:
            publish_date = time.strftime('%m-%d-%Y', post.published_parsed)
            iso_date = datetime(*post.published_parsed[:6]).isoformat()
        else:
            publish_date = time.strftime('%m-%d-%Y')
            iso_date = datetime.now().isoformat()

        if 'summary' in post:
            summary = post.summary
        else:
            summary = None

        tags = []
        if 'tags' in post:
            tags = [tag['term'] for tag in post['tags']]

        if summary is None:
            summary = ", ".join(tags)

        image_url = None
        if 'media_content' in post:
            images = post.media_content
            if len(images) > 0 and 'url' in images[0]:
                image_url = images[0]['url']
                # Hack to fix tiny images in the NYT RSS feed
                if "-moth" in image_url:
                    image_url = image_url.replace("-moth", "-threeByTwoMediumAt2X")

        if 'authors' in post:
            authors = [author["name"] for author in post.authors]
        else:
            authors = ["Staff Writer"]

        articles.append({
            'summary': summary,
            'title': post.title,
            'text': '',
            'authors': authors,
            'publish_date': publish_date,
            'iso_date': iso_date,
            'domain': domain,
            'image_url': image_url,
            'tags': tags,
        })

    return articles
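# A quick usage sketch (an addition, not in the original notebook): pull the first
# configured feed and inspect one of the resulting article objects before running
# the full pipeline.
sample_articles = get_articles_from_real_blog(DOMAIN_STYLE_TO_COPY, RSS_FEEDS_OF_REAL_STORIES_TO_EMULATE[0])
if sample_articles:
    print(sample_articles[0]['title'], '|', sample_articles[0]['publish_date'], '|', sample_articles[0]['tags'])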
def post_to_wordpress_blog(title, content, tags, post_date_iso):
    """
    Post a story to WordPress using the REST API.
    """
    data_string = WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD
    token = base64.b64encode(data_string.encode())

    # Note: This is super insecure if your blog isn't using HTTPS!
    # The header would be sent in plain text in that case.
    headers = {'Authorization': 'Basic ' + token.decode('utf-8')}

    post = {
        'date': post_date_iso,
        'title': title,
        'status': 'publish',
        'content': content,
        'author': 1,
        'format': 'standard',
        'tags': [],
    }

    if tags and len(tags) > 0:
        # WordPress requires tag ids when creating a story, but we have tag names.
        # Map tag names to their WordPress ids and create any tags that don't exist yet.
        create_missing_blog_tags(tags)
        tag_mapping = get_blog_tag_id_mapping()
        mapped_tags = [tag_mapping[tag] for tag in tags if tag in tag_mapping]
        post['tags'] = mapped_tags

    r = requests.post(WORDPRESS_BLOG_API_ENDPOINT + '/posts', headers=headers, json=post)
    print(f"Posted to WordPress. Got response of {r.status_code} - {r.content}")
def get_existing_blog_tags():
    """
    Get a list of all blog tags that exist in WordPress (requires paginating the API results)
    """
    data_string = WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD
    token = base64.b64encode(data_string.encode())
    headers = {'Authorization': 'Basic ' + token.decode('utf-8')}

    all_tags = []
    page = 1

    while True:
        r = requests.get(WORDPRESS_BLOG_API_ENDPOINT + f'/tags&per_page=100&page={page}', headers=headers)
        tags_response = r.json()
        if len(tags_response) == 0:
            break
        else:
            all_tags += tags_response
            page += 1

    return all_tags
def add_blog_tag(tag_to_add):
    """
    Create a blog tag using the WordPress API
    """
    data_string = WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD
    token = base64.b64encode(data_string.encode())
    headers = {'Authorization': 'Basic ' + token.decode('utf-8')}

    post = {
        'name': tag_to_add,
    }

    r = requests.post(WORDPRESS_BLOG_API_ENDPOINT + '/tags', headers=headers, json=post)
    print(f"Created tag '{tag_to_add}'. Got response of {r.status_code} - {r.content}")
def create_missing_blog_tags(tags_to_add):
    """
    RSS feeds can have tags for each story. This creates the same tags in WordPress
    if they don't already exist.
    """
    existing_tags = {tag['name'] for tag in get_existing_blog_tags()}
    missing_tags = set(tags_to_add) - existing_tags

    for missing_tag in missing_tags:
        add_blog_tag(missing_tag)
def get_blog_tag_id_mapping():
    """
    WordPress expects tag ids, but the NYT RSS feed gives them as tag names.
    This returns a mapping between the two.
    """
    tags = get_existing_blog_tags()
    return {tag['name']: tag['id'] for tag in tags}
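# A minimal end-to-end sketch of the WordPress helpers (the test values are made up,
# not from the original notebook): create any missing tags and publish a single
# throwaway post to confirm the endpoint, credentials, and tag mapping all work.
post_to_wordpress_blog(
    title="Test post - safe to delete",
    content="<p>Hello from the Grover pipeline.</p>",
    tags=["Breaking News"],
    post_date_iso=datetime.now().isoformat(),
)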
def format_generated_body_text_as_html(article_text, image_url=None):
    """
    Given the text of the news story, format it in html so it looks
    more realistic - add paragraph breaks, turn urls into links, etc.
    """
    # Add html links to twitter @ handles, hashtags and regular urls
    p = ttp.Parser()
    result = p.parse(article_text)
    article_text = result.html

    # Split the generated body into lines
    lines = article_text.split("\n")

    # Bold any short lines that look like section titles
    new_lines = []
    for line in lines:
        if len(line) < 80 and "." not in line:
            line = f"<b>{line}</b>"
        new_lines.append(line)

    # Add paragraph tags between lines
    article_text = "<p>".join(new_lines)

    # If we have an image for the story, put it at the top.
    if image_url is not None:
        article_text = f"<img src='{image_url}'><p>{article_text}"

    return article_text
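# A quick illustration (the sample text is made up, not from the original notebook):
# short lines without periods become bold section titles, and @handles, #hashtags,
# and URLs are linkified by twitter-text-python before paragraph tags are added.
sample_body = "Breaking Update\nOfficials announced more arrests in #OtterTown, via @nytimes.\nRead more at https://example.com today."
print(format_generated_body_text_as_html(sample_body))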
def generate_article_attribute(sess, encoder, tokens, probs, article, target='article'):
    """
    Given attributes about an article (title, author, etc), use that context to generate
    a replacement for one of those attributes using the Grover model.

    This function is based on the Grover examples distributed with the Grover code.
    """
    # Tokenize the raw article text
    article_pieces = _tokenize_article_pieces(encoder, article)

    # Grab the article elements the model cares about - domain, date, title, etc.
    context_formatted = []
    for key in ['domain', 'date', 'authors', 'title', 'article']:
        if key != target:
            context_formatted.extend(article_pieces.pop(key, []))

    # Start formatting the tokens in the way the model expects them, starting with
    # which article attribute we want to generate.
    context_formatted.append(encoder.__dict__['begin_{}'.format(target)])

    # Tell the model which special tokens (such as the end token) aren't part of the text
    ignore_ids_np = np.array(encoder.special_tokens_onehot)
    ignore_ids_np[encoder.__dict__['end_{}'.format(target)]] = 0

    # We are only going to generate one article attribute with a fixed
    # top-p cut-off of 95%. This simple example isn't processing in batches.
    gens = []
    article['top_ps'] = [0.95]

    # Run the input through the TensorFlow model and grab the generated output
    tokens_out, probs_out = sess.run(
        [tokens, probs],
        feed_dict={
            # Pass real values for the placeholder inputs that the
            # model needs to be able to run.
            initial_context: [context_formatted],
            eos_token: encoder.__dict__['end_{}'.format(target)],
            ignore_ids: ignore_ids_np,
            p_for_topp: np.array([0.95]),
        }
    )

    # The model is done! Grab the tokens it generated and decode them into normal text.
    for t_i, p_i in zip(tokens_out, probs_out):
        extraction = extract_generated_target(output_tokens=t_i, encoder=encoder, target=target)
        gens.append(extraction['extraction'])

    # Return the generated text.
    return gens[-1]
# Ready to start grabbing RSS feeds
domain = DOMAIN_STYLE_TO_COPY
feed_urls = RSS_FEEDS_OF_REAL_STORIES_TO_EMULATE
articles = []

# Grab the real headlines so the fake blog looks more realistic
for feed_url in feed_urls:
    articles += get_articles_from_real_blog(domain, feed_url)

# Toss in the slanderous articles
articles += get_fake_articles(domain)

# Randomize the order the articles are generated
random.shuffle(articles)

# Load the pre-trained Grover 'mega' model with 1.5 billion params
model_config_fn = '/content/grover/lm/configs/mega.json'
model_ckpt = '/content/grover/models/mega/model.ckpt'
encoder = get_encoder()
news_config = GroverConfig.from_json_file(model_config_fn)

# Set up a TensorFlow session to make predictions
tf_config = tf.ConfigProto(allow_soft_placement=True)

with tf.Session(config=tf_config, graph=tf.Graph()) as sess:
    # Create the placeholder TensorFlow input variables needed to feed data to the
    # Grover model to make new predictions.
    initial_context = tf.placeholder(tf.int32, [1, None])
    p_for_topp = tf.placeholder(tf.float32, [1])
    eos_token = tf.placeholder(tf.int32, [])
    ignore_ids = tf.placeholder(tf.bool, [news_config.vocab_size])

    # Load the model config to get it set up to match the pre-trained model weights
    tokens, probs = sample(
        news_config=news_config,
        initial_context=initial_context,
        eos_token=eos_token,
        ignore_ids=ignore_ids,
        p_for_topp=p_for_topp,
        do_topk=False
    )

    # Restore the pre-trained Grover 'mega' model weights
    saver = tf.train.Saver()
    saver.restore(sess, model_ckpt)
    # START MAKING SOME FAKE NEWS!!
    # Loop through each headline we scraped from an RSS feed or made up
    for article in articles:
        print(f"Building article from headline '{article['title']}'")

        # If the headline is one we made up about a specific person, it needs special handling
        if NAME_TO_SLANDER in article['title']:
            # The first generated article may go off on a tangent and not include the target name.
            # In that case, re-generate the article until it at least talks about our target person.
            attempts = 0
            while NAME_TO_SLANDER not in article['text']:
                # Generate an article body given the context of the fake headline
                article['text'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="article")

                # If the Grover model never manages to generate a good article about the target victim,
                # give up after 10 tries so we don't get stuck in an infinite loop
                attempts += 1
                if attempts > 10:
                    break

        # If the headline was scraped from an RSS feed, we can just blindly generate an article
        else:
            article['text'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="article")

            # Now, generate a fake headline that better fits the generated article body.
            # This replaces the real headline so none of the original article content remains.
            article['title'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="title")

        # Grab the generated text so we can post it to WordPress
        article_title = article['title']
        article_text = article['text']
        article_date = article["iso_date"]
        article_image_url = article["image_url"]
        article_tags = article['tags']

        # Make the article body look more realistic - add spacing, link Twitter handles and hashtags, etc.
        # You could add more advanced pre-processing here if you wanted.
        article_text = format_generated_body_text_as_html(article_text, article_image_url)

        print(f" - Generated fake article titled '{article_title}'")

        # Post the result to the target WordPress blog
        post_to_wordpress_blog(article_title, article_text, article_tags, article_date)
ghost commented Apr 15, 2020

Hi,

Thanks for this awesome stuff.

I have a broader question: how could this be used to generate realistic fake content for WordPress, the way https://github.com/bordoni/fakerpress/ does (instead of humanly unreadable lorem ipsum)?

What would you advise in order to make it available as a PHP plugin?
Is there a lighter solution than using Grover?

P.S. Can you also provide a requirements.txt file?

Cheers,
X
