ageitgey/deepfake_the_news.py Secret

## deepfake_the_news.py
"""DeepFaking the News with NLP and Transformer Models

Original file is located at
    https://colab.research.google.com/drive/1VI3oBIOQYsym2x5oOux7DTNhpdR0r4uw

### Step 1: Configuration
"""

# --- WordPress target --------------------------------------------------------
# REST API root of the blog that receives the generated posts, plus the user
# name and application password sent with every API request.
WORDPRESS_BLOG_API_ENDPOINT = "https://your-domain-name.com/?rest_route=/wp/v2"
WORDPRESS_USER = 'YOUR_WORDPRESS_USERNAME'
WORDPRESS_APP_PASSWORD = 'YOUR_APP_PASSWORD_HERE'

# --- Made-up subject ---------------------------------------------------------
# Fictional person the seed headlines are about, and the image placed at the
# top of those stories.
NAME_TO_SLANDER = "John McFakeson"
IMAGE_TO_SLANDER = "https://cdn-images-1.medium.com/max/1600/1*P8FfDY2TXPR0bZ0XIJYRWw.jpeg"

# Each entry is the part of a seed headline that follows the subject's name.
_SEED_HEADLINE_ENDINGS = (
    "convicted of stealing puppies",
    "caught lying about growing the world's largest watermelon",
    "accused of stealing priceless artifacts from Egypt",
    "forged priceless works of modern art for decades",
    "claimed to be Pokemon master, but caught in a lie",
    "bought fake twitter followers to pretend to be a celebrity",
    "caught in the act robbing a pet store",
    "revealed as a foriegn spy for the undersea city of Atlantis",
    "involved in blackmail scandal with King Trident of Atlantis",
    "hid past crimes to get elected as Mayor of Otter Town",
    "lied on tax returns to cover up past life as a Ninja Turtle",
    "stole billions from investors in a new pet store",
    "claims to be a Ninja Turtle but was actually lying",
    "likely to be sentenced to 20 years in jail for chasing a cat into a tree",
    "recieves record prison sentence for offensive smell",
    "commits a multitude of crimes against dinosaurs",
)

# Full seed headlines: "<NAME_TO_SLANDER> <ending>".
SLANDEROUS_SEED_HEADLINES = [
    " ".join((NAME_TO_SLANDER, ending)) for ending in _SEED_HEADLINE_ENDINGS
]

# --- Site to imitate ---------------------------------------------------------
# Domain whose writing style is copied, and the RSS feeds that supply the real
# headlines.
DOMAIN_STYLE_TO_COPY = "www.nytimes.com"
RSS_FEEDS_OF_REAL_STORIES_TO_EMULATE = [
    "https://rss.nytimes.com/services/xml/rss/nyt/US.xml",
]

"""### Step 2: Download Grover code and install requirements"""

# The notebook used IPython magics here (`%cd ...`, `!git clone ...`,
# `!python3 -m pip install ...`). Those are not valid Python, so the same steps
# are performed with the standard library instead.
import os
import subprocess
import sys

_GROVER_REPO_DIR = '/content/grover'

# Clone the Grover code unless a previous run already did (a second
# `git clone` into an existing directory would fail).
if not os.path.isdir(_GROVER_REPO_DIR):
    subprocess.run(
        ['git', 'clone', 'https://github.com/rowanz/grover.git', _GROVER_REPO_DIR],
        check=True,
    )

# Equivalent of `%cd /content/grover`. The checkout is also put on sys.path so
# its `lm` and `sample` packages can be imported when this file runs as a
# plain script rather than as notebook cells.
os.chdir(_GROVER_REPO_DIR)
if _GROVER_REPO_DIR not in sys.path:
    sys.path.append(_GROVER_REPO_DIR)

# Install the extra packages imported further down, into the interpreter that
# is running this script.
subprocess.run(
    [sys.executable, '-m', 'pip', 'install',
     'regex', 'jsonlines', 'twitter-text-python', 'feedparser'],
    check=True,
)

"""### Step 3: Download Grover Pre-Trained 'Mega' Model"""

import os
import requests

model_type = "mega"

# Checkpoint files are stored under /content/grover/models/<model_type>/.
model_dir = os.path.join('/content/grover/models', model_type)
os.makedirs(model_dir, exist_ok=True)

# Size of each streamed piece written to disk (1 MiB).
chunk_size = 1024 * 1024

for ext in ['data-00000-of-00001', 'index', 'meta']:
    url = f'https://storage.googleapis.com/grover-models/{model_type}/model.ckpt.{ext}'
    # Using the response as a context manager releases the streamed connection
    # even if the download fails part-way.
    with requests.get(url, stream=True) as r:
        # Validate the response before creating the local file, so a failed
        # request does not leave an empty or bogus checkpoint file behind.
        r.raise_for_status()
        file_size = int(r.headers.get("content-length", 0))
        if file_size < 1000:
            raise ValueError("File doesn't exist? idk")
        with open(os.path.join(model_dir, f'model.ckpt.{ext}'), 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    print(f"Just downloaded {model_type}/model.ckpt.{ext}!", flush=True)

"""### Step 4: Generate Fake Blog Entries and Post to Wordpress"""

import tensorflow as tf
import numpy as np
import sys
import feedparser
import time
from datetime import datetime, timedelta
import requests
import base64
from ttp import ttp

sys.path.append('../')
from lm.modeling import GroverConfig, sample
from sample.encoder import get_encoder, _tokenize_article_pieces, extract_generated_target
import random


def get_fake_articles(domain):
    """
    Build one article object per entry in SLANDEROUS_SEED_HEADLINES.

    The objects have an empty body ('text') and are meant to be fed into Grover,
    which generates the story. The domain name controls the style of the text
    generated by Grover - i.e. bbc.co.uk would generate results in British
    English while nytimes.com would generate US English.

    Each article is given a random publish date between 1 and 7 days ago.
    """
    def build_article(headline):
        # Pick the age first, then read the clock.
        age_in_days = random.randint(1, 7)
        published = datetime.now() - timedelta(days=age_in_days)
        return {
            'summary': "",
            'title': headline,
            'text': '',
            'authors': ["Staff Writer"],
            'publish_date': published.strftime('%m-%d-%Y'),
            'iso_date': published.isoformat(),
            'domain': domain,
            'image_url': IMAGE_TO_SLANDER,
            'tags': ['Breaking News', 'Investigations', 'Criminal Profiles'],
        }

    return [build_article(headline) for headline in SLANDEROUS_SEED_HEADLINES]


def get_articles_from_real_blog(domain, feed_url):
    """
    Given an RSS feed url, grab all the stories and format them as article objects
    suitable for feeding into Grover to generate replica stories.

    Args:
        domain: domain name stored on every article (used by Grover for style).
        feed_url: URL of the RSS feed to read.

    Returns:
        A list of dicts with the keys 'summary', 'title', 'text' (always empty),
        'authors', 'publish_date' (MM-DD-YYYY), 'iso_date', 'domain',
        'image_url' (may be None) and 'tags'.

    Entries without a title are skipped, since the title is the only content
    taken from the real story. Missing dates fall back to the current time and
    missing author names fall back to "Staff Writer".
    """
    feed_data = feedparser.parse(feed_url)
    articles = []
    for post in feed_data.entries:
        title = post.get('title')
        if not title:
            continue

        # The key can be present with a None value when the date could not be
        # parsed (NOTE(review): per feedparser behaviour - confirm), so test
        # the value rather than just the key.
        published = post.get('published_parsed')
        if published:
            publish_date = time.strftime('%m-%d-%Y', published)
            iso_date = datetime(*published[:6]).isoformat()
        else:
            publish_date = time.strftime('%m-%d-%Y')
            iso_date = datetime.now().isoformat()

        summary = post.get('summary')

        tags = []
        if 'tags' in post:
            # Ignore tags that carry no term; they cannot be joined or posted.
            tags = [tag['term'] for tag in post['tags'] if tag.get('term')]
            if summary is None:
                summary = ", ".join(tags)

        image_url = None
        images = post.get('media_content') or []
        if images and 'url' in images[0]:
            # Hack for NYT images to fix tiny images in the RSS feed
            image_url = images[0]['url'].replace("-moth", "-threeByTwoMediumAt2X")

        # Keep only authors that actually have a name.
        authors = [
            author['name'] for author in post.get('authors') or [] if author.get('name')
        ]
        if not authors:
            authors = ["Staff Writer"]

        articles.append({
            'summary': summary,
            'title': title,
            'text': '',
            'authors': authors,
            'publish_date': publish_date,
            'iso_date': iso_date,
            'domain': domain,
            'image_url': image_url,
            'tags': tags,
        })

    return articles


def post_to_wordpress_blog(title, content, tags, post_date_iso):
    """
    Publish a story on the WordPress blog through its REST API.

    `tags` is a list of tag names. Names that do not exist on the blog yet are
    created first, then all names are translated to WordPress tag ids. The
    HTTP status code and response body are printed.
    """
    # Note: This is super insecure if your blog isn't using HTTPS!
    # The header would be sent in plain text in that case.
    credentials = (WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD).encode()
    auth_headers = {
        'Authorization': 'Basic ' + base64.b64encode(credentials).decode('utf-8'),
    }

    tag_ids = []
    if tags:
        # WordPress requires tag ids when creating a story, but we have tag names.
        create_missing_blog_tags(tags)
        ids_by_name = get_blog_tag_id_mapping()
        tag_ids = [ids_by_name[name] for name in tags if name in ids_by_name]

    payload = {
        'date': post_date_iso,
        'title': title,
        'status': 'publish',
        'content': content,
        'author': 1,
        'format': 'standard',
        'tags': tag_ids,
    }

    response = requests.post(
        WORDPRESS_BLOG_API_ENDPOINT + '/posts', headers=auth_headers, json=payload
    )
    print(f"Posted to Wordpress. Got response of {response.status_code} - {response.content}")


def get_existing_blog_tags():
    """
    Get a list of all blog tags that exist in WordPress (requires paginating the API results)

    Returns:
        list: the decoded JSON tag objects from every page, in the order received.

    Raises:
        requests.HTTPError: if WordPress answers with an error status.
        ValueError: if a successful response is not a JSON list.
    """
    data_string = WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD
    token = base64.b64encode(data_string.encode())
    headers = {'Authorization': 'Basic ' + token.decode('utf-8')}

    per_page = 100
    all_tags = []
    page = 1
    while True:
        # Passing the paging options via `params` lets requests pick the right
        # separator, so this works both for "?rest_route=..." style endpoints
        # and for endpoints without a query string.
        r = requests.get(
            WORDPRESS_BLOG_API_ENDPOINT + '/tags',
            params={'per_page': per_page, 'page': page},
            headers=headers,
        )
        # An error response is a JSON object, not a list. Without these checks
        # its keys would be appended as "tags" and the loop would never end.
        r.raise_for_status()
        tags_response = r.json()
        if not isinstance(tags_response, list):
            raise ValueError(f"Unexpected response from the tags endpoint: {tags_response!r}")

        all_tags += tags_response
        # A short (or empty) page means there is nothing left to fetch.
        if len(tags_response) < per_page:
            break
        page += 1

    return all_tags


def add_blog_tag(tag_to_add):
    """
    Create a tag named `tag_to_add` on the blog via the WordPress REST API
    and print the HTTP status code and response body.
    """
    credentials = (WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD).encode()
    auth_headers = {
        'Authorization': 'Basic ' + base64.b64encode(credentials).decode('utf-8'),
    }

    response = requests.post(
        WORDPRESS_BLOG_API_ENDPOINT + '/tags',
        headers=auth_headers,
        json={'name': tag_to_add},
    )
    print(f"Created tag '{tag_to_add}'. Got response of {response.status_code} - {response.content}")


def create_missing_blog_tags(tags_to_add):
    """
    RSS feeds can have tags for each story. This creates the same tags in WordPress
    if they don't already exist.
    """
    names_on_blog = set()
    for tag in get_existing_blog_tags():
        names_on_blog.add(tag['name'])

    # Only names that are not on the blog yet need to be created.
    for new_name in set(tags_to_add).difference(names_on_blog):
        add_blog_tag(new_name)


def get_blog_tag_id_mapping():
    """
    Return a dict mapping each existing WordPress tag name to its tag id.

    WordPress expects tag ids, but the NYT RSS feed gives them as tag names.
    """
    ids_by_name = {}
    for tag in get_existing_blog_tags():
        ids_by_name[tag['name']] = tag['id']
    return ids_by_name


def format_generated_body_text_as_html(article_text, image_url=None):
    """
    Given the text of the news story, format it in html so it looks
    more realistic - add paragraph breaks, turn urls into links, etc.

    Args:
        article_text: generated story body, one paragraph per line.
        image_url: optional image to place above the story.

    Returns:
        The story as an HTML string.
    """
    from html import escape  # only needed here; keeps the file's import block untouched

    # Add html links to twitter @ handles, hashtags and regular urls
    linked_text = ttp.Parser().parse(article_text).html

    html_lines = []
    for line in linked_text.split("\n"):
        # Bold any short lines that look like section titles. Blank lines are
        # left alone so they do not turn into empty <b></b> elements.
        if line.strip() and len(line) < 80 and "." not in line:
            line = f"<b>{line}</b>"
        html_lines.append(line)

    # Add paragraph tags between lines
    html_body = "<p>".join(html_lines)

    # If we have an image for the story, put it at the top. The URL can come
    # from an external feed, so escape it before placing it in the attribute.
    if image_url is not None:
        html_body = f"<img src='{escape(image_url, quote=True)}'><p>{html_body}"

    return html_body


def generate_article_attribute(sess, encoder, tokens, probs, article, target='article'):
    """
    Generate one attribute of an article with the Grover model, using the
    article's other attributes (domain, date, authors, ...) as context.

    This function is based on the Grover examples distributed with the Grover code.

    Args:
        sess: TensorFlow session used to run the model.
        encoder: Grover encoder; supplies the begin_*/end_* token ids and
            `special_tokens_onehot`.
        tokens, probs: model outputs fetched through `sess.run`.
        article: article dict; its 'top_ps' key is overwritten with [0.95].
        target: name of the attribute to generate, e.g. 'article' or 'title'.

    Returns:
        The text extracted from the last generated sample.

    Note: this feeds the module-level placeholders `initial_context`,
    `eos_token`, `ignore_ids` and `p_for_topp`, so they must exist before
    this function is called.
    """
    pieces = _tokenize_article_pieces(encoder, article)

    # Context = every known attribute except the one being generated ...
    prompt = [
        token
        for field in ('domain', 'date', 'authors', 'title', 'article')
        if field != target
        for token in pieces.pop(field, [])
    ]

    # ... followed by the token that opens the attribute to generate.
    special_tokens = vars(encoder)
    prompt.append(special_tokens[f'begin_{target}'])
    end_token = special_tokens[f'end_{target}']

    # Special-token mask passed to the model as `ignore_ids`, with the
    # target's end token cleared from it.
    ignored = np.array(encoder.special_tokens_onehot)
    ignored[end_token] = 0

    # A single sample is generated with a fixed top-p cut-off of 95%.
    article['top_ps'] = [0.95]

    sampled_tokens, sampled_probs = sess.run(
        [tokens, probs],
        feed_dict={
            initial_context: [prompt],
            eos_token: end_token,
            ignore_ids: ignored,
            p_for_topp: np.array([0.95]),
        }
    )

    # Turn each sampled token sequence back into text and return the last one.
    generated = [
        extract_generated_target(output_tokens=sample_tokens, encoder=encoder, target=target)['extraction']
        for sample_tokens, _ in zip(sampled_tokens, sampled_probs)
    ]
    return generated[-1]


# Ready to start grabbing RSS feeds
domain = DOMAIN_STYLE_TO_COPY
feed_urls = RSS_FEEDS_OF_REAL_STORIES_TO_EMULATE
articles = []

# Get the real headlines to look more realistic
for feed_url in feed_urls:
    articles += get_articles_from_real_blog(domain, feed_url)

# Toss in the made-up articles
articles += get_fake_articles(domain)

# Randomize the order the articles are generated
random.shuffle(articles)

# How many times an article body is re-generated while it fails to mention
# NAME_TO_SLANDER before that headline is skipped.
MAX_GENERATION_ATTEMPTS = 10

# Load the pre-trained "mega" Grover model with 1.5 billion params
model_config_fn = '/content/grover/lm/configs/mega.json'
model_ckpt = '/content/grover/models/mega/model.ckpt'
encoder = get_encoder()
news_config = GroverConfig.from_json_file(model_config_fn)

# Set up TensorFlow session to make predictions
tf_config = tf.ConfigProto(allow_soft_placement=True)

with tf.Session(config=tf_config, graph=tf.Graph()) as sess:
    # Create the placeholder TensorFlow input variables needed to feed data to Grover model
    # to make new predictions. generate_article_attribute() reads these names
    # as module-level globals, so they must keep these exact names.
    initial_context = tf.placeholder(tf.int32, [1, None])
    p_for_topp = tf.placeholder(tf.float32, [1])
    eos_token = tf.placeholder(tf.int32, [])
    ignore_ids = tf.placeholder(tf.bool, [news_config.vocab_size])

    # Build the sampling graph so it matches the pre-trained model weights
    tokens, probs = sample(
        news_config=news_config,
        initial_context=initial_context,
        eos_token=eos_token,
        ignore_ids=ignore_ids,
        p_for_topp=p_for_topp,
        do_topk=False
    )

    # Restore the pre-trained Grover 'mega' model weights
    saver = tf.train.Saver()
    saver.restore(sess, model_ckpt)

    # Loop through each headline we scraped from an RSS feed or made up
    for article in articles:
        print(f"Building article from headline '{article['title']}'")

        # If the headline is one we made up about a specific person, it needs special handling
        if NAME_TO_SLANDER in article['title']:
            # The generated article may go off on a tangent and not include the target name.
            # In that case, re-generate it, but only a bounded number of times.
            for _ in range(MAX_GENERATION_ATTEMPTS):
                article['text'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="article")
                if NAME_TO_SLANDER in article['text']:
                    break
            else:
                # Every attempt missed the name: give up on this headline and
                # move on to the next article. (The original used `continue`
                # inside a `while` loop, which only restarted that loop and
                # could therefore run forever.)
                print(f" - Gave up after {MAX_GENERATION_ATTEMPTS} attempts, skipping this headline")
                continue
        # If the headline was scraped from an RSS feed, we can just blindly generate an article
        else:
            article['text'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="article")

        # Now, generate a headline that better fits the generated article body
        # This replaces the real headline so none of the original article content remains
        article['title'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="title")

        # Grab generated text results so we can post them to WordPress
        article_title = article['title']
        article_text = article['text']
        article_date = article["iso_date"]
        article_image_url = article["image_url"]
        article_tags = article['tags']

        # Make the article body look more realistic - add spacing, link Twitter handles and hashtags, etc.
        # You could add more advanced pre-processing here if you wanted.
        article_text = format_generated_body_text_as_html(article_text, article_image_url)

        print(f" - Generated fake article titled '{article_title}'")

        # Post result to target Wordpress blog
        post_to_wordpress_blog(article_title, article_text, article_tags, article_date)
	"""DeepFaking the News with NLP and Transformer Models

	Original file is located at
	https://colab.research.google.com/drive/1VI3oBIOQYsym2x5oOux7DTNhpdR0r4uw

	### Step 1: Configuration
	"""

	# --- WordPress target --------------------------------------------------------
	# REST API root of the blog that receives the generated posts, plus the user
	# name and application password sent with every API request.
	WORDPRESS_BLOG_API_ENDPOINT = "https://your-domain-name.com/?rest_route=/wp/v2"
	WORDPRESS_USER = 'YOUR_WORDPRESS_USERNAME'
	WORDPRESS_APP_PASSWORD = 'YOUR_APP_PASSWORD_HERE'

	# --- Made-up subject ---------------------------------------------------------
	# Fictional person the seed headlines are about, and the image placed at the
	# top of those stories.
	NAME_TO_SLANDER = "John McFakeson"
	IMAGE_TO_SLANDER = "https://cdn-images-1.medium.com/max/1600/1*P8FfDY2TXPR0bZ0XIJYRWw.jpeg"

	# Each entry is the part of a seed headline that follows the subject's name.
	_SEED_HEADLINE_ENDINGS = (
	    "convicted of stealing puppies",
	    "caught lying about growing the world's largest watermelon",
	    "accused of stealing priceless artifacts from Egypt",
	    "forged priceless works of modern art for decades",
	    "claimed to be Pokemon master, but caught in a lie",
	    "bought fake twitter followers to pretend to be a celebrity",
	    "caught in the act robbing a pet store",
	    "revealed as a foriegn spy for the undersea city of Atlantis",
	    "involved in blackmail scandal with King Trident of Atlantis",
	    "hid past crimes to get elected as Mayor of Otter Town",
	    "lied on tax returns to cover up past life as a Ninja Turtle",
	    "stole billions from investors in a new pet store",
	    "claims to be a Ninja Turtle but was actually lying",
	    "likely to be sentenced to 20 years in jail for chasing a cat into a tree",
	    "recieves record prison sentence for offensive smell",
	    "commits a multitude of crimes against dinosaurs",
	)

	# Full seed headlines: "<NAME_TO_SLANDER> <ending>".
	SLANDEROUS_SEED_HEADLINES = [
	    " ".join((NAME_TO_SLANDER, ending)) for ending in _SEED_HEADLINE_ENDINGS
	]

	# --- Site to imitate ---------------------------------------------------------
	# Domain whose writing style is copied, and the RSS feeds that supply the real
	# headlines.
	DOMAIN_STYLE_TO_COPY = "www.nytimes.com"
	RSS_FEEDS_OF_REAL_STORIES_TO_EMULATE = [
	    "https://rss.nytimes.com/services/xml/rss/nyt/US.xml",
	]

	"""### Step 2: Download Grover code and install requirements"""

	# The notebook used IPython magics here (`%cd ...`, `!git clone ...`,
	# `!python3 -m pip install ...`). Those are not valid Python, so the same steps
	# are performed with the standard library instead.
	import os
	import subprocess
	import sys

	_GROVER_REPO_DIR = '/content/grover'

	# Clone the Grover code unless a previous run already did (a second
	# `git clone` into an existing directory would fail).
	if not os.path.isdir(_GROVER_REPO_DIR):
	    subprocess.run(
	        ['git', 'clone', 'https://github.com/rowanz/grover.git', _GROVER_REPO_DIR],
	        check=True,
	    )

	# Equivalent of `%cd /content/grover`. The checkout is also put on sys.path so
	# its `lm` and `sample` packages can be imported when this file runs as a
	# plain script rather than as notebook cells.
	os.chdir(_GROVER_REPO_DIR)
	if _GROVER_REPO_DIR not in sys.path:
	    sys.path.append(_GROVER_REPO_DIR)

	# Install the extra packages imported further down, into the interpreter that
	# is running this script.
	subprocess.run(
	    [sys.executable, '-m', 'pip', 'install',
	     'regex', 'jsonlines', 'twitter-text-python', 'feedparser'],
	    check=True,
	)

	"""### Step 3: Download Grover Pre-Trained 'Mega' Model"""

	import os
	import requests

	model_type = "mega"

	# Checkpoint files are stored under /content/grover/models/<model_type>/.
	model_dir = os.path.join('/content/grover/models', model_type)
	os.makedirs(model_dir, exist_ok=True)

	# Size of each streamed piece written to disk (1 MiB).
	chunk_size = 1024 * 1024

	for ext in ['data-00000-of-00001', 'index', 'meta']:
	    url = f'https://storage.googleapis.com/grover-models/{model_type}/model.ckpt.{ext}'
	    # Using the response as a context manager releases the streamed connection
	    # even if the download fails part-way.
	    with requests.get(url, stream=True) as r:
	        # Validate the response before creating the local file, so a failed
	        # request does not leave an empty or bogus checkpoint file behind.
	        r.raise_for_status()
	        file_size = int(r.headers.get("content-length", 0))
	        if file_size < 1000:
	            raise ValueError("File doesn't exist? idk")
	        with open(os.path.join(model_dir, f'model.ckpt.{ext}'), 'wb') as f:
	            for chunk in r.iter_content(chunk_size=chunk_size):
	                f.write(chunk)
	    print(f"Just downloaded {model_type}/model.ckpt.{ext}!", flush=True)

	"""### Step 4: Generate Fake Blog Entries and Post to Wordpress"""

	import tensorflow as tf
	import numpy as np
	import sys
	import feedparser
	import time
	from datetime import datetime, timedelta
	import requests
	import base64
	from ttp import ttp

	sys.path.append('../')
	from lm.modeling import GroverConfig, sample
	from sample.encoder import get_encoder, _tokenize_article_pieces, extract_generated_target
	import random


	def get_fake_articles(domain):
	    """
	    Build one article object per entry in SLANDEROUS_SEED_HEADLINES.

	    The objects have an empty body ('text') and are meant to be fed into Grover,
	    which generates the story. The domain name controls the style of the text
	    generated by Grover - i.e. bbc.co.uk would generate results in British
	    English while nytimes.com would generate US English.

	    Each article is given a random publish date between 1 and 7 days ago.
	    """
	    def build_article(headline):
	        # Pick the age first, then read the clock.
	        age_in_days = random.randint(1, 7)
	        published = datetime.now() - timedelta(days=age_in_days)
	        return {
	            'summary': "",
	            'title': headline,
	            'text': '',
	            'authors': ["Staff Writer"],
	            'publish_date': published.strftime('%m-%d-%Y'),
	            'iso_date': published.isoformat(),
	            'domain': domain,
	            'image_url': IMAGE_TO_SLANDER,
	            'tags': ['Breaking News', 'Investigations', 'Criminal Profiles'],
	        }

	    return [build_article(headline) for headline in SLANDEROUS_SEED_HEADLINES]


	def get_articles_from_real_blog(domain, feed_url):
	    """
	    Given an RSS feed url, grab all the stories and format them as article objects
	    suitable for feeding into Grover to generate replica stories.

	    Args:
	        domain: domain name stored on every article (used by Grover for style).
	        feed_url: URL of the RSS feed to read.

	    Returns:
	        A list of dicts with the keys 'summary', 'title', 'text' (always empty),
	        'authors', 'publish_date' (MM-DD-YYYY), 'iso_date', 'domain',
	        'image_url' (may be None) and 'tags'.

	    Entries without a title are skipped, since the title is the only content
	    taken from the real story. Missing dates fall back to the current time and
	    missing author names fall back to "Staff Writer".
	    """
	    feed_data = feedparser.parse(feed_url)
	    articles = []
	    for post in feed_data.entries:
	        title = post.get('title')
	        if not title:
	            continue

	        # The key can be present with a None value when the date could not be
	        # parsed (NOTE(review): per feedparser behaviour - confirm), so test
	        # the value rather than just the key.
	        published = post.get('published_parsed')
	        if published:
	            publish_date = time.strftime('%m-%d-%Y', published)
	            iso_date = datetime(*published[:6]).isoformat()
	        else:
	            publish_date = time.strftime('%m-%d-%Y')
	            iso_date = datetime.now().isoformat()

	        summary = post.get('summary')

	        tags = []
	        if 'tags' in post:
	            # Ignore tags that carry no term; they cannot be joined or posted.
	            tags = [tag['term'] for tag in post['tags'] if tag.get('term')]
	            if summary is None:
	                summary = ", ".join(tags)

	        image_url = None
	        images = post.get('media_content') or []
	        if images and 'url' in images[0]:
	            # Hack for NYT images to fix tiny images in the RSS feed
	            image_url = images[0]['url'].replace("-moth", "-threeByTwoMediumAt2X")

	        # Keep only authors that actually have a name.
	        authors = [
	            author['name'] for author in post.get('authors') or [] if author.get('name')
	        ]
	        if not authors:
	            authors = ["Staff Writer"]

	        articles.append({
	            'summary': summary,
	            'title': title,
	            'text': '',
	            'authors': authors,
	            'publish_date': publish_date,
	            'iso_date': iso_date,
	            'domain': domain,
	            'image_url': image_url,
	            'tags': tags,
	        })

	    return articles


	def post_to_wordpress_blog(title, content, tags, post_date_iso):
	    """
	    Publish a story on the WordPress blog through its REST API.

	    `tags` is a list of tag names. Names that do not exist on the blog yet are
	    created first, then all names are translated to WordPress tag ids. The
	    HTTP status code and response body are printed.
	    """
	    # Note: This is super insecure if your blog isn't using HTTPS!
	    # The header would be sent in plain text in that case.
	    credentials = (WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD).encode()
	    auth_headers = {
	        'Authorization': 'Basic ' + base64.b64encode(credentials).decode('utf-8'),
	    }

	    tag_ids = []
	    if tags:
	        # WordPress requires tag ids when creating a story, but we have tag names.
	        create_missing_blog_tags(tags)
	        ids_by_name = get_blog_tag_id_mapping()
	        tag_ids = [ids_by_name[name] for name in tags if name in ids_by_name]

	    payload = {
	        'date': post_date_iso,
	        'title': title,
	        'status': 'publish',
	        'content': content,
	        'author': 1,
	        'format': 'standard',
	        'tags': tag_ids,
	    }

	    response = requests.post(
	        WORDPRESS_BLOG_API_ENDPOINT + '/posts', headers=auth_headers, json=payload
	    )
	    print(f"Posted to Wordpress. Got response of {response.status_code} - {response.content}")


	def get_existing_blog_tags():
	    """
	    Get a list of all blog tags that exist in WordPress (requires paginating the API results)

	    Returns:
	        list: the decoded JSON tag objects from every page, in the order received.

	    Raises:
	        requests.HTTPError: if WordPress answers with an error status.
	        ValueError: if a successful response is not a JSON list.
	    """
	    data_string = WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD
	    token = base64.b64encode(data_string.encode())
	    headers = {'Authorization': 'Basic ' + token.decode('utf-8')}

	    per_page = 100
	    all_tags = []
	    page = 1
	    while True:
	        # Passing the paging options via `params` lets requests pick the right
	        # separator, so this works both for "?rest_route=..." style endpoints
	        # and for endpoints without a query string.
	        r = requests.get(
	            WORDPRESS_BLOG_API_ENDPOINT + '/tags',
	            params={'per_page': per_page, 'page': page},
	            headers=headers,
	        )
	        # An error response is a JSON object, not a list. Without these checks
	        # its keys would be appended as "tags" and the loop would never end.
	        r.raise_for_status()
	        tags_response = r.json()
	        if not isinstance(tags_response, list):
	            raise ValueError(f"Unexpected response from the tags endpoint: {tags_response!r}")

	        all_tags += tags_response
	        # A short (or empty) page means there is nothing left to fetch.
	        if len(tags_response) < per_page:
	            break
	        page += 1

	    return all_tags


	def add_blog_tag(tag_to_add):
	    """
	    Create a tag named `tag_to_add` on the blog via the WordPress REST API
	    and print the HTTP status code and response body.
	    """
	    credentials = (WORDPRESS_USER + ':' + WORDPRESS_APP_PASSWORD).encode()
	    auth_headers = {
	        'Authorization': 'Basic ' + base64.b64encode(credentials).decode('utf-8'),
	    }

	    response = requests.post(
	        WORDPRESS_BLOG_API_ENDPOINT + '/tags',
	        headers=auth_headers,
	        json={'name': tag_to_add},
	    )
	    print(f"Created tag '{tag_to_add}'. Got response of {response.status_code} - {response.content}")


	def create_missing_blog_tags(tags_to_add):
	    """
	    RSS feeds can have tags for each story. This creates the same tags in WordPress
	    if they don't already exist.
	    """
	    names_on_blog = set()
	    for tag in get_existing_blog_tags():
	        names_on_blog.add(tag['name'])

	    # Only names that are not on the blog yet need to be created.
	    for new_name in set(tags_to_add).difference(names_on_blog):
	        add_blog_tag(new_name)


	def get_blog_tag_id_mapping():
	    """
	    Return a dict mapping each existing WordPress tag name to its tag id.

	    WordPress expects tag ids, but the NYT RSS feed gives them as tag names.
	    """
	    ids_by_name = {}
	    for tag in get_existing_blog_tags():
	        ids_by_name[tag['name']] = tag['id']
	    return ids_by_name


	def format_generated_body_text_as_html(article_text, image_url=None):
	    """
	    Given the text of the news story, format it in html so it looks
	    more realistic - add paragraph breaks, turn urls into links, etc.

	    Args:
	        article_text: generated story body, one paragraph per line.
	        image_url: optional image to place above the story.

	    Returns:
	        The story as an HTML string.
	    """
	    from html import escape  # only needed here; keeps the file's import block untouched

	    # Add html links to twitter @ handles, hashtags and regular urls
	    linked_text = ttp.Parser().parse(article_text).html

	    html_lines = []
	    for line in linked_text.split("\n"):
	        # Bold any short lines that look like section titles. Blank lines are
	        # left alone so they do not turn into empty <b></b> elements.
	        if line.strip() and len(line) < 80 and "." not in line:
	            line = f"<b>{line}</b>"
	        html_lines.append(line)

	    # Add paragraph tags between lines
	    html_body = "<p>".join(html_lines)

	    # If we have an image for the story, put it at the top. The URL can come
	    # from an external feed, so escape it before placing it in the attribute.
	    if image_url is not None:
	        html_body = f"<img src='{escape(image_url, quote=True)}'><p>{html_body}"

	    return html_body


	def generate_article_attribute(sess, encoder, tokens, probs, article, target='article'):
	    """
	    Generate one attribute of an article with the Grover model, using the
	    article's other attributes (domain, date, authors, ...) as context.

	    This function is based on the Grover examples distributed with the Grover code.

	    Args:
	        sess: TensorFlow session used to run the model.
	        encoder: Grover encoder; supplies the begin_*/end_* token ids and
	            `special_tokens_onehot`.
	        tokens, probs: model outputs fetched through `sess.run`.
	        article: article dict; its 'top_ps' key is overwritten with [0.95].
	        target: name of the attribute to generate, e.g. 'article' or 'title'.

	    Returns:
	        The text extracted from the last generated sample.

	    Note: this feeds the module-level placeholders `initial_context`,
	    `eos_token`, `ignore_ids` and `p_for_topp`, so they must exist before
	    this function is called.
	    """
	    pieces = _tokenize_article_pieces(encoder, article)

	    # Context = every known attribute except the one being generated ...
	    prompt = [
	        token
	        for field in ('domain', 'date', 'authors', 'title', 'article')
	        if field != target
	        for token in pieces.pop(field, [])
	    ]

	    # ... followed by the token that opens the attribute to generate.
	    special_tokens = vars(encoder)
	    prompt.append(special_tokens[f'begin_{target}'])
	    end_token = special_tokens[f'end_{target}']

	    # Special-token mask passed to the model as `ignore_ids`, with the
	    # target's end token cleared from it.
	    ignored = np.array(encoder.special_tokens_onehot)
	    ignored[end_token] = 0

	    # A single sample is generated with a fixed top-p cut-off of 95%.
	    article['top_ps'] = [0.95]

	    sampled_tokens, sampled_probs = sess.run(
	        [tokens, probs],
	        feed_dict={
	            initial_context: [prompt],
	            eos_token: end_token,
	            ignore_ids: ignored,
	            p_for_topp: np.array([0.95]),
	        }
	    )

	    # Turn each sampled token sequence back into text and return the last one.
	    generated = [
	        extract_generated_target(output_tokens=sample_tokens, encoder=encoder, target=target)['extraction']
	        for sample_tokens, _ in zip(sampled_tokens, sampled_probs)
	    ]
	    return generated[-1]


	# Ready to start grabbing RSS feeds
	domain = DOMAIN_STYLE_TO_COPY
	feed_urls = RSS_FEEDS_OF_REAL_STORIES_TO_EMULATE
	articles = []

	# Get the real headlines to look more realistic
	for feed_url in feed_urls:
	    articles += get_articles_from_real_blog(domain, feed_url)

	# Toss in the made-up articles
	articles += get_fake_articles(domain)

	# Randomize the order the articles are generated
	random.shuffle(articles)

	# How many times an article body is re-generated while it fails to mention
	# NAME_TO_SLANDER before that headline is skipped.
	MAX_GENERATION_ATTEMPTS = 10

	# Load the pre-trained "mega" Grover model with 1.5 billion params
	model_config_fn = '/content/grover/lm/configs/mega.json'
	model_ckpt = '/content/grover/models/mega/model.ckpt'
	encoder = get_encoder()
	news_config = GroverConfig.from_json_file(model_config_fn)

	# Set up TensorFlow session to make predictions
	tf_config = tf.ConfigProto(allow_soft_placement=True)

	with tf.Session(config=tf_config, graph=tf.Graph()) as sess:
	    # Create the placeholder TensorFlow input variables needed to feed data to Grover model
	    # to make new predictions. generate_article_attribute() reads these names
	    # as module-level globals, so they must keep these exact names.
	    initial_context = tf.placeholder(tf.int32, [1, None])
	    p_for_topp = tf.placeholder(tf.float32, [1])
	    eos_token = tf.placeholder(tf.int32, [])
	    ignore_ids = tf.placeholder(tf.bool, [news_config.vocab_size])

	    # Build the sampling graph so it matches the pre-trained model weights
	    tokens, probs = sample(
	        news_config=news_config,
	        initial_context=initial_context,
	        eos_token=eos_token,
	        ignore_ids=ignore_ids,
	        p_for_topp=p_for_topp,
	        do_topk=False
	    )

	    # Restore the pre-trained Grover 'mega' model weights
	    saver = tf.train.Saver()
	    saver.restore(sess, model_ckpt)

	    # Loop through each headline we scraped from an RSS feed or made up
	    for article in articles:
	        print(f"Building article from headline '{article['title']}'")

	        # If the headline is one we made up about a specific person, it needs special handling
	        if NAME_TO_SLANDER in article['title']:
	            # The generated article may go off on a tangent and not include the target name.
	            # In that case, re-generate it, but only a bounded number of times.
	            for _ in range(MAX_GENERATION_ATTEMPTS):
	                article['text'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="article")
	                if NAME_TO_SLANDER in article['text']:
	                    break
	            else:
	                # Every attempt missed the name: give up on this headline and
	                # move on to the next article. (The original used `continue`
	                # inside a `while` loop, which only restarted that loop and
	                # could therefore run forever.)
	                print(f" - Gave up after {MAX_GENERATION_ATTEMPTS} attempts, skipping this headline")
	                continue
	        # If the headline was scraped from an RSS feed, we can just blindly generate an article
	        else:
	            article['text'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="article")

	        # Now, generate a headline that better fits the generated article body
	        # This replaces the real headline so none of the original article content remains
	        article['title'] = generate_article_attribute(sess, encoder, tokens, probs, article, target="title")

	        # Grab generated text results so we can post them to WordPress
	        article_title = article['title']
	        article_text = article['text']
	        article_date = article["iso_date"]
	        article_image_url = article["image_url"]
	        article_tags = article['tags']

	        # Make the article body look more realistic - add spacing, link Twitter handles and hashtags, etc.
	        # You could add more advanced pre-processing here if you wanted.
	        article_text = format_generated_body_text_as_html(article_text, article_image_url)

	        print(f" - Generated fake article titled '{article_title}'")

	        # Post result to target Wordpress blog
	        post_to_wordpress_blog(article_title, article_text, article_tags, article_date)