Skip to content

Instantly share code, notes, and snippets.

@l3utterfly
Created April 2, 2026 10:07
Show Gist options
  • Select an option

  • Save l3utterfly/bf9f703c09932fd87dbf68f2118e5ab4 to your computer and use it in GitHub Desktop.

Select an option

Save l3utterfly/bf9f703c09932fd87dbf68f2118e5ab4 to your computer and use it in GitHub Desktop.
import requests
import re
import html
# Search terms used as the default `query` of get_duckduckgo_news().
# NOTE(review): "{{input}}" looks like a template placeholder that an external
# tool substitutes before the script runs — confirm against the caller.
QUERY = """{{input}}"""
# Default maximum number of articles returned by get_duckduckgo_news().
LIMIT = 5
def _strip_markup(raw):
    """Return *raw* with HTML tags removed and HTML entities decoded.

    Anything that is not a string (for example a JSON ``null`` decoded to
    ``None``) yields an empty string instead of raising ``TypeError``.
    """
    if not isinstance(raw, str):
        return ''
    return html.unescape(re.sub(r'<[^>]+>', '', raw))


def get_duckduckgo_news(query=QUERY, limit=LIMIT):
    """Fetch recent news articles for *query* from DuckDuckGo's news endpoint.

    The function first POSTs the query to the DuckDuckGo home page to obtain
    a ``vqd`` token (from the ``x-vqd-4`` response header, falling back to a
    regex search of the response body), then calls ``news.js`` with that token.

    Args:
        query: Search terms to look up.
        limit: Maximum number of articles to return.

    Returns:
        A list of dicts with the keys ``'title'``, ``'summary'``, ``'url'``
        and ``'source'``. On any failure (network error, missing token,
        rejected token, unparsable or unexpected JSON) an error message is
        printed and an empty list is returned; no exception is propagated
        for those cases.
    """
    # Browser-like headers so DuckDuckGo doesn't block us.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://duckduckgo.com/'
    }
    main_url = "https://duckduckgo.com/"
    api_url = "https://duckduckgo.com/news.js"

    # The session carries cookies between the two requests; the context
    # manager guarantees its pooled connections are released on every path.
    with requests.Session() as session:
        # 1. Fetch the validation token (VQD).
        try:
            # We use a POST request here. It is currently much more reliable
            # for bypassing DDG's bot detection than a standard GET request.
            response = session.post(main_url, data={'q': query}, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error connecting to DuckDuckGo: {e}")
            return []

        # Look for the token in the headers first, then fall back to the body.
        vqd_token = response.headers.get('x-vqd-4')
        if not vqd_token:
            # Broad regex in case the token format changes.
            vqd_match = re.search(r'vqd=[\'"]?([^\'"\s&]+)[\'"]?', response.text)
            if vqd_match:
                vqd_token = vqd_match.group(1)
        if not vqd_token:
            print("Error: Could not find VQD token. DuckDuckGo may have blocked the IP or changed layouts.")
            return []

        # 2. Query the news endpoint directly.
        api_params = {
            'l': 'en-US',      # Language/Region parameter
            'o': 'json',       # Ask for a pure JSON response instead of JS/JSONP
            'noamp': '1',      # Disable AMP links
            'q': query,        # The search query
            'vqd': vqd_token,  # The token obtained above
            'p': '1',          # Page/Section identifier
            'df': 'w'          # Date filter: 'w' = past week
        }
        # AJAX-style headers so the request resembles a browser script call.
        api_headers = headers.copy()
        api_headers['Accept'] = 'application/json, text/javascript, */*; q=0.01'
        api_headers['X-Requested-With'] = 'XMLHttpRequest'

        # Only the network calls live in this try, so a request-level error
        # can never reach the JSON handler below with `news_response` unbound.
        try:
            news_response = session.get(api_url, params=api_params, headers=api_headers, timeout=10)
            news_response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching news JSON: {e}")
            return []

    # The response body was downloaded eagerly (no stream=True), so it remains
    # readable after the session has been closed.
    # Double check if DDG still returned a JS error instead of JSON.
    if "Spice.failed" in news_response.text:
        print(f"Error: API rejected the token. Raw response: {news_response.text[:100]}")
        return []
    try:
        news_data = news_response.json()
    except ValueError:
        print(f"Error: Failed to parse JSON response. Raw API Response: {news_response.text[:200]}")
        return []

    # Valid JSON is not necessarily an object; guard before calling .get().
    if not isinstance(news_data, dict):
        print(f"Error: Unexpected JSON structure. Raw API Response: {news_response.text[:200]}")
        return []
    results = news_data.get('results', [])
    if not isinstance(results, list):
        results = []

    # 3. Extract title and summary, cap at `limit`, and clean up the text.
    news_list = []
    for article in results:
        # Stop processing once we reach the requested limit.
        if len(news_list) >= limit:
            break
        # Skip malformed entries rather than crashing on them.
        if not isinstance(article, dict):
            continue
        news_list.append({
            # Strip HTML bolding such as <b> and decode entities like &#39;.
            'title': _strip_markup(article.get('title', '')),
            'summary': _strip_markup(article.get('excerpt', '')),
            'url': article.get('url', ''),
            'source': article.get('source', '')
        })
    return news_list
# ==========================================
# Example usage:
# ==========================================
if __name__ == "__main__":
    # Demo run: look up the module-level QUERY and print up to LIMIT articles.
    topic = QUERY
    print(f"Fetching top {LIMIT} news articles for: {topic}\n")

    # LIMIT is already the function's default; it is passed explicitly for clarity.
    articles = get_duckduckgo_news(topic, limit=LIMIT)

    if articles:
        # Number the articles starting from 1 for display.
        for position, item in enumerate(articles, start=1):
            print(f"{position}. {item['title']}")
            print(f" Source: {item['source']}")
            print(f" Summary: {item['summary']}")
            print(f" URL: {item['url']}\n")
    else:
        print("No news found.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment