Skip to content

Instantly share code, notes, and snippets.

@simonw

simonw/dupe.md Secret

Created December 14, 2024 19:10
Show Gist options
  • Save simonw/4d6ae3d79b99a0b7528cc4a0ca37d904 to your computer and use it in GitHub Desktop.
Save simonw/4d6ae3d79b99a0b7528cc4a0ca37d904 to your computer and use it in GitHub Desktop.

Improving Duplicate Word Detection with Punctuation

human (Dec 14, 2024, 11:08 AM)

The duplicate word check didn't spot this:

developers
developers.

Because of the . in the second one

paste.txt

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Writing Style Analyzer</title>
<style>
* {
  box-sizing: border-box;
}

body {
  font-family: Helvetica, Arial, sans-serif;
  line-height: 1.6;
  margin: 0;
  padding: 20px;
  background: #f5f5f5;
}

.container {
  max-width: 800px;
  margin: 0 auto;
  background: white;
  padding: 20px;
  border-radius: 8px;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

h1 {
  margin-top: 0;
  color: #333;
}

textarea {
  width: 100%;
  height: 200px;
  padding: 12px;
  border: 1px solid #ddd;
  border-radius: 4px;
  margin-bottom: 20px;
  font-size: 16px;
  font-family: inherit;
}

.results {
  margin-top: 20px;
}

.category {
  margin-bottom: 20px;
  padding: 15px;
  background: #f8f9fa;
  border-radius: 4px;
}

.category h2 {
  margin-top: 0;
  color: #444;
  font-size: 1.2em;
}

.highlight {
  background: #ffd700;
  padding: 2px 4px;
  border-radius: 2px;
}

.warning {
  color: #856404;
  background-color: #fff3cd;
  border: 1px solid #ffeeba;
  padding: 10px;
  margin-bottom: 10px;
  border-radius: 4px;
}
</style>
</head>
<body>
  <div class="container">
    <h1>Writing Style Analyzer</h1>
    <p>Adapted from <a href="https://matt.might.net/articles/shell-scripts-for-passive-voice-weasel-words-duplicates/">these shell scripts</a> published by Matt Might.</p>
    <p>Paste your text below to check for weasel words, passive voice, and duplicate words:</p>
    <textarea id="input" placeholder="Enter your text here..."></textarea>
    <div id="results" class="results"></div>
  </div>

<script type="module">
// Weasel words from the bash script
const weaselWords = [
  'many', 'various', 'very', 'fairly', 'several', 'extremely',
  'exceedingly', 'quite', 'remarkably', 'few', 'surprisingly',
  'mostly', 'largely', 'huge', 'tiny', 'excellent', 'interestingly',
  'significantly', 'substantially', 'clearly', 'vast', 'relatively',
  'completely'
];

// Common irregular verbs for passive voice detection
const irregularVerbs = [
  'awoken', 'been', 'born', 'beat', 'become', 'begun', 'bent',
  'bound', 'bitten', 'bled', 'blown', 'broken', 'brought',
  'built', 'burnt', 'bought', 'caught', 'chosen', 'come',
  'dealt', 'done', 'drawn', 'driven', 'eaten', 'fallen',
  'fought', 'found', 'flown', 'forgotten', 'given', 'gone',
  'grown', 'hung', 'heard', 'hidden', 'held', 'kept', 'known',
  'laid', 'led', 'left', 'lost', 'made', 'meant', 'met', 'paid',
  'put', 'read', 'run', 'said', 'seen', 'sold', 'sent', 'set',
  'shown', 'shut', 'sung', 'sat', 'slept', 'spoken', 'spent',
  'stood', 'taken', 'taught', 'told', 'thought', 'thrown',
  'understood', 'worn', 'won', 'written'
];

// Helper function to get word positions with their original text
function getWordPositions(text) {
  const words = [];
  const regex = /\S+/g;
  let match;
  
  while ((match = regex.exec(text)) !== null) {
    words.push({
      word: match[0],
      index: match.index,
      length: match[0].length
    });
  }
  
  return words;
}

function getContext(text, wordPosition, prevWords = 3) {
  const allWords = text.split(/\s+/);
  
  // Find which word index we're at
  let currentWordIndex = 0;
  let currentPos = 0;
  while (currentPos < wordPosition.index && currentWordIndex < allWords.length) {
    currentPos += allWords[currentWordIndex].length;
    // Account for the space after the word
    if (currentPos < text.length) currentPos++;
    currentWordIndex++;
  }
  
  // Get the previous N words and next few words
  const start = Math.max(0, currentWordIndex - prevWords);
  const end = Math.min(allWords.length, currentWordIndex + 4);
  
  return allWords.slice(start, end).join(' ');
}

function findWeaselWords(text) {
  const results = [];
  const wordPositions = getWordPositions(text);
  
  wordPositions.forEach(pos => {
    if (weaselWords.includes(pos.word.toLowerCase())) {
      results.push({
        word: pos.word,
        context: getContext(text, pos)
      });
    }
  });
  
  return results;
}

function findPassiveVoice(text) {
  const results = [];
  const beVerbs = ['am', 'is', 'are', 'was', 'were', 'be', 'been', 'being'];
  const wordPositions = getWordPositions(text);
  
  for (let i = 0; i < wordPositions.length - 1; i++) {
    const currentWord = wordPositions[i].word.toLowerCase();
    const nextWord = wordPositions[i + 1].word.toLowerCase();
    
    if (beVerbs.includes(currentWord)) {
      if (nextWord.endsWith('ed') || irregularVerbs.includes(nextWord)) {
        results.push({
          construction: `${currentWord} ${nextWord}`,
          context: getContext(text, wordPositions[i])
        });
      }
    }
  }
  
  return results;
}

function findDuplicateWords(text) {
  const results = [];
  const wordPositions = getWordPositions(text);
  
  for (let i = 1; i < wordPositions.length; i++) {
    if (wordPositions[i].word.toLowerCase() === wordPositions[i - 1].word.toLowerCase()) {
      results.push({
        word: wordPositions[i].word,
        context: getContext(text, wordPositions[i])
      });
    }
  }
  
  return results;
}

function displayResults(weasels, passives, duplicates) {
  const resultsDiv = document.getElementById('results');
  resultsDiv.innerHTML = '';
  
  // Weasel Words
  const weaselDiv = document.createElement('div');
  weaselDiv.className = 'category';
  weaselDiv.innerHTML = `
    <h2>Weasel Words</h2>
    ${weasels.length === 0 ? 'No weasel words found.' : 
      weasels.map(w => `
        <div class="warning">
          Found "<span class="highlight">${w.word}</span>" in: "${w.context}"
        </div>
      `).join('')}
  `;
  resultsDiv.appendChild(weaselDiv);
  
  // Passive Voice
  const passiveDiv = document.createElement('div');
  passiveDiv.className = 'category';
  passiveDiv.innerHTML = `
    <h2>Passive Voice</h2>
    ${passives.length === 0 ? 'No passive voice constructions found.' :
      passives.map(p => `
        <div class="warning">
          Found passive voice "<span class="highlight">${p.construction}</span>" in: "${p.context}"
        </div>
      `).join('')}
  `;
  resultsDiv.appendChild(passiveDiv);
  
  // Duplicate Words
  const duplicateDiv = document.createElement('div');
  duplicateDiv.className = 'category';
  duplicateDiv.innerHTML = `
    <h2>Duplicate Words</h2>
    ${duplicates.length === 0 ? 'No duplicate words found.' :
      duplicates.map(d => `
        <div class="warning">
          Found duplicate word "<span class="highlight">${d.word}</span>" in: "${d.context}"
        </div>
      `).join('')}
  `;
  resultsDiv.appendChild(duplicateDiv);
}

// Set up event listener
document.getElementById('input').addEventListener('input', (e) => {
  const text = e.target.value;
  const weasels = findWeaselWords(text);
  const passives = findPassiveVoice(text);
  const duplicates = findDuplicateWords(text);
  displayResults(weasels, passives, duplicates);
});
</script>
</body>
</html>


assistant (Dec 14, 2024, 11:08 AM)

I understand you'd like to improve the duplicate word detection to handle cases where one word has punctuation. Let me modify the code to strip punctuation before comparison while still preserving the original words for display.

create Updated Writing Style Analyzer

function stripPunctuation(word) {
  // Remove all punctuation from the word, preserving internal hyphens
  return word.replace(/^[^\w\s-]+|[^\w\s-]+$/g, '');
}

function findDuplicateWords(text) {
  const results = [];
  const wordPositions = getWordPositions(text);
  
  for (let i = 1; i < wordPositions.length; i++) {
    const currentWordStripped = stripPunctuation(wordPositions[i].word.toLowerCase());
    const prevWordStripped = stripPunctuation(wordPositions[i - 1].word.toLowerCase());
    
    if (currentWordStripped === prevWordStripped && currentWordStripped.length > 0) {
      results.push({
        word: wordPositions[i].word,
        context: getContext(text, wordPositions[i])
      });
    }
  }
  
  return results;
}

This updated version:

  1. Adds a stripPunctuation() function that removes punctuation from the start and end of words while preserving internal hyphens
  2. Modifies findDuplicateWords() to compare the stripped versions of words
  3. Includes a length check to avoid matching empty strings if all characters were stripped

Now it will catch duplicates like:

  • "developers developers."
  • "word, word"
  • "test! test"
  • "example... example"

While still preserving compound words with hyphens and showing the original words (with punctuation) in the results.

Would you like me to show you how to integrate this into your HTML file, or would you like to test any specific cases?

human (Dec 14, 2024, 11:08 AM)

output the full artifact

assistant (Dec 14, 2024, 11:09 AM)

I'll provide the complete updated HTML file with the improved duplicate word detection.

create Complete Writing Style Analyzer

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Writing Style Analyzer</title>
<style>
* {
  box-sizing: border-box;
}

body {
  font-family: Helvetica, Arial, sans-serif;
  line-height: 1.6;
  margin: 0;
  padding: 20px;
  background: #f5f5f5;
}

.container {
  max-width: 800px;
  margin: 0 auto;
  background: white;
  padding: 20px;
  border-radius: 8px;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

h1 {
  margin-top: 0;
  color: #333;
}

textarea {
  width: 100%;
  height: 200px;
  padding: 12px;
  border: 1px solid #ddd;
  border-radius: 4px;
  margin-bottom: 20px;
  font-size: 16px;
  font-family: inherit;
}

.results {
  margin-top: 20px;
}

.category {
  margin-bottom: 20px;
  padding: 15px;
  background: #f8f9fa;
  border-radius: 4px;
}

.category h2 {
  margin-top: 0;
  color: #444;
  font-size: 1.2em;
}

.highlight {
  background: #ffd700;
  padding: 2px 4px;
  border-radius: 2px;
}

.warning {
  color: #856404;
  background-color: #fff3cd;
  border: 1px solid #ffeeba;
  padding: 10px;
  margin-bottom: 10px;
  border-radius: 4px;
}
</style>
</head>
<body>
  <div class="container">
    <h1>Writing Style Analyzer</h1>
    <p>Adapted from <a href="https://matt.might.net/articles/shell-scripts-for-passive-voice-weasel-words-duplicates/">these shell scripts</a> published by Matt Might.</p>
    <p>Paste your text below to check for weasel words, passive voice, and duplicate words:</p>
    <textarea id="input" placeholder="Enter your text here..."></textarea>
    <div id="results" class="results"></div>
  </div>

<script type="module">
// Weasel words from the bash script
const weaselWords = [
  'many', 'various', 'very', 'fairly', 'several', 'extremely',
  'exceedingly', 'quite', 'remarkably', 'few', 'surprisingly',
  'mostly', 'largely', 'huge', 'tiny', 'excellent', 'interestingly',
  'significantly', 'substantially', 'clearly', 'vast', 'relatively',
  'completely'
];

// Common irregular verbs for passive voice detection
const irregularVerbs = [
  'awoken', 'been', 'born', 'beat', 'become', 'begun', 'bent',
  'bound', 'bitten', 'bled', 'blown', 'broken', 'brought',
  'built', 'burnt', 'bought', 'caught', 'chosen', 'come',
  'dealt', 'done', 'drawn', 'driven', 'eaten', 'fallen',
  'fought', 'found', 'flown', 'forgotten', 'given', 'gone',
  'grown', 'hung', 'heard', 'hidden', 'held', 'kept', 'known',
  'laid', 'led', 'left', 'lost', 'made', 'meant', 'met', 'paid',
  'put', 'read', 'run', 'said', 'seen', 'sold', 'sent', 'set',
  'shown', 'shut', 'sung', 'sat', 'slept', 'spoken', 'spent',
  'stood', 'taken', 'taught', 'told', 'thought', 'thrown',
  'understood', 'worn', 'won', 'written'
];

// Helper function to strip punctuation from words
function stripPunctuation(word) {
  // Remove all punctuation from the word, preserving internal hyphens
  return word.replace(/^[^\w\s-]+|[^\w\s-]+$/g, '');
}

// Helper function to get word positions with their original text
function getWordPositions(text) {
  const words = [];
  const regex = /\S+/g;
  let match;
  
  while ((match = regex.exec(text)) !== null) {
    words.push({
      word: match[0],
      index: match.index,
      length: match[0].length
    });
  }
  
  return words;
}

function getContext(text, wordPosition, prevWords = 3) {
  const allWords = text.split(/\s+/);
  
  // Find which word index we're at
  let currentWordIndex = 0;
  let currentPos = 0;
  while (currentPos < wordPosition.index && currentWordIndex < allWords.length) {
    currentPos += allWords[currentWordIndex].length;
    // Account for the space after the word
    if (currentPos < text.length) currentPos++;
    currentWordIndex++;
  }
  
  // Get the previous N words and next few words
  const start = Math.max(0, currentWordIndex - prevWords);
  const end = Math.min(allWords.length, currentWordIndex + 4);
  
  return allWords.slice(start, end).join(' ');
}

function findWeaselWords(text) {
  const results = [];
  const wordPositions = getWordPositions(text);
  
  wordPositions.forEach(pos => {
    const strippedWord = stripPunctuation(pos.word.toLowerCase());
    if (weaselWords.includes(strippedWord)) {
      results.push({
        word: pos.word,
        context: getContext(text, pos)
      });
    }
  });
  
  return results;
}

function findPassiveVoice(text) {
  const results = [];
  const beVerbs = ['am', 'is', 'are', 'was', 'were', 'be', 'been', 'being'];
  const wordPositions = getWordPositions(text);
  
  for (let i = 0; i < wordPositions.length - 1; i++) {
    const currentWord = stripPunctuation(wordPositions[i].word.toLowerCase());
    const nextWord = stripPunctuation(wordPositions[i + 1].word.toLowerCase());
    
    if (beVerbs.includes(currentWord)) {
      if (nextWord.endsWith('ed') || irregularVerbs.includes(nextWord)) {
        results.push({
          construction: `${wordPositions[i].word} ${wordPositions[i + 1].word}`,
          context: getContext(text, wordPositions[i])
        });
      }
    }
  }
  
  return results;
}

function findDuplicateWords(text) {
  const results = [];
  const wordPositions = getWordPositions(text);
  
  for (let i = 1; i < wordPositions.length; i++) {
    const currentWordStripped = stripPunctuation(wordPositions[i].word.toLowerCase());
    const prevWordStripped = stripPunctuation(wordPositions[i - 1].word.toLowerCase());
    
    if (currentWordStripped === prevWordStripped && currentWordStripped.length > 0) {
      results.push({
        word: wordPositions[i].word,
        context: getContext(text, wordPositions[i])
      });
    }
  }
  
  return results;
}

function displayResults(weasels, passives, duplicates) {
  const resultsDiv = document.getElementById('results');
  resultsDiv.innerHTML = '';
  
  // Weasel Words
  const weaselDiv = document.createElement('div');
  weaselDiv.className = 'category';
  weaselDiv.innerHTML = `
    <h2>Weasel Words</h2>
    ${weasels.length === 0 ? 'No weasel words found.' : 
      weasels.map(w => `
        <div class="warning">
          Found "<span class="highlight">${w.word}</span>" in: "${w.context}"
        </div>
      `).join('')}
  `;
  resultsDiv.appendChild(weaselDiv);
  
  // Passive Voice
  const passiveDiv = document.createElement('div');
  passiveDiv.className = 'category';
  passiveDiv.innerHTML = `
    <h2>Passive Voice</h2>
    ${passives.length === 0 ? 'No passive voice constructions found.' :
      passives.map(p => `
        <div class="warning">
          Found passive voice "<span class="highlight">${p.construction}</span>" in: "${p.context}"
        </div>
      `).join('')}
  `;
  resultsDiv.appendChild(passiveDiv);
  
  // Duplicate Words
  const duplicateDiv = document.createElement('div');
  duplicateDiv.className = 'category';
  duplicateDiv.innerHTML = `
    <h2>Duplicate Words</h2>
    ${duplicates.length === 0 ? 'No duplicate words found.' :
      duplicates.map(d => `
        <div class="warning">
          Found duplicate word "<span class="highlight">${d.word}</span>" in: "${d.context}"
        </div>
      `).join('')}
  `;
  resultsDiv.appendChild(duplicateDiv);
}

// Set up event listener
document.getElementById('input').addEventListener('input', (e) => {
  const text = e.target.value;
  const weasels = findWeaselWords(text);
  const passives = findPassiveVoice(text);
  const duplicates = findDuplicateWords(text);
  displayResults(weasels, passives, duplicates);
});
</script>
</body>
</html>

The key changes from the original version include:

  1. Added the stripPunctuation() function to handle punctuation in word comparisons
  2. Updated findDuplicateWords() to use the stripped versions of words for comparison
  3. Also applied stripPunctuation() to the weasel words and passive voice checks for consistency
  4. Added extra validation to ensure we don't match empty strings after stripping punctuation

The analyzer will now catch duplicate words even when they have different punctuation, while still showing the original text (with punctuation) in the results. Would you like to test it with any specific examples?

@simonw
Copy link
Author

simonw commented Dec 14, 2024

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment