human (Dec 14, 2024, 09:55 AM)
I don't think it is working properly. The second(?) pattern match is bad. For example, for
It is quite difficult to find untainted samples. We used various methods to isolate four samples.
it gives
Found "quite" in: "is quite difficult to find untainted samples."
Found "various" in: ""
Looking at the code, the issue is that both findWeaselWords
and getContext
construct a list of words but do it differently:
const words = text.toLowerCase().split(/\b/)
vs
const words = text.split(/\s+/)
paste.txt
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Writing Style Analyzer</title>
<style>
* {
box-sizing: border-box;
}
body {
font-family: Helvetica, Arial, sans-serif;
line-height: 1.6;
margin: 0;
padding: 20px;
background: #f5f5f5;
}
.container {
max-width: 800px;
margin: 0 auto;
background: white;
padding: 20px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
h1 {
margin-top: 0;
color: #333;
}
textarea {
width: 100%;
height: 200px;
padding: 12px;
border: 1px solid #ddd;
border-radius: 4px;
margin-bottom: 20px;
font-size: 16px;
font-family: inherit;
}
.results {
margin-top: 20px;
}
.category {
margin-bottom: 20px;
padding: 15px;
background: #f8f9fa;
border-radius: 4px;
}
.category h2 {
margin-top: 0;
color: #444;
font-size: 1.2em;
}
.highlight {
background: #ffd700;
padding: 2px 4px;
border-radius: 2px;
}
.warning {
color: #856404;
background-color: #fff3cd;
border: 1px solid #ffeeba;
padding: 10px;
margin-bottom: 10px;
border-radius: 4px;
}
</style>
</head>
<body>
<div class="container">
<h1>Writing Style Analyzer</h1>
<p>Adapted from <a href="https://matt.might.net/articles/shell-scripts-for-passive-voice-weasel-words-duplicates/">these shell scripts</a> published by Matt Might.</p>
<p>Paste your text below to check for weasel words, passive voice, and duplicate words:</p>
<textarea id="input" placeholder="Enter your text here..."></textarea>
<div id="results" class="results"></div>
</div>
<script type="module">
// Weasel words from the bash script
const weaselWords = [
'many', 'various', 'very', 'fairly', 'several', 'extremely',
'exceedingly', 'quite', 'remarkably', 'few', 'surprisingly',
'mostly', 'largely', 'huge', 'tiny', 'excellent', 'interestingly',
'significantly', 'substantially', 'clearly', 'vast', 'relatively',
'completely'
]
// Common irregular verbs for passive voice detection
const irregularVerbs = [
'awoken', 'been', 'born', 'beat', 'become', 'begun', 'bent',
'bound', 'bitten', 'bled', 'blown', 'broken', 'brought',
'built', 'burnt', 'bought', 'caught', 'chosen', 'come',
'dealt', 'done', 'drawn', 'driven', 'eaten', 'fallen',
'fought', 'found', 'flown', 'forgotten', 'given', 'gone',
'grown', 'hung', 'heard', 'hidden', 'held', 'kept', 'known',
'laid', 'led', 'left', 'lost', 'made', 'meant', 'met', 'paid',
'put', 'read', 'run', 'said', 'seen', 'sold', 'sent', 'set',
'shown', 'shut', 'sung', 'sat', 'slept', 'spoken', 'spent',
'stood', 'taken', 'taught', 'told', 'thought', 'thrown',
'understood', 'worn', 'won', 'written'
]
function findWeaselWords(text) {
const results = []
const words = text.toLowerCase().split(/\b/)
words.forEach((word, index) => {
if (weaselWords.includes(word.trim())) {
results.push({
word: word.trim(),
index: index,
context: getContext(text, index)
})
}
})
return results
}
function findPassiveVoice(text) {
const results = []
const beVerbs = ['am', 'is', 'are', 'was', 'were', 'be', 'been', 'being']
const words = text.toLowerCase().split(/\s+/)
words.forEach((word, index) => {
if (beVerbs.includes(word)) {
const nextWord = words[index + 1]
if (nextWord && (
nextWord.endsWith('ed') ||
irregularVerbs.includes(nextWord)
)) {
results.push({
construction: `${word} ${nextWord}`,
context: getContext(text, index)
})
}
}
})
return results
}
function findDuplicateWords(text) {
const results = []
const words = text.toLowerCase().split(/\s+/)
words.forEach((word, index) => {
if (index > 0 && word === words[index - 1]) {
results.push({
word: word,
context: getContext(text, index)
})
}
})
return results
}
function getContext(text, index) {
const words = text.split(/\s+/)
const start = Math.max(0, index - 3)
const end = Math.min(words.length, index + 4)
return words.slice(start, end).join(' ')
}
function displayResults(weasels, passives, duplicates) {
const resultsDiv = document.getElementById('results')
resultsDiv.innerHTML = ''
// Weasel Words
const weaselDiv = document.createElement('div')
weaselDiv.className = 'category'
weaselDiv.innerHTML = `
<h2>Weasel Words</h2>
${weasels.length === 0 ? 'No weasel words found.' :
weasels.map(w => `
<div class="warning">
Found "<span class="highlight">${w.word}</span>" in: "${w.context}"
</div>
`).join('')}
`
resultsDiv.appendChild(weaselDiv)
// Passive Voice
const passiveDiv = document.createElement('div')
passiveDiv.className = 'category'
passiveDiv.innerHTML = `
<h2>Passive Voice</h2>
${passives.length === 0 ? 'No passive voice constructions found.' :
passives.map(p => `
<div class="warning">
Found passive voice "<span class="highlight">${p.construction}</span>" in: "${p.context}"
</div>
`).join('')}
`
resultsDiv.appendChild(passiveDiv)
// Duplicate Words
const duplicateDiv = document.createElement('div')
duplicateDiv.className = 'category'
duplicateDiv.innerHTML = `
<h2>Duplicate Words</h2>
${duplicates.length === 0 ? 'No duplicate words found.' :
duplicates.map(d => `
<div class="warning">
Found duplicate word "<span class="highlight">${d.word}</span>" in: "${d.context}"
</div>
`).join('')}
`
resultsDiv.appendChild(duplicateDiv)
}
// Set up event listener
document.getElementById('input').addEventListener('input', (e) => {
const text = e.target.value
const weasels = findWeaselWords(text)
const passives = findPassiveVoice(text)
const duplicates = findDuplicateWords(text)
displayResults(weasels, passives, duplicates)
})
</script>
</body>
</html>
assistant (Dec 14, 2024, 09:56 AM)
You're absolutely right about the inconsistency in word splitting. The issue arises because findWeaselWords
uses word boundaries (\b
) while getContext
uses whitespace (\s+
), leading to mismatched indices and incorrect context extraction.
Let's modify the code to use consistent word splitting. I'll create an updated version:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Writing Style Analyzer</title>
<style>
* {
box-sizing: border-box;
}
body {
font-family: Helvetica, Arial, sans-serif;
line-height: 1.6;
margin: 0;
padding: 20px;
background: #f5f5f5;
}
.container {
max-width: 800px;
margin: 0 auto;
background: white;
padding: 20px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
h1 {
margin-top: 0;
color: #333;
}
textarea {
width: 100%;
height: 200px;
padding: 12px;
border: 1px solid #ddd;
border-radius: 4px;
margin-bottom: 20px;
font-size: 16px;
font-family: inherit;
}
.results {
margin-top: 20px;
}
.category {
margin-bottom: 20px;
padding: 15px;
background: #f8f9fa;
border-radius: 4px;
}
.category h2 {
margin-top: 0;
color: #444;
font-size: 1.2em;
}
.highlight {
background: #ffd700;
padding: 2px 4px;
border-radius: 2px;
}
.warning {
color: #856404;
background-color: #fff3cd;
border: 1px solid #ffeeba;
padding: 10px;
margin-bottom: 10px;
border-radius: 4px;
}
</style>
</head>
<body>
<div class="container">
<h1>Writing Style Analyzer</h1>
<p>Adapted from <a href="https://matt.might.net/articles/shell-scripts-for-passive-voice-weasel-words-duplicates/">these shell scripts</a> published by Matt Might.</p>
<p>Paste your text below to check for weasel words, passive voice, and duplicate words:</p>
<textarea id="input" placeholder="Enter your text here..."></textarea>
<div id="results" class="results"></div>
</div>
<script type="module">
// Weasel words from the bash script
const weaselWords = [
'many', 'various', 'very', 'fairly', 'several', 'extremely',
'exceedingly', 'quite', 'remarkably', 'few', 'surprisingly',
'mostly', 'largely', 'huge', 'tiny', 'excellent', 'interestingly',
'significantly', 'substantially', 'clearly', 'vast', 'relatively',
'completely'
]
// Common irregular verbs for passive voice detection
const irregularVerbs = [
'awoken', 'been', 'born', 'beat', 'become', 'begun', 'bent',
'bound', 'bitten', 'bled', 'blown', 'broken', 'brought',
'built', 'burnt', 'bought', 'caught', 'chosen', 'come',
'dealt', 'done', 'drawn', 'driven', 'eaten', 'fallen',
'fought', 'found', 'flown', 'forgotten', 'given', 'gone',
'grown', 'hung', 'heard', 'hidden', 'held', 'kept', 'known',
'laid', 'led', 'left', 'lost', 'made', 'meant', 'met', 'paid',
'put', 'read', 'run', 'said', 'seen', 'sold', 'sent', 'set',
'shown', 'shut', 'sung', 'sat', 'slept', 'spoken', 'spent',
'stood', 'taken', 'taught', 'told', 'thought', 'thrown',
'understood', 'worn', 'won', 'written'
]
// Helper function to get word positions with their original text
function getWordPositions(text) {
const words = []
const regex = /\S+/g
let match
while ((match = regex.exec(text)) !== null) {
words.push({
word: match[0],
index: match.index,
length: match[0].length
})
}
return words
}
function getContext(text, wordPosition, windowSize = 5) {
const start = Math.max(0, wordPosition.index)
const end = Math.min(text.length, wordPosition.index + wordPosition.length + 100)
const context = text.slice(start, end)
// Trim the context to the nearest word boundaries
const trimmedContext = context
.replace(/^\S*\s*/, '') // Trim partial word at start
.replace(/\s*\S*$/, '') // Trim partial word at end
return trimmedContext
}
function findWeaselWords(text) {
const results = []
const wordPositions = getWordPositions(text)
wordPositions.forEach(pos => {
if (weaselWords.includes(pos.word.toLowerCase())) {
results.push({
word: pos.word,
context: getContext(text, pos)
})
}
})
return results
}
function findPassiveVoice(text) {
const results = []
const beVerbs = ['am', 'is', 'are', 'was', 'were', 'be', 'been', 'being']
const wordPositions = getWordPositions(text)
for (let i = 0; i < wordPositions.length - 1; i++) {
const currentWord = wordPositions[i].word.toLowerCase()
const nextWord = wordPositions[i + 1].word.toLowerCase()
if (beVerbs.includes(currentWord)) {
if (nextWord.endsWith('ed') || irregularVerbs.includes(nextWord)) {
results.push({
construction: `${currentWord} ${nextWord}`,
context: getContext(text, wordPositions[i])
})
}
}
}
return results
}
function findDuplicateWords(text) {
const results = []
const wordPositions = getWordPositions(text)
for (let i = 1; i < wordPositions.length; i++) {
if (wordPositions[i].word.toLowerCase() === wordPositions[i - 1].word.toLowerCase()) {
results.push({
word: wordPositions[i].word,
context: getContext(text, wordPositions[i])
})
}
}
return results
}
function displayResults(weasels, passives, duplicates) {
const resultsDiv = document.getElementById('results')
resultsDiv.innerHTML = ''
// Weasel Words
const weaselDiv = document.createElement('div')
weaselDiv.className = 'category'
weaselDiv.innerHTML = `
<h2>Weasel Words</h2>
${weasels.length === 0 ? 'No weasel words found.' :
weasels.map(w => `
<div class="warning">
Found "<span class="highlight">${w.word}</span>" in: "${w.context}"
</div>
`).join('')}
`
resultsDiv.appendChild(weaselDiv)
// Passive Voice
const passiveDiv = document.createElement('div')
passiveDiv.className = 'category'
passiveDiv.innerHTML = `
<h2>Passive Voice</h2>
${passives.length === 0 ? 'No passive voice constructions found.' :
passives.map(p => `
<div class="warning">
Found passive voice "<span class="highlight">${p.construction}</span>" in: "${p.context}"
</div>
`).join('')}
`
resultsDiv.appendChild(passiveDiv)
// Duplicate Words
const duplicateDiv = document.createElement('div')
duplicateDiv.className = 'category'
duplicateDiv.innerHTML = `
<h2>Duplicate Words</h2>
${duplicates.length === 0 ? 'No duplicate words found.' :
duplicates.map(d => `
<div class="warning">
Found duplicate word "<span class="highlight">${d.word}</span>" in: "${d.context}"
</div>
`).join('')}
`
resultsDiv.appendChild(duplicateDiv)
}
// Set up event listener
document.getElementById('input').addEventListener('input', (e) => {
const text = e.target.value
const weasels = findWeaselWords(text)
const passives = findPassiveVoice(text)
const duplicates = findDuplicateWords(text)
displayResults(weasels, passives, duplicates)
})
</script>
</body>
</html>
The key changes I made to fix the issues:
- Added a new
getWordPositions
function that consistently finds words and their positions using\S+
regex (matches non-whitespace characters) - Modified
getContext
to work with character positions instead of word indices - Updated all three finder functions (weasel words, passive voice, duplicates) to use the new word position system
- Improved context extraction to show more natural word boundaries
The main benefit is that we now have a single, consistent way of identifying words and their positions in the text, which ensures accurate context display for all types of matches.
Would you like me to explain any of the changes in more detail?
human (Dec 14, 2024, 09:57 AM)
When displaying the examples include the previous three words prior to the match
assistant (Dec 14, 2024, 09:57 AM)
I'll modify the getContext
function to ensure it always includes the previous three words. Here's the update:
Find this:
function getContext(text, wordPosition, windowSize = 5) {
const start = Math.max(0, wordPosition.index)
const end = Math.min(text.length, wordPosition.index + wordPosition.length + 100)
const context = text.slice(start, end)
// Trim the context to the nearest word boundaries
const trimmedContext = context
.replace(/^\S*\s*/, '') // Trim partial word at start
.replace(/\s*\S*$/, '') // Trim partial word at end
return trimmedContext
Replace with this:
function getContext(text, wordPosition, prevWords = 3) {
// Find start position for previous words
let start = wordPosition.index
let wordCount = 0
for (let i = start - 1; i >= 0 && wordCount < prevWords; i--) {
if (i === 0 || text[i - 1] === ' ') {
wordCount++
if (wordCount === prevWords) {
start = i
break
}
}
}
const end = Math.min(text.length, wordPosition.index + wordPosition.length + 50)
const context = text.slice(start, end).trim()
return context
The key changes in the new getContext
function:
- Takes a
prevWords
parameter (defaulting to 3) to specify how many previous words to include - Walks backwards through the text to find the start of the desired number of previous words
- Uses a simpler trimming approach that just ensures the context starts and ends cleanly
- Reduced the forward context length from 100 to 50 characters since we're now more focused on the preceding words
The function will now consistently show the three words before each match, making it easier to understand the context. For your example text:
It is quite difficult to find untainted samples. We used various methods to isolate four samples.
It should now show contexts like:
It is quite difficult...
We used various methods...
Want to try it out with some text to verify the changes?
Deployed to https://tools.simonwillison.net/writing-style