Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Search and replace in an Open XML Word document.
/*
references:
WindowsBase
Open XML Format SDK 2.5 - from NuGet
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using DocumentFormat.OpenXml;
namespace SearchAndReplaceInText
{
public class SearchAndReplaceInText
{
/// <summary>
/// Returns the zero-based start index of every occurrence of <paramref name="substr"/>
/// inside <paramref name="str"/>, in ascending order.
/// </summary>
/// <remarks>
/// Occurrences may overlap: the search resumes one character after each hit, so
/// searching "aaa" for "aa" yields 0 and 1. The comparison is ordinal.
/// </remarks>
/// <param name="str">The text to search. Null or empty yields an empty list.</param>
/// <param name="substr">The text to look for. Null or empty yields an empty list.</param>
/// <param name="ignoreCase">True for an ordinal, case-insensitive comparison.</param>
/// <returns>A new list of match positions; never null.</returns>
public static List<int> AllIndexesOf(string str, string substr, bool ignoreCase = false) // modified of http://stackoverflow.com/a/14308894/466363
{
    var indexes = new List<int>();
    // Reject only null/empty input. Whitespace is a legitimate thing to search for
    // (for example a single space), so IsNullOrWhiteSpace would be too strict here.
    if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(substr))
    {
        return indexes;
    }
    StringComparison comparison = ignoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;
    int index = str.IndexOf(substr, 0, comparison);
    while (index != -1)
    {
        indexes.Add(index);
        // index + 1 never exceeds str.Length because substr has at least one character.
        index = str.IndexOf(substr, index + 1, comparison);
    }
    return indexes;
}
/// <summary>
/// Replaces every occurrence of <paramref name="from"/> with <paramref name="to"/> in the
/// text below <paramref name="el"/>, including occurrences that are split across several
/// text elements (w:t).
/// </summary>
/// <remarks>
/// Algorithm by Shimon Doodkin: the texts of all descendant <see cref="Text"/> elements are
/// concatenated and searched as one string, then each text element is rewritten position by
/// position. The replacement is written into the text element in which a match starts; the
/// remaining matched characters are dropped from the elements that follow.
/// The search is ordinal and case-sensitive (it uses <c>AllIndexesOf</c> with its defaults).
/// Text elements that do not intersect any match are left untouched.
/// A replacement containing '\n' is split into separate runs joined by w:br elements.
/// A text element that becomes empty is removed, together with its parent when nothing but
/// run properties (w:rPr) remain in it.
/// </remarks>
/// <param name="el">Root element whose descendant text is searched.</param>
/// <param name="from">Text to search for. Null or empty makes the call a no-op.</param>
/// <param name="to">Replacement text. Null is treated as an empty string.</param>
/// <exception cref="ArgumentNullException"><paramref name="el"/> is null.</exception>
public static void openxml_replace_text(OpenXmlElement el, string from, string to)//version 2
{
    if (el == null)
    {
        throw new ArgumentNullException("el");
    }
    if (string.IsNullOrEmpty(from))
    {
        return; // nothing to search for
    }
    if (to == null)
    {
        to = string.Empty;
    }

    // Snapshot the text elements: the tree is modified below, so it must not be
    // enumerated lazily while that happens.
    List<Text> texts = el.Descendants<Text>().ToList();
    StringBuilder innertext = new StringBuilder();
    foreach (Text eltext in texts)
    {
        innertext.Append(eltext.Text);
    }

    // AllIndexesOf also reports overlapping hits. Only hits that start after the previous
    // accepted hit has ended can be replaced, so the others are discarded here.
    List<int> foundat = new List<int>();
    int lastend = -1;
    foreach (int start in AllIndexesOf(innertext.ToString(), from))
    {
        if (start > lastend)
        {
            foundat.Add(start);
            lastend = start + from.Length - 1;
        }
    }
    if (foundat.Count == 0)
    {
        return;
    }

    List<Text> tofixnewlines = new List<Text>();
    List<string> tofixnewlines_str = new List<string>();
    List<Text> todeleteempty = new List<Text>();

    int match = 0;        // index into foundat of the match currently being consumed
    int innertextpos = 0; // position of the current character in the concatenated text
    foreach (Text eltext in texts)
    {
        if (match >= foundat.Count)
        {
            break; // every match has been replaced
        }
        string oldtext = eltext.Text ?? string.Empty;
        if (innertextpos + oldtext.Length - 1 < foundat[match])
        {
            // This element ends before the next match starts: leave it untouched.
            innertextpos += oldtext.Length;
            continue;
        }

        StringBuilder newtext = new StringBuilder();
        bool changed = false;
        for (int curchar = 0; curchar < oldtext.Length; curchar++, innertextpos++)
        {
            // Matches are consumed in order, so the current position is never past the end
            // of foundat[match]; being at or after its start therefore means "inside it".
            bool inmatch = match < foundat.Count && innertextpos >= foundat[match];
            if (!inmatch)
            {
                newtext.Append(oldtext[curchar]);
                continue;
            }
            changed = true;
            if (innertextpos == foundat[match])
            {
                newtext.Append(to); // the replacement goes where the match starts
            }
            if (innertextpos == foundat[match] + from.Length - 1)
            {
                match++; // last character of this match (also handles one-character matches)
            }
        }
        if (!changed)
        {
            continue;
        }

        string newtextstr = newtext.ToString();
        if (newtextstr.IndexOf('\n') != -1)
        {
            // Splitting inserts new runs; do it after the scan is complete.
            tofixnewlines.Add(eltext);
            tofixnewlines_str.Add(newtextstr);
            continue;
        }
        eltext.Text = newtextstr;
        if (newtextstr.Length == 0)
        {
            todeleteempty.Add(eltext);
        }
    }

    for (int i = 0; i < tofixnewlines.Count; i++)
    {
        openxml_split_at_newlines(tofixnewlines[i], tofixnewlines_str[i]);
    }
    foreach (Text eltext in todeleteempty)
    {
        openxml_remove_empty_text(eltext);
    }
}

// Writes a multi-line value: the first line stays in the given text element, and every
// further line is added after it as a break run (w:br) followed by a copy of the original run.
private static void openxml_split_at_newlines(Text first, string value)
{
    string[] lines = value.Replace("\r", "").Split('\n');
    Text last_el = first;
    last_el.Text = lines[0];
    for (int j = 1; j < lines.Length; j++)
    {
        OpenXmlElement run = last_el.Parent;

        // Copy of the run that will carry the next line.
        OpenXmlElement copy_el = run.CloneNode(true);
        Text next_el = copy_el.Descendants<Text>().First();
        next_el.Text = lines[j];

        // Break run, e.g. <w:r><w:rPr>...</w:rPr><w:br /></w:r>: a copy of the run reduced
        // to its rPr. ToList() is required because children are removed while iterating.
        OpenXmlElement newline_el = run.CloneNode(true);
        foreach (OpenXmlElement item in newline_el.ChildElements.Where(e => e.LocalName != "rPr").ToList())
        {
            item.Remove();
        }
        newline_el.AppendChild(new DocumentFormat.OpenXml.Wordprocessing.Break());//<w:br />

        // Both are inserted directly after the run, so they are added in reverse order.
        run.InsertAfterSelf(copy_el);
        run.InsertAfterSelf(newline_el);
        last_el = next_el;
    }
}

// Removes a text element that became empty, and its parent too when the parent is left
// with nothing but run properties (w:rPr), so other content of the run is not lost.
private static void openxml_remove_empty_text(Text eltext)
{
    OpenXmlElement run = eltext.Parent;
    if (run == null)
    {
        return;
    }
    eltext.Remove();
    if (run.Parent != null && run.ChildElements.All(c => c.LocalName == "rPr"))
    {
        run.Remove();
    }
}
}
}
@markh1967

This comment has been minimized.

Copy link

commented Jul 12, 2016

Thanks for providing this code. I've used it in a project of mine to mailmerge word documents but I've found a bug that sometimes overwrites all of a text node when it should only overwrite part of one.

For instance, a text node containing '{Promotion} - {Contract}' replacing {Promotion} with 'A test promotion' will overwrite the entire node rather than just the start of it.

The fix is to add between lines 97 and 98:

                                innertextpos += (to.Length - 1);
                                curchar += from.Length;
@marciocristian

This comment has been minimized.

Copy link

commented Nov 22, 2017

I'm getting an infinite loop in AllIndexesOf. To fix it, I simply replaced it with the code from https://stackoverflow.com/a/2641383/6000539

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.