Skip to content

Instantly share code, notes, and snippets.

@shimondoodkin
Last active September 9, 2021 04:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save shimondoodkin/7471075 to your computer and use it in GitHub Desktop.
Save shimondoodkin/7471075 to your computer and use it in GitHub Desktop.
search and replace in an Open XML word document.
/*
references:
WindowsBase
Open XML Format SDK 2.5 - from NuGet
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using DocumentFormat.OpenXml;
namespace SearchAndReplaceInText
{
public class SearchAndReplaceInText
{
/// <summary>
/// Finds the start index of every occurrence of <paramref name="substr"/> inside <paramref name="str"/>.
/// </summary>
/// <param name="str">Text to search. Null or empty yields an empty result.</param>
/// <param name="substr">Text to look for. Null or empty yields an empty result; whitespace-only values are searched like any other text.</param>
/// <param name="ignoreCase">When true the comparison is OrdinalIgnoreCase, otherwise Ordinal.</param>
/// <returns>
/// Ascending list of zero-based start indexes. Overlapping occurrences are all reported
/// (searching "aaa" for "aa" gives 0 and 1).
/// </returns>
public static List<int> AllIndexesOf(string str, string substr, bool ignoreCase = false) // modified of http://stackoverflow.com/a/14308894/466363
{
    var indexes = new List<int>();

    // Only null/empty input is rejected. Rejecting whitespace-only input would make it
    // impossible to locate spaces, tabs or newlines.
    if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(substr))
    {
        return indexes;
    }

    StringComparison comparison = ignoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;

    int index = str.IndexOf(substr, 0, comparison);
    while (index != -1)
    {
        indexes.Add(index);
        // Advance by one character (not by substr.Length) so overlapping hits are found too.
        // index + 1 never exceeds str.Length, which IndexOf accepts as a start position.
        index = str.IndexOf(substr, index + 1, comparison);
    }

    return indexes;
}
/// <summary>
/// Replaces every occurrence of <paramref name="from"/> with <paramref name="to"/> in the text found
/// below <paramref name="el"/>, including occurrences that are split across several text nodes.
/// </summary>
/// <remarks>
/// The texts of all descendant <c>Text</c> nodes are concatenated and searched as one string; the
/// hits are then mapped back to the individual nodes by character position. The replacement text
/// is written into the node that holds the first character of a hit; the remaining characters of
/// the hit are dropped from whichever nodes hold them. A replacement containing '\n' is expanded
/// into extra runs separated by break runs. Text nodes that end up empty are removed, together
/// with their parent when nothing but run properties is left in it.
/// No separator is inserted between neighbouring text nodes, so text from adjacent table cells or
/// paragraphs is searched as if it were contiguous.
/// </remarks>
/// <param name="el">Root element whose descendant text nodes are searched.</param>
/// <param name="from">Text to search for (ordinal, case sensitive). Null or empty makes the call a no-op.</param>
/// <param name="to">Replacement text; null is treated as an empty string.</param>
/// <exception cref="ArgumentNullException"><paramref name="el"/> is null.</exception>
public static void openxml_replace_text(OpenXmlElement el, string from, string to)//version 2
{
    if (el == null)
    {
        throw new ArgumentNullException("el");
    }
    if (string.IsNullOrEmpty(from))
    {
        return; // nothing to search for
    }
    if (to == null)
    {
        to = string.Empty;
    }

    // Algorithm by Shimon Doodkin: concatenate all texts, search the result as a single string,
    // then replace by position while walking the text nodes again.
    List<Text> textnodes = el.Descendants<Text>().ToList();
    StringBuilder innertext = new StringBuilder();
    foreach (Text eltext in textnodes)
    {
        innertext.Append(eltext.Text);
    }

    // AllIndexesOf also reports overlapping hits. Keep only hits that start at or after the end
    // of the previously kept hit, otherwise characters following a hit would be swallowed.
    List<int> foundat = new List<int>();
    int nextfree = 0;
    foreach (int hit in AllIndexesOf(innertext.ToString(), from))
    {
        if (hit >= nextfree)
        {
            foundat.Add(hit);
            nextfree = hit + from.Length;
        }
    }
    if (foundat.Count == 0)
    {
        return;
    }

    List<Text> tofixnewlines = new List<Text>();
    List<string> tofixnewlines_str = new List<string>();
    List<Text> todeleteempty = new List<Text>();

    int matchindex = 0; // first hit that has not been fully consumed yet
    int offset = 0;     // position of the current node's first character in the concatenated text
    foreach (Text eltext in textnodes)
    {
        if (matchindex >= foundat.Count)
        {
            break; // all hits handled, the remaining nodes stay untouched
        }

        string oldtext = eltext.Text ?? string.Empty;
        int currenttext_from = offset;
        offset += oldtext.Length; // now the exclusive end of this node

        if (offset <= foundat[matchindex])
        {
            continue; // this node ends before the next hit starts
        }

        // Hits are sorted and every node before this one was either processed or ended before the
        // pending hit, so this node necessarily holds at least part of the pending hit.
        StringBuilder newtext = new StringBuilder();
        for (int curchar = 0; curchar < oldtext.Length; curchar++)
        {
            int innertextpos = currenttext_from + curchar;
            if (matchindex < foundat.Count && innertextpos >= foundat[matchindex])
            {
                int matchstart = foundat[matchindex];
                if (innertextpos == matchstart)
                {
                    newtext.Append(to); // the replacement goes where the hit begins
                }
                if (innertextpos == matchstart + from.Length - 1)
                {
                    matchindex++; // last character of the hit consumed (also covers one-character hits)
                }
                // the characters of the hit itself are dropped
            }
            else
            {
                // Outside any hit, including everything after the last hit in this node.
                newtext.Append(oldtext[curchar]);
            }
        }

        string newtextstr = newtext.ToString();
        if (newtextstr.Length == 0)
        {
            eltext.Text = newtextstr;
            todeleteempty.Add(eltext);
        }
        else if (newtextstr.IndexOf('\n') == -1)
        {
            eltext.Text = newtextstr;
        }
        else
        {
            // Handled below: each further line gets a run of its own, preceded by a break run.
            tofixnewlines.Add(eltext);
            tofixnewlines_str.Add(newtextstr);
        }
    }

    // Fix newlines. A line break is stored as a run of its own, e.g.
    //   <w:r><w:t>AAA</w:t></w:r>
    //   <w:r><w:rPr><w:rtl /></w:rPr><w:br /></w:r>
    //   <w:r><w:t>BBB</w:t></w:r>
    for (int i = 0; i < tofixnewlines.Count; i++)
    {
        string[] lines = tofixnewlines_str[i].Replace("\r", "").Split('\n');
        Text last_el = tofixnewlines[i];
        last_el.Text = lines[0];
        for (int j = 1; j < lines.Length; j++)
        {
            OpenXmlElement run = last_el.Parent;

            // run carrying the next line: a copy of the current run with its first text replaced
            OpenXmlElement copy_el = run.CloneNode(true);
            Text next_el = copy_el.Descendants<Text>().First();
            next_el.Text = lines[j];

            // break run: a copy of the current run reduced to its run properties plus <w:br />
            OpenXmlElement newline_el = run.CloneNode(true);
            // ToList() snapshots the children first; removing while enumerating ChildElements
            // lazily cannot be relied on to visit every child.
            List<OpenXmlElement> nonproperties = newline_el.ChildElements.Where(e => e.LocalName != "rPr").ToList();
            foreach (OpenXmlElement item in nonproperties)
            {
                item.Remove();
            }
            newline_el.AppendChild(new DocumentFormat.OpenXml.Wordprocessing.Break());//<w:br />

            // Both are inserted directly after the current run, so they are added in reverse
            // order to end up as: run, break run, next-line run.
            run.InsertAfterSelf(copy_el);
            run.InsertAfterSelf(newline_el);
            last_el = next_el;
        }
    }

    // Remove text nodes that became empty. The parent is removed only when nothing but run
    // properties is left in it, so runs that also hold breaks, tabs or drawings are kept.
    foreach (Text eltext in todeleteempty)
    {
        OpenXmlElement run = eltext.Parent;
        if (run == null)
        {
            continue;
        }
        eltext.Remove();
        bool hasothercontent = run.ChildElements.Any(e => e.LocalName != "rPr");
        if (!hasothercontent && run.Parent != null)
        {
            run.Remove();
        }
    }
}
}
}
@markh1967
Copy link

Thanks for providing this code. I've used it in a project of mine to mailmerge word documents but I've found a bug that sometimes overwrites all of a text node when it should only overwrite part of one.

For instance, a text node containing '{Promotion} - {Contract}' replacing {Promotion} with 'A test promotion' will overwrite the entire node rather than just the start of it.

The fix is to add between lines 97 and 98:

                                innertextpos += (to.Length - 1);
                                curchar += from.Length;

@marciocrmendes
Copy link

I'm getting an infinite loop in AllIndexesOf. To fix it, I simply used the code from https://stackoverflow.com/a/2641383/6000539

@r2d2tm
Copy link

r2d2tm commented Mar 19, 2020

this code cannot fully work...

That is not a possible condition
|| currenttext_to <= foundat.First() && foundatend.First() <= currenttext_to // found is inside block
must be replaced with
|| currenttext_from <= foundat.First() && foundatend.First() <= currenttext_to // found is inside block

and with the new condition the code does not work properly....
example :
<w:t>#AGENCE# - #AgenceCP# #AgenceVille#</w:t>
became in ms word : AGENCE PAS DE CALAIS- 62000ceVille#
the last tag is truncated...

@ramvirsingh660
Copy link

I'm using Open XML and I need to change the header tag value of a Word file template. How can I do that? Please let me know.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment