Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mmizutani/e51ebbdce0d5642a20298a1ce7a474a5 to your computer and use it in GitHub Desktop.
Save mmizutani/e51ebbdce0d5642a20298a1ce7a474a5 to your computer and use it in GitHub Desktop.
Search and replace text in an Open XML Word document.
/*
references:
WindowsBase
Open XML Format SDK 2.5 - from NuGet
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using DocumentFormat.OpenXml;
namespace SearchAndReplaceInText
{
public class SearchAndReplaceInText
{
/// <summary>
/// Returns the zero-based start index of every occurrence of <paramref name="substr"/> in
/// <paramref name="str"/>, in ascending order. Overlapping occurrences are all reported
/// (searching "aaa" for "aa" yields 0 and 1).
/// </summary>
/// <param name="str">The string to search. Null or empty yields an empty list.</param>
/// <param name="substr">The string to look for. Null or empty yields an empty list.</param>
/// <param name="ignoreCase">
/// When true the comparison is <see cref="StringComparison.OrdinalIgnoreCase"/>,
/// otherwise <see cref="StringComparison.Ordinal"/>.
/// </param>
/// <returns>A new list of match positions; never null.</returns>
public static List<int> AllIndexesOf(string str, string substr, bool ignoreCase = false) // modified of http://stackoverflow.com/a/14308894/466363
{
    var indexes = new List<int>();
    // Only null/empty input has to be rejected (an empty needle would match at every position
    // and eventually push the start index past the end of the string). Whitespace-only
    // strings are legitimate search input, so IsNullOrWhiteSpace must not be used here.
    if (string.IsNullOrEmpty(str) || string.IsNullOrEmpty(substr))
    {
        return indexes;
    }

    StringComparison comparison = ignoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;
    int index = 0;
    while ((index = str.IndexOf(substr, index, comparison)) != -1)
    {
        indexes.Add(index);
        index++; // resume one character later so overlapping occurrences are found as well
    }
    return indexes;
}
/// <summary>
/// Replaces every occurrence of <paramref name="from"/> with <paramref name="to"/> in the text
/// below <paramref name="el"/>, including occurrences that are split across several text
/// elements (Word frequently breaks one visible word into multiple runs).
/// </summary>
/// <remarks>
/// Algorithm (by Shimon Doodkin): concatenate the text of all descendant Text elements, search
/// that single string, then walk the Text elements again and rewrite each one by absolute
/// character position. The replacement is written where a match starts; the remaining matched
/// characters are dropped from whichever Text elements contain them.
/// Matching is ordinal and case-sensitive, and matches do not overlap (same as string.Replace).
/// No separator is inserted between Text elements while concatenating, so text of adjacent
/// elements (for example neighbouring table cells) is searched as if it were contiguous.
/// Line breaks in <paramref name="to"/> are turned into break runs.
/// </remarks>
/// <param name="el">Root element whose descendant Text elements are searched and rewritten.</param>
/// <param name="from">Text to search for. Null or empty makes the call a no-op.</param>
/// <param name="to">Replacement text. Null is treated as an empty string.</param>
/// <exception cref="ArgumentNullException"><paramref name="el"/> is null.</exception>
public static void openxml_replace_text(OpenXmlElement el, string from, string to)//version 2
{
    if (el == null)
    {
        throw new ArgumentNullException("el");
    }
    if (string.IsNullOrEmpty(from))
    {
        return; // nothing to search for
    }
    if (to == null)
    {
        to = string.Empty;
    }

    // Snapshot the Text elements: the element tree is modified further down.
    List<Text> texts = el.Descendants<Text>().ToList();

    StringBuilder innertext = new StringBuilder();
    foreach (Text eltext in texts)
    {
        innertext.Append(eltext.Text);
    }
    string innertextstr = innertext.ToString();

    // Start positions of all non-overlapping matches in the concatenated text, ascending.
    List<int> foundat = new List<int>();
    for (int at = innertextstr.IndexOf(from, StringComparison.Ordinal);
         at != -1;
         at = innertextstr.IndexOf(from, at + from.Length, StringComparison.Ordinal))
    {
        foundat.Add(at);
    }
    if (foundat.Count == 0)
    {
        return;
    }

    List<Text> tofixnewlines = new List<Text>();
    List<string> tofixnewlines_str = new List<string>();
    List<Text> todeleteempty = new List<Text>();

    int matchidx = 0;     // index into foundat of the first match that is not fully consumed yet
    int innertextpos = 0; // absolute position (in innertextstr) of the next character to visit

    foreach (Text eltext in texts)
    {
        if (matchidx >= foundat.Count)
        {
            break; // every match has been handled; the remaining elements stay untouched
        }

        string oldtext = eltext.Text ?? string.Empty;
        int blockend = innertextpos + oldtext.Length; // exclusive end of this element's range

        // Invariant: the current match always ends after innertextpos, so this element
        // overlaps it exactly when the match starts before the element ends.
        if (oldtext.Length == 0 || foundat[matchidx] >= blockend)
        {
            innertextpos = blockend;
            continue;
        }

        StringBuilder newtext = new StringBuilder();
        for (int curchar = 0; curchar < oldtext.Length; curchar++, innertextpos++)
        {
            if (matchidx < foundat.Count && innertextpos >= foundat[matchidx])
            {
                // Inside the current match: emit the replacement once, at its first
                // character, and drop the matched characters themselves.
                if (innertextpos == foundat[matchidx])
                {
                    newtext.Append(to);
                }
                if (innertextpos == foundat[matchidx] + from.Length - 1)
                {
                    matchidx++; // last character of this match (also true for 1-char matches)
                }
            }
            else
            {
                // Outside any match - this includes text after the final match.
                newtext.Append(oldtext[curchar]);
            }
        }

        string newtextstr = newtext.ToString();
        if (newtextstr.Length == 0)
        {
            eltext.Text = newtextstr;
            todeleteempty.Add(eltext);
        }
        else if (newtextstr.IndexOf('\n') == -1)
        {
            SetTextPreservingSpace(eltext, newtextstr);
        }
        else
        {
            // A Text element cannot carry a line break; split it into runs further down.
            tofixnewlines.Add(eltext);
            tofixnewlines_str.Add(newtextstr);
        }
    }

    // Fix newlines. In WordprocessingML a line break is its own run, for example:
    //   <w:r><w:t>AAA</w:t></w:r>
    //   <w:r><w:rPr><w:rFonts w:hint="cs" /><w:rtl /></w:rPr><w:br /></w:r>
    //   <w:r><w:t>BBB</w:t></w:r>
    // so every additional line becomes a break run followed by a text run, both carrying
    // a copy of the run properties (rPr) of the run that held the original text.
    for (int i = 0; i < tofixnewlines.Count; i++)
    {
        string[] lines = tofixnewlines_str[i].Replace("\r", "").Split('\n');
        Text last_el = tofixnewlines[i];
        SetTextPreservingSpace(last_el, lines[0]);

        for (int j = 1; j < lines.Length; j++)
        {
            OpenXmlElement run = last_el.Parent;

            // run holding the next line of text
            Text next_el = (Text)last_el.CloneNode(true);
            SetTextPreservingSpace(next_el, lines[j]);
            OpenXmlElement copy_el = CloneKeepingRunProperties(run);
            copy_el.AppendChild(next_el);

            // run holding the line break: <w:r><w:rPr>...</w:rPr><w:br /></w:r>
            OpenXmlElement newline_el = CloneKeepingRunProperties(run);
            newline_el.AppendChild(new DocumentFormat.OpenXml.Wordprocessing.Break());

            // Both are inserted directly after 'run', so they are added in reverse order.
            run.InsertAfterSelf(copy_el);
            run.InsertAfterSelf(newline_el);
            last_el = next_el;
        }
    }

    // Remove Text elements that became empty, and their run when nothing but run
    // properties is left in it. Runs that still hold other content are kept.
    foreach (Text eltext in todeleteempty)
    {
        OpenXmlElement run = eltext.Parent;
        if (run == null)
        {
            continue;
        }
        eltext.Remove();
        if (run != el && run.Parent != null && run.ChildElements.All(c => c.LocalName == "rPr"))
        {
            run.Remove();
        }
    }
}

/// <summary>
/// Assigns <paramref name="value"/> to <paramref name="target"/> and marks the element with
/// xml:space="preserve" when the value starts or ends with whitespace, so that the whitespace
/// is not discarded when the document is read back.
/// </summary>
private static void SetTextPreservingSpace(Text target, string value)
{
    target.Text = value;
    if (value.Length > 0 && (char.IsWhiteSpace(value[0]) || char.IsWhiteSpace(value[value.Length - 1])))
    {
        target.Space = SpaceProcessingModeValues.Preserve;
    }
}

/// <summary>
/// Returns a detached copy of <paramref name="run"/> that keeps only its rPr (run properties)
/// children, ready to receive new content with the same formatting.
/// </summary>
private static OpenXmlElement CloneKeepingRunProperties(OpenXmlElement run)
{
    OpenXmlElement shell = run.CloneNode(true);
    // ToList() first: removing children while lazily enumerating ChildElements ends the
    // enumeration after the first removal.
    foreach (OpenXmlElement child in shell.ChildElements.Where(e => e.LocalName != "rPr").ToList())
    {
        child.Remove();
    }
    return shell;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment