harshildarji/docx_to_conll.py

## docx_to_conll.py
import json
import os
import string
import zipfile

from lxml import etree
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

import docx
from docx import Document

# PART 1 - Extract comments and commented text from DOCX, and put them into JSON format.

# Annotation labels mapping
annotation_labels = {
    "DC": "Data Controller",
    "DP": "Data Processor",
    "DPO": "Data Protection Officer",
    "R": "Recipient",
    "TP": "Third Party",
    "A": "Authority",
    "DS": "Data Subject",
    "DSO": "Data Source",
    "RP": "Required Purpose",
    "NRP": "Not-Required Purpose",
    "P": "Processing",
    "NPD": "Non-Personal Data",
    "PD": "Personal Data",
    "OM": "Organisational Measure",
    "TM": "Technical Measure",
    "LB": "Legal Basis",
    "CONS": "Consent",
    "CONT": "Contract",
    "LI": "Legitimate Interest",
    "ADM": "Automated Decision Making",
    "RET": "Retention",
    "SEU": "Scale EU",
    "SNEU": "Scale Non-EU",
    "RI": "Right",
    "DSR15": "Art. 15 Right of access by the data subject",
    "DSR16": "Art. 16 Right to rectification",
    "DSR17": "Art. 17 Right to erasure (‘right to be forgotten’)",
    "DSR18": "Art. 18 Right to restriction of processing",
    "DSR19": "Art. 19 Notification obligation regarding rectification or erasure of personal data or restriction of processing",
    "DSR20": "Art. 20 Right to data portability",
    "DSR21": "Art. 21 Right to object",
    "DSR22": "Art. 22 Automated individual decision-making, including profiling",
    "LC": "Lodge Complaint",
}

annotation_keys = list(annotation_labels.keys())
xmlns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
comments_dict = {}

# Get a list of all docx files in the directory
directory = "docx"
files = os.listdir(directory)
files_list = [file for file in files if file.endswith(".docx")]

for docx_file in tqdm(files_list, desc="Total"):
    cmts = {}

    name = docx_file.split(".")[0].strip()
    comments_dict[f"{name}"] = []

    # Open the docx file as a zip
    docx_zip = zipfile.ZipFile(f"docx/{docx_file}")
    comments_xml = docx_zip.read("word/comments.xml")
    document_xml = docx_zip.read("word/document.xml")
    docx_zip.close()

    # Parse comments.xml to extract the comments
    et = etree.XML(comments_xml)
    comments = et.xpath("//w:comment", namespaces=xmlns)
    for c in comments:
        comment = c.xpath("string(.)", namespaces=xmlns)
        comment_id = c.xpath("@w:id", namespaces=xmlns)[0]
        cmts[comment_id] = comment

    # Parse document.xml to find text with associated comments
    root = etree.fromstring(document_xml)
    for k, v in tqdm(cmts.items(), desc=f"{name}"):
        annotation = v

        if v in annotation_keys:
            annotations_full = annotation_labels[annotation]

            # Construct XPath expression to find text associated with the comment
            xpath_expr = f'//w:commentRangeStart[@w:id="{k}"]/following::w:t[following::w:commentRangeEnd[@w:id="{k}"] and not(preceding::w:commentRangeStart[@w:id="{int(k)+1}"])]'
            comment_range_start_elements = root.xpath(xpath_expr, namespaces=root.nsmap)

            text = ""

            # Concatenate text elements within the comment range
            for element in comment_range_start_elements:
                if text and element.text:
                    # Check conditions for appending a space before the text
                    if (
                        text[-1] not in string.whitespace
                        and element.text[0]
                        not in string.whitespace + string.punctuation
                    ):
                        text += " "
                    text += element.text.replace("\u00A0", "") if element.text else ""
                elif element.text:
                    text += element.text.replace("\u00A0", "") if element.text else ""

                text = text.replace("( ", "(").replace("[ ", "[")

            if text:
                comments_dict[name].append((int(k), text, annotation, annotations_full))

    sorted_data = sorted(comments_dict[name], key=lambda x: x[0])
    comments_dict[name] = sorted_data

json_file = "comments.json"
with open(json_file, "w") as file:
    json.dump(comments_dict, file, indent=4)


# The XPath expression selects <w:t> elements that appear after a specific <w:commentRangeStart> element with a matching w:id attribute value, and before the corresponding <w:commentRangeEnd> element, while ensuring that there are no intervening <w:commentRangeStart> elements with a specific w:id attribute value. The {k} and {int(k)+1} are likely to be placeholders for dynamic values that will be substituted with actual values during runtime.

# //w:commentRangeStart: Selects all <w:commentRangeStart> elements in the XML document, regardless of their location.

# [@w:id="{k}"]: Filters the selected <w:commentRangeStart> elements by the w:id attribute value equal to {k}. This attribute value is likely to be a variable or placeholder that is being dynamically replaced with an actual value when the XPath expression is used.

# /following::w:t: Selects all <w:t> elements that appear after the filtered <w:commentRangeStart> element(s).

# [following::w:commentRangeEnd[@w:id="{k}"]: Filters the selected <w:t> elements further by checking if there is a <w:commentRangeEnd> element with a matching w:id attribute value of {k} that appears after the current <w:t> element.

# and not(preceding::w:commentRangeStart[@w:id="{int(k)+1}"]): Additionally filters the selected <w:t> elements by checking if there is no <w:commentRangeStart> element with a w:id attribute value equal to {int(k)+1} that appears before the current <w:t> element.


# PART 2 - Convert DOCX into CoNLL format with IOB tagging, with text as tokens and comments as labels.

# Function to convert a docx file to text
def convert_docx_to_text(docx_file):
    doc = docx.Document(docx_file)
    xml_content = doc._element.xml

    namespaces = {
        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    }

    # Create an XML parser with namespace mappings
    parser = etree.XMLParser()
    for prefix, uri in namespaces.items():
        etree.register_namespace(prefix, uri)

    tree = etree.fromstring(xml_content.encode("utf-8"), parser=parser)

    # Extract text from <w:t> elements and join them, adding a space after each <w:p>
    text = ""
    for paragraph in tree.findall(".//w:p", namespaces=namespaces):
        text += "".join(
            [
                t.text.replace("\xa0", " ")
                for t in paragraph.findall(".//w:t", namespaces=namespaces)
                if t.text
            ]
        )
        text += " "

    return text.strip()


# Function to split a string from the first match of a delimiter
def split_from_first_match(string, delimiter):
    parts = string.split(delimiter, 1)
    if len(parts) > 1:
        return parts[0], delimiter, parts[1]
    else:
        return False


# Function to split a string with punctuation into tokens
def split_string_with_punctuation(string):
    tokenizer = RegexpTokenizer(r"\w+|\S+")
    tokens = tokenizer.tokenize(string)
    return tokens


with open("comments.json", "r") as file:
    data = json.load(file)

conll = open(f"comments.conll", "w")

for docx_file in tqdm(files_list, desc="Total"):
    name = docx_file.split(".")[0].strip()
    text = convert_docx_to_text(f"docx/{docx_file}")

    conll = open(f"comments.conll", "a+")

    for _, delimiter, label, _ in data[name]:
        split_text = split_from_first_match(text, delimiter)
        if split_text:
            out_text, tag_text, text = split_text[0], split_text[1], split_text[2]

            split_out_text = split_string_with_punctuation(out_text)
            split_tag_text = split_string_with_punctuation(tag_text)

            for txt in split_out_text:
                conll.write(f"{txt}\tO\n")

            conll.write(f"{split_tag_text[0]}\tB-{label}\n")
            for txt in split_tag_text[1:]:
                conll.write(f"{txt}\tI-{label}\n")
        else:
            continue

    conll.write("\n")
    conll.close()
	import json
	import os
	import string
	import zipfile

	from lxml import etree
	from nltk.tokenize import RegexpTokenizer
	from tqdm import tqdm

	import docx
	from docx import Document

	# PART 1 - Extract comments and commented text from DOCX, and put them into JSON format.

	# Annotation labels mapping
	annotation_labels = {
	"DC": "Data Controller",
	"DP": "Data Processor",
	"DPO": "Data Protection Officer",
	"R": "Recipient",
	"TP": "Third Party",
	"A": "Authority",
	"DS": "Data Subject",
	"DSO": "Data Source",
	"RP": "Required Purpose",
	"NRP": "Not-Required Purpose",
	"P": "Processing",
	"NPD": "Non-Personal Data",
	"PD": "Personal Data",
	"OM": "Organisational Measure",
	"TM": "Technical Measure",
	"LB": "Legal Basis",
	"CONS": "Consent",
	"CONT": "Contract",
	"LI": "Legitimate Interest",
	"ADM": "Automated Decision Making",
	"RET": "Retention",
	"SEU": "Scale EU",
	"SNEU": "Scale Non-EU",
	"RI": "Right",
	"DSR15": "Art. 15 Right of access by the data subject",
	"DSR16": "Art. 16 Right to rectification",
	"DSR17": "Art. 17 Right to erasure (‘right to be forgotten’)",
	"DSR18": "Art. 18 Right to restriction of processing",
	"DSR19": "Art. 19 Notification obligation regarding rectification or erasure of personal data or restriction of processing",
	"DSR20": "Art. 20 Right to data portability",
	"DSR21": "Art. 21 Right to object",
	"DSR22": "Art. 22 Automated individual decision-making, including profiling",
	"LC": "Lodge Complaint",
	}

	annotation_keys = list(annotation_labels.keys())
	xmlns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
	comments_dict = {}

	# Get a list of all docx files in the directory
	directory = "docx"
	files = os.listdir(directory)
	files_list = [file for file in files if file.endswith(".docx")]

	for docx_file in tqdm(files_list, desc="Total"):
	cmts = {}

	name = docx_file.split(".")[0].strip()
	comments_dict[f"{name}"] = []

	# Open the docx file as a zip
	docx_zip = zipfile.ZipFile(f"docx/{docx_file}")
	comments_xml = docx_zip.read("word/comments.xml")
	document_xml = docx_zip.read("word/document.xml")
	docx_zip.close()

	# Parse comments.xml to extract the comments
	et = etree.XML(comments_xml)
	comments = et.xpath("//w:comment", namespaces=xmlns)
	for c in comments:
	comment = c.xpath("string(.)", namespaces=xmlns)
	comment_id = c.xpath("@w:id", namespaces=xmlns)[0]
	cmts[comment_id] = comment

	# Parse document.xml to find text with associated comments
	root = etree.fromstring(document_xml)
	for k, v in tqdm(cmts.items(), desc=f"{name}"):
	annotation = v

	if v in annotation_keys:
	annotations_full = annotation_labels[annotation]

	# Construct XPath expression to find text associated with the comment
	xpath_expr = f'//w:commentRangeStart[@w:id="{k}"]/following::w:t[following::w:commentRangeEnd[@w:id="{k}"] and not(preceding::w:commentRangeStart[@w:id="{int(k)+1}"])]'
	comment_range_start_elements = root.xpath(xpath_expr, namespaces=root.nsmap)

	text = ""

	# Concatenate text elements within the comment range
	for element in comment_range_start_elements:
	if text and element.text:
	# Check conditions for appending a space before the text
	if (
	text[-1] not in string.whitespace
	and element.text[0]
	not in string.whitespace + string.punctuation
	):
	text += " "
	text += element.text.replace("\u00A0", "") if element.text else ""
	elif element.text:
	text += element.text.replace("\u00A0", "") if element.text else ""

	text = text.replace("( ", "(").replace("[ ", "[")

	if text:
	comments_dict[name].append((int(k), text, annotation, annotations_full))

	sorted_data = sorted(comments_dict[name], key=lambda x: x[0])
	comments_dict[name] = sorted_data

	json_file = "comments.json"
	with open(json_file, "w") as file:
	json.dump(comments_dict, file, indent=4)


	# The XPath expression selects <w:t> elements that appear after a specific <w:commentRangeStart> element with a matching w:id attribute value, and before the corresponding <w:commentRangeEnd> element, while ensuring that there are no intervening <w:commentRangeStart> elements with a specific w:id attribute value. The {k} and {int(k)+1} are likely to be placeholders for dynamic values that will be substituted with actual values during runtime.

	# //w:commentRangeStart: Selects all <w:commentRangeStart> elements in the XML document, regardless of their location.

	# [@w:id="{k}"]: Filters the selected <w:commentRangeStart> elements by the w:id attribute value equal to {k}. This attribute value is likely to be a variable or placeholder that is being dynamically replaced with an actual value when the XPath expression is used.

	# /following::w:t: Selects all <w:t> elements that appear after the filtered <w:commentRangeStart> element(s).

	# [following::w:commentRangeEnd[@w:id="{k}"]: Filters the selected <w:t> elements further by checking if there is a <w:commentRangeEnd> element with a matching w:id attribute value of {k} that appears after the current <w:t> element.

	# and not(preceding::w:commentRangeStart[@w:id="{int(k)+1}"]): Additionally filters the selected <w:t> elements by checking if there is no <w:commentRangeStart> element with a w:id attribute value equal to {int(k)+1} that appears before the current <w:t> element.


	# PART 2 - Convert DOCX into CoNLL format with IOB tagging, with text as tokens and comments as labels.

	# Function to convert a docx file to text
	def convert_docx_to_text(docx_file):
	doc = docx.Document(docx_file)
	xml_content = doc._element.xml

	namespaces = {
	"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
	"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
	}

	# Create an XML parser with namespace mappings
	parser = etree.XMLParser()
	for prefix, uri in namespaces.items():
	etree.register_namespace(prefix, uri)

	tree = etree.fromstring(xml_content.encode("utf-8"), parser=parser)

	# Extract text from <w:t> elements and join them, adding a space after each <w:p>
	text = ""
	for paragraph in tree.findall(".//w:p", namespaces=namespaces):
	text += "".join(
	[
	t.text.replace("\xa0", " ")
	for t in paragraph.findall(".//w:t", namespaces=namespaces)
	if t.text
	]
	)
	text += " "

	return text.strip()


	# Function to split a string from the first match of a delimiter
	def split_from_first_match(string, delimiter):
	parts = string.split(delimiter, 1)
	if len(parts) > 1:
	return parts[0], delimiter, parts[1]
	else:
	return False


	# Function to split a string with punctuation into tokens
	def split_string_with_punctuation(string):
	tokenizer = RegexpTokenizer(r"\w+\|\S+")
	tokens = tokenizer.tokenize(string)
	return tokens


	with open("comments.json", "r") as file:
	data = json.load(file)

	conll = open(f"comments.conll", "w")

	for docx_file in tqdm(files_list, desc="Total"):
	name = docx_file.split(".")[0].strip()
	text = convert_docx_to_text(f"docx/{docx_file}")

	conll = open(f"comments.conll", "a+")

	for _, delimiter, label, _ in data[name]:
	split_text = split_from_first_match(text, delimiter)
	if split_text:
	out_text, tag_text, text = split_text[0], split_text[1], split_text[2]

	split_out_text = split_string_with_punctuation(out_text)
	split_tag_text = split_string_with_punctuation(tag_text)

	for txt in split_out_text:
	conll.write(f"{txt}\tO\n")

	conll.write(f"{split_tag_text[0]}\tB-{label}\n")
	for txt in split_tag_text[1:]:
	conll.write(f"{txt}\tI-{label}\n")
	else:
	continue

	conll.write("\n")
	conll.close()