ModMaamari/from_label_studio_to_dataframe.py

## from_label_studio_to_dataframe.py
def from_label_studio_to_dataframe(LABEL_STUDIO_URL="",
                                   API_KEY="",
                                   PROJECT_ID="1"):
    '''
    Goals:
        - Load the labeled tasks of a Label Studio project, keep only the
          spans tagged with one of the known question/answer stage labels,
          and return them as a pandas DataFrame
    Attributes:
        - LABEL_STUDIO_URL (url as string): the url of the Label Studio instance you want to get your data from
        - API_KEY (string): your Label Studio API_KEY
        - PROJECT_ID (string): id of the Label Studio project to read;
          defaults to "1", the value that used to be hard-coded
    Returns:
        - df (pandas DataFrame): one row per kept labeled span with the columns
            "text"     - the labeled text
            "category" - "Question" or "Answer" (first part of the label)
            "stage"    - stage number as int (second part of the label)
            "level"    - numeric level the label maps to (see tag2val below)
    Notes:
        - Only the first annotation of every task is read.
        - Only the first label of every annotated span is read.
        - Spans whose label is not a key of tag2val (ex: "QID_13") are skipped.
        - Relies on the names Client, project, pd and tqdm being available in
          the enclosing module (presumably label_studio_sdk, pandas and tqdm
          -- confirm against the module imports).
    '''
    columns = ["text", "category", "stage", "level"]

    # Label name -> numeric level. The label name itself encodes
    # "<category>_<stage>_<level name>".
    tag2val = {
        "Question_1_Company_specific": 0,
        "Question_1_Market_related": 1,
        "Question_2_specific": 0,
        "Question_2_open": 1,
        "Question_3_attack": 2,
        "Question_3_support": 0,
        "Question_3_neutral": 1,
        "Answer_1_specific": 0,
        "Answer_1_avoid_excuse": 1,
        "Answer_2_negative": 0,
        "Answer_2_positive": 1,
        "Answer_3_blame": 0,
        "Answer_3_no_blame": 1,
    }

    ls = Client(url=LABEL_STUDIO_URL, api_key=API_KEY)
    ls.check_connection()
    pro = project.Project.get_from_id(ls, PROJECT_ID)
    raw_data = pro.get_labeled_tasks()

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []
    for task in tqdm(raw_data):
        task_annotations = task["annotations"]
        if not task_annotations:
            # Nothing to read for this task
            continue

        for ann in task_annotations[0]["result"]:
            value = ann.get("value") or {}
            labels = value.get("labels")
            if not labels:
                # Result item without span labels; it carries no stage information
                continue

            label = labels[0]
            # Ignore any observation that is not one of the question/answer
            # stage labels (ex: an observation that only labels the question QID_13)
            if label not in tag2val:
                continue

            # Every key of tag2val starts with "<category>_<stage>_"
            category, stage = label.split("_")[:2]
            rows.append(
                {
                    "text": value["text"],
                    "category": category,
                    "stage": int(stage),
                    "level": tag2val[label],
                }
            )

    # Passing columns keeps the column order and gives an empty frame with the
    # expected columns when no row was kept.
    return pd.DataFrame(rows, columns=columns)
	def from_label_studio_to_dataframe(LABEL_STUDIO_URL="", API_KEY="", PROJECT_ID="1"):
		'''
		Goals:
			- Load the labeled tasks of a Label Studio project, keep only the
			  spans tagged with one of the known question/answer stage labels,
			  and return them as a pandas DataFrame
		Attributes:
			- LABEL_STUDIO_URL (url as string): the url of the Label Studio instance you want to get your data from
			- API_KEY (string): your Label Studio API_KEY
			- PROJECT_ID (string): id of the Label Studio project to read;
			  defaults to "1", the value that used to be hard-coded
		Returns:
			- df (pandas DataFrame): one row per kept labeled span with the columns
				"text"     - the labeled text
				"category" - "Question" or "Answer" (first part of the label)
				"stage"    - stage number as int (second part of the label)
				"level"    - numeric level the label maps to (see tag2val below)
		Notes:
			- Only the first annotation of every task is read.
			- Only the first label of every annotated span is read.
			- Spans whose label is not a key of tag2val (ex: "QID_13") are skipped.
			- Relies on the names Client, project, pd and tqdm being available in
			  the enclosing scope (presumably label_studio_sdk, pandas and tqdm
			  -- confirm against the module imports).
		'''
		columns = ["text", "category", "stage", "level"]

		# Label name -> numeric level. The label name itself encodes
		# "<category>_<stage>_<level name>".
		tag2val = {
			"Question_1_Company_specific": 0,
			"Question_1_Market_related": 1,
			"Question_2_specific": 0,
			"Question_2_open": 1,
			"Question_3_attack": 2,
			"Question_3_support": 0,
			"Question_3_neutral": 1,
			"Answer_1_specific": 0,
			"Answer_1_avoid_excuse": 1,
			"Answer_2_negative": 0,
			"Answer_2_positive": 1,
			"Answer_3_blame": 0,
			"Answer_3_no_blame": 1,
		}

		ls = Client(url=LABEL_STUDIO_URL, api_key=API_KEY)
		ls.check_connection()
		pro = project.Project.get_from_id(ls, PROJECT_ID)
		raw_data = pro.get_labeled_tasks()

		# Rows are collected in a plain list and turned into a DataFrame once at
		# the end: DataFrame.append was removed in pandas 2.0 and copied the whole
		# frame on every call.
		rows = []
		for task in tqdm(raw_data):
			task_annotations = task["annotations"]
			if not task_annotations:
				# Nothing to read for this task
				continue

			for ann in task_annotations[0]["result"]:
				value = ann.get("value") or {}
				labels = value.get("labels")
				if not labels:
					# Result item without span labels; it carries no stage information
					continue

				label = labels[0]
				# Ignore any observation that is not one of the question/answer
				# stage labels (ex: an observation that only labels the question QID_13)
				if label not in tag2val:
					continue

				# Every key of tag2val starts with "<category>_<stage>_"
				category, stage = label.split("_")[:2]
				rows.append(
					{
						"text": value["text"],
						"category": category,
						"stage": int(stage),
						"level": tag2val[label],
					}
				)

		# Passing columns keeps the column order and gives an empty frame with the
		# expected columns when no row was kept.
		return pd.DataFrame(rows, columns=columns)