ValueError: Shape of passed values is (x,x), indices imply (x,x )

  itertools, nlp, python, scipy

I have this script that create a cooc matrix based on a list of keywords and a list of documents.

import numpy as np
import itertools
from scipy.sparse import csr_matrix
from fuzzywuzzy import fuzz
import pandas as pd
import time

def matrix(keywords, documents):
    word_to_id = dict(zip(keywords, range(len(keywords))))
    doc_as_id = []
    for doc in documents:
        x = []
        for word in doc:
            for chars in charList:
                for char in chars:
                    name_char = "".join(char).split()
                    ratios = []
                    for name in name_char:
                        #print("Keyword :", name)
                        ratio = fuzz.token_set_ratio(word.lower(), name.lower())
                        ratios.append(ratio)
                    if any(y > 90 for y in ratios):
                        x.append(word_to_id[chars[0]])
        res = []
        [res.append(j) for j in x if j not in res]
        doc_as_id.append(np.sort(res).astype("uint32"))
    row_ind, col_ind = zip(*itertools.chain(*[[(i, w) for w in doc] for i, doc in enumerate(doc_as_id)]))
    data = np.ones(len(row_ind), dtype="uint32")
    max_word_id = max(itertools.chain(*doc_as_id)) + 1
    doc_word_matrix = csr_matrix((data, (row_ind, col_ind)), shape=(len(doc_as_id), max_word_id))
    cooc_matrix = doc_word_matrix.T * doc_word_matrix
    cooc_matrix.setdiag(0)
    return cooc_matrix


charList = ["Ali,Gaby Hoffmann", "Sarah,Amy Landecker", "Maura,Jeffrey Tambor", ""Shelly,Judith Light", "Josh,Jay Duplass"]
documents = ["Ali: Also, I've realized I just...I can't have real emotional intimacy with somebody who hasn't suffered under patriarchy./Sarah: [scoffs]/Ali: No, I'm serious./Sarah: Oh, God./Ali: All these years it's just been.../Sarah: You're a lesbian.... because I don't give two fucks about patriarchy."]

texts = []
for list in documents:
    texts.append(str(list[0]).split())

matrixLabel = []
for chars in charList:
    matrixLabel.append(chars[0])

cooc_matrix = matrix(matrixLabel, texts)
matrix = pd.DataFrame(cooc_matrix.todense(), index=matrixLabel, columns=matrixLabel)
print(matrix)

It works well if the number of keywords is less than the number of documents. However, I get this error when the number of keywords is greater which is a problem because in my case it will always be :

Traceback (most recent call last):
  File "C:/Users/p1058372/Desktop/yannick/cinemaQueer/tumblr/analysis/content_analysis.py", line 105, in <module>
    matrix = pd.DataFrame(cooc_matrix.todense(), index=matrixLabel, columns=matrixLabel)
  File "C:Usersp1058372Miniconda3envsmtl_gentrilibsite-packagespandascoreframe.py", line 497, in __init__
    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
  File "C:Usersp1058372Miniconda3envsmtl_gentrilibsite-packagespandascoreinternalsconstruction.py", line 234, in init_ndarray
    return create_block_manager_from_blocks(block_values, [columns, index])
  File "C:Usersp1058372Miniconda3envsmtl_gentrilibsite-packagespandascoreinternalsmanagers.py", line 1672, in create_block_manager_from_blocks
    raise construction_error(tot_items, blocks[0].shape[1:], axes, e)
ValueError: Shape of passed values is (4, 4), indices imply (5, 5)

Is there a way to fix this problem ?

Source: Python Questions

LEAVE A COMMENT