Source code for inheritance_explorer.similarity
import abc
import collections
from typing import Any, Optional, OrderedDict
import numpy as np
import numpy.typing as npt
import pycode_similar
[docs]
class ResultsContainer:
    """Plain record holding the outcome of one similarity comparison.

    Parameters
    ----------
    count : int
        count reported by the comparison (presumably the number of matching
        elements; the exact meaning is set by the similarity backend)
    total : int
        total reported by the comparison (presumably the number of elements
        considered; the exact meaning is set by the similarity backend)
    similarity_fraction : float
        similarity score reported by the comparison
    base_class : int
        identifier of the class used as the reference
    this_class : int
        identifier of the class that was compared against the reference
    """

    def __init__(
        self,
        count: int,
        total: int,
        similarity_fraction: float,
        base_class: int,
        this_class: int,
    ):
        # comparison statistics, stored exactly as provided
        self.count, self.total = count, total
        self.similarity_fraction = similarity_fraction
        # identifiers of the two classes involved in the comparison
        self.base_class, self.this_class = base_class, this_class
# Results of comparing every source against a single reference:
# node identifier -> ResultsContainer for that node.
_single_result = OrderedDict[int, ResultsContainer]

# Input to a similarity run: node identifier -> source code string.
_sdict_type = OrderedDict[int, str]

# Per-reference results: reference node identifier -> results of comparing
# every source against that reference.
_nested_source_dict = dict[int, _single_result]

# Return value of a permuted run: the per-reference results, an array of
# similarity values, and the tuple of node identifiers labelling its axes.
_sim_results_tuple = tuple[_nested_source_dict, npt.NDArray[Any], tuple[int, ...]]
[docs]
class SimilarityContainer(abc.ABC):
    """Abstract base class for running source-code similarity comparisons.

    Subclasses implement ``_compare_single_set`` (compare every source
    against one reference) and ``_permute_and_run`` (the ``"permute"``
    method); ``run`` dispatches to one of them based on ``method``.

    Parameters
    ----------
    method : str
        which comparison to run, one of ``"reference"`` (default) or
        ``"permute"``

    Raises
    ------
    ValueError
        if ``method`` is not listed in ``_valid_methods``
    """

    _valid_methods: list[str] = ["permute", "reference"]

    def __init__(self, method: str = "reference"):
        if method not in self._valid_methods:
            raise ValueError(
                f"Provided method not recognized, must be in {self._valid_methods}"
            )
        self.method = method
        # results of the most recent successful run() call (None until then)
        self.results = None

    def run(
        self, source_dict: _sdict_type, reference: Optional[Any] = None
    ) -> _single_result | _sim_results_tuple:
        """
        Run the similarity comparison selected by ``self.method``.

        The results are returned and also stored on ``self.results``.

        Parameters
        ----------
        source_dict : dict
            dictionary mapping a node identifier to a source code string
        reference : optional
            key of ``source_dict`` to compare all other entries against.
            Required unless ``self.method`` is ``"permute"``, in which case
            it is ignored.

        Returns
        -------
        The value returned by ``_permute_and_run`` when ``self.method`` is
        ``"permute"``, otherwise the value returned by
        ``_compare_single_set``.

        Raises
        ------
        ValueError
            if a reference is needed and ``reference`` is None or is not a
            key in ``source_dict``
        """
        # work on a shallow copy so implementations cannot alter the
        # caller's mapping
        source_dict_c = source_dict.copy()
        results: _single_result | _sim_results_tuple
        if self.method == "permute":
            results = self._permute_and_run(source_dict_c)
        else:
            if reference is None or reference not in source_dict_c:
                raise ValueError(
                    "The reference parameter must be a key in source_dict"
                )
            results = self._compare_single_set(source_dict_c, reference)
        self.results = results
        return results

    @abc.abstractmethod
    def _permute_and_run(self, source_dict: _sdict_type) -> _sim_results_tuple:
        """Run the ``"permute"`` comparison over ``source_dict``."""

    @abc.abstractmethod
    def _compare_single_set(
        self, source_dict: _sdict_type, reference: Any
    ) -> _single_result:
        """Compare the entries of ``source_dict`` against ``reference``."""
[docs]
class PycodeSimilarity(SimilarityContainer):
    """Similarity container that delegates to ``pycode_similar.detect``."""

    def _compare_single_set(
        self,
        source_dict: _sdict_type,
        reference: int,
    ) -> _single_result:
        """
        Compare every source in ``source_dict`` against the reference source.

        Parameters
        ----------
        source_dict : dict
            dictionary mapping a node identifier to a source code string
        reference : int
            key of ``source_dict`` whose source is used as the reference

        Returns
        -------
        OrderedDict mapping each node identifier to a ResultsContainer
        """
        # The reference source goes first, followed by every source in
        # source_dict. The reference is therefore also compared with itself;
        # that is harmless and keeps one entry per key, which simplifies
        # _permute_and_run.
        candidates = [source_dict[reference], *source_dict.values()]
        detected = pycode_similar.detect(candidates)

        results: _single_result = collections.OrderedDict()
        for class_id, entry in zip(source_dict, detected):
            # only the first item of the entry's second element is used
            first_diff = entry[1][0]
            results[class_id] = ResultsContainer(
                count=first_diff.plagiarism_count,
                total=first_diff.total_count,
                similarity_fraction=first_diff.plagiarism_percent,
                base_class=reference,
                this_class=class_id,
            )
        return results

    def _permute_and_run(
        self, source_dict: OrderedDict[int, str]
    ) -> _sim_results_tuple:
        """
        Use every entry of ``source_dict`` as the reference in turn.

        Parameters
        ----------
        source_dict : dict
            dictionary mapping a node identifier to a source code string

        Returns
        -------
        tuple of (results keyed by reference identifier, symmetrized
        similarity matrix, tuple of node identifiers giving the order of the
        matrix rows and columns)
        """
        sim_axis = tuple(source_dict)
        n_sources = len(sim_axis)
        similarity_matrix = np.ones((n_sources, n_sources))
        results_by_ref: _nested_source_dict = {}

        for row, ref in enumerate(sim_axis):
            single = self._compare_single_set(source_dict.copy(), ref)
            fractions = [res.similarity_fraction for res in single.values()]
            similarity_matrix[row, :] = np.array(fractions)
            results_by_ref[ref] = single

        # correct for asymmetry: average the matrix with its transpose
        similarity_matrix = (similarity_matrix + similarity_matrix.T) / 2.0
        return results_by_ref, similarity_matrix, sim_axis