Skip to content

Commit

Permalink
Add stats
Browse files Browse the repository at this point in the history
  • Loading branch information
Muennighoff committed Nov 13, 2023
1 parent 2689e99 commit dc180a6
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 0 deletions.
19 changes: 19 additions & 0 deletions evaluation/other/humanevalpack_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from datasets import load_dataset



for lang in ['python', 'js', 'cpp', 'java', 'go', 'rust']:
    print(f'Language: {lang}')
    ds = load_dataset('bigcode/humanevalpack', lang, split="test")
    # Compute the character lengths once per field instead of rebuilding the
    # same list comprehension for every statistic (the original built each
    # list three times).
    doc_lens = [len(d) for d in ds["docstring"]]
    sol_lens = [len(s) for s in ds["canonical_solution"]]
    # Average docstring length
    print(f'Average docstring length: {sum(doc_lens) / len(doc_lens)}')
    # Min docstring length
    print(f'Min docstring length: {min(doc_lens)}')
    # Max docstring length
    print(f'Max docstring length: {max(doc_lens)}')
    # Average solution length
    print(f'Average solution length: {sum(sol_lens) / len(sol_lens)}')
    # Min solution length
    print(f'Min solution length: {min(sol_lens)}')
    # Max solution length
    print(f'Max solution length: {max(sol_lens)}')
33 changes: 33 additions & 0 deletions evaluation/other/nlg_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import json
import nlgeval
from tqdm import tqdm
from nlgeval import NLGEval

def get_ref(file_path):
    """Read a JSON-lines file and return the 'docstring' field of each row.

    Each line of *file_path* must be a standalone JSON object containing a
    'docstring' key; the values are returned in file order.
    """
    # JSON is UTF-8 by specification — pin the encoding so the platform
    # default (e.g. cp1252 on Windows) cannot mis-decode the file.
    with open(file_path, encoding='utf-8') as f:
        return [json.loads(line)['docstring'] for line in f]

def get_hyp(file_path):
    """Load and return the model's hypothesis generations from a JSON file."""
    with open(file_path) as handle:
        return json.load(handle)


# Skip-thought and GloVe embedding metrics are expensive; SPICE, CIDEr and
# ROUGE_L are dropped as well, leaving BLEU-1..4 and METEOR.
scorer = NLGEval(no_skipthoughts=True, no_glove=True, metrics_to_omit=['SPICE', 'CIDEr', 'ROUGE_L'])

for lang in ['cpp', 'java', 'go', 'js', 'python', 'rust']:
    print(f'Language: {lang}')
    ref = get_ref(f'data/{lang}/data/humanevalpack.jsonl')
    hyp = get_hyp(f'octocoder/humanevalexplain/generations_humanevalexplaindescribe{lang}_starcoderguanacocommits.json')
    metrics_dicts = []
    # For every problem, keep the best score per metric over all candidate
    # generations for that problem.
    for idx, candidates in enumerate(tqdm(hyp)):
        best = {}
        for candidate in candidates:
            scores = scorer.compute_individual_metrics(ref[idx], candidate)
            for metric, value in scores.items():
                best[metric] = max(best.get(metric, 0), value)
        metrics_dicts.append(best)
    with open(f'octocoder/humanevalexplain/metrics_humanevalexplaindescribe{lang}_starcoderguanacocommits.json', 'w') as f:
        json.dump(metrics_dicts, f, indent=4)
19 changes: 19 additions & 0 deletions evaluation/other/nlg_eval_avg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import json

for lang in ['cpp', 'java', 'go', 'js', 'python', 'rust']:
    print(f'Language: {lang}')
    with open(f'evaluation/octocoder/humanevalexplain/metrics_humanevalexplaindescribe{lang}_starcoderguanacocommits.json', 'r') as f:
        data = json.load(f)

    # Report the corpus-level mean of each metric, scaled to percentages.
    # (label, key) pairs map the printed name to the per-problem dict key.
    for label, key in [('BLEU-1', 'Bleu_1'),
                       ('BLEU-2', 'Bleu_2'),
                       ('BLEU-3', 'Bleu_3'),
                       ('BLEU-4', 'Bleu_4'),
                       ('METEOR', 'METEOR')]:
        scores = [entry[key] * 100 for entry in data]
        print(f'{label}: {sum(scores) / len(scores)}')

0 comments on commit dc180a6

Please sign in to comment.