Showing
5 changed files
with
95 additions
and
0 deletions
code/preparation/corpus2jsonl.py
0 → 100644
"""Convert corpus articles under ``Article/*.json`` into JSON Lines files.

Each input file is expected to hold an ``id`` and a ``document`` list. One
output file ``jsonl/<id>.jsonl`` is written per input file, containing one
JSON object per document.
"""
import json
import pathlib


def _build_record(doc):
    """Flatten one corpus document into the dict written as a JSONL row."""
    metadata = doc["metadata"]
    raw_date = metadata["date"]
    return {
        "author": metadata["author"],
        "publisher": metadata["publisher"],
        # Presumably a compact YYYYMMDD string; re-emitted as YYYY-MM-DD.
        "date": f"{raw_date[:4]}-{raw_date[4:6]}-{raw_date[6:]}",
        "topic": metadata["topic"],
        "paragraph": [line["form"] for line in doc["paragraph"]],
    }


def convert_corpus(article_dir, output_dir, report_every=100):
    """Convert every ``*.json`` article in *article_dir* to JSONL.

    Args:
        article_dir: Directory containing the source ``*.json`` files.
        output_dir: Directory receiving ``<id>.jsonl`` files; created
            (including parents) when it does not exist yet.
        report_every: Print a progress line after this many files.

    Returns:
        The number of converted files (0 when nothing matched).
    """
    article_dir = pathlib.Path(article_dir)
    output_dir = pathlib.Path(output_dir)
    # Without this the very first open(..., 'w') fails on a fresh checkout.
    output_dir.mkdir(parents=True, exist_ok=True)

    converted = 0
    for json_path in article_dir.glob("*.json"):
        with json_path.open(encoding="utf-8") as json_file:
            article = json.load(json_file)
        out_path = output_dir / f"{article['id']}.jsonl"
        with out_path.open("w", encoding="utf-8") as write_file:
            for doc in article["document"]:
                row = json.dumps(_build_record(doc), ensure_ascii=False)
                write_file.write(row + "\n")
        converted += 1
        if converted % report_every == 0:
            print(f"Converted {converted} files")
    return converted


def main():
    """Convert ``Article/*.json`` below the current working directory."""
    cwd = pathlib.Path.cwd()
    # An explicit counter replaces the loop variable the old ``for ... else``
    # read, which was unbound (NameError) when no input file matched.
    total = convert_corpus(cwd / "Article", cwd / "jsonl")
    print(f"Finished: total {total} files are converted")


if __name__ == "__main__":
    main()
code/preparation/jsonl2parquet.py
0 → 100644
"""Convert every ``jsonl/*.jsonl`` file into a Parquet file.

Output files are written to ``parquet/<prefix>/<stem>.parquet`` where
``<prefix>`` is the first six characters of the input file's stem.
"""
import pathlib
from os import makedirs

from pyarrow import json
import pyarrow.parquet as pq


def convert_jsonl_to_parquet(jsonl_dir, parquet_root="parquet", report_every=100):
    """Convert each ``*.jsonl`` file in *jsonl_dir* to Parquet.

    Args:
        jsonl_dir: Directory containing the ``*.jsonl`` inputs.
        parquet_root: Root directory for the per-prefix output folders.
        report_every: Print a progress line after this many files.

    Returns:
        The number of converted files (0 when nothing matched).
    """
    converted = 0
    for json_path in pathlib.Path(jsonl_dir).glob("*.jsonl"):
        with json_path.open('rb') as json_file:
            table = json.read_json(json_file)
        # Files are grouped by the first six characters of their name.
        prefix = json_path.stem[:6]
        makedirs(f"{parquet_root}/{prefix}", exist_ok=True)
        pq.write_table(table, f"{parquet_root}/{prefix}/{json_path.stem}.parquet")
        converted += 1
        if converted % report_every == 0:
            print(f"Converted {converted} files")
    return converted


def main():
    """Convert ``jsonl/*.jsonl`` below the current working directory."""
    # An explicit counter replaces the loop variable the old ``for ... else``
    # read, which was unbound (NameError) when no input file matched.
    total = convert_jsonl_to_parquet(pathlib.Path.cwd() / "jsonl")
    print(f"Finished: total {total} files are converted")


if __name__ == "__main__":
    main()
code/preparation/make_topic_dataset.py
0 → 100644
import json
import pathlib

# Gather the publisher/topic labels of every document found in the
# Article/NWRW18*.json files and dump them as one JSON array.
LABEL_KEYS = ("publisher", "topic", "original_topic")

article_paths = pathlib.Path.cwd().glob('Article/NWRW18*.json')
rows = []
for file_no, article_path in enumerate(article_paths, start=1):
    with article_path.open(encoding='utf-8') as handle:
        article = json.load(handle)
    for doc in article["document"]:
        meta = doc["metadata"]
        rows.append({key: meta[key] for key in LABEL_KEYS})
    # Progress indicator: print the running file count every 100 files.
    if file_no % 100 == 0:
        print(file_no)

with open('topic_to_predict.json', 'w', encoding='utf-8') as dataset:
    json.dump(rows, dataset, ensure_ascii=False)
print("Finished")
| ... | \ No newline at end of file | ... | \ No newline at end of file |
code/preparation/stastics.py
0 → 100644
import pyarrow.parquet as pq
import pandas as pd
import matplotlib.pyplot as plt

# Plot per-publisher, per-topic and per-month document counts for one
# category folder of the Parquet dataset.
category = 'NIRW19'


def _label_bars(ax, counts):
    """Write each count above its bar, nudged left of the bar centre."""
    for position, value in enumerate(counts):
        ax.text(position - 0.25, value, value)


table = pq.read_pandas(f"parquet/{category}", columns=['author', 'publisher', 'date', 'topic'])
print(table.schema)
df = table.to_pandas()

# Non-null 'author' entries per group serve as the document count.
publisher = df.groupby('publisher')['author'].count()
topic = df.groupby('topic')['author'].count()
month = df.groupby(pd.Grouper(key='date', freq='M'))['author'].count()

plt.rc('font', family='BATANG')

# Bar chart: documents per publisher.
pub_ax = publisher.plot.bar()
plt.title(f"{category}-Publisher")
_label_bars(pub_ax, publisher)
plt.show()

# Bar chart: documents per topic.
plt.title(f"{category}-Topic")
top_ax = topic.plot.bar()
_label_bars(top_ax, topic)
plt.show()

# Line chart: documents per month.
plt.title(f"{category}-Month")
month.plot()
plt.show()

# Pie charts: share per publisher, then share per topic.
plt.title(f"{category}-Publisher")
publisher.plot(kind="pie", autopct='%1.1f%%')
plt.show()

plt.title(f"{category}-Topic")
topic.plot(kind="pie", autopct='%1.1f%%')
plt.show()
| ... | \ No newline at end of file | ... | \ No newline at end of file |
code/preparation/sys_sampling_test.ipynb
0 → 100644
This diff is collapsed. Click to expand it.
-
Please register or login to post a comment