김민수

Upload preparation scripts

"""Convert each corpus JSON file in ``Article/`` to a JSON Lines file in ``jsonl/``.

Run from the directory that contains ``Article/``. For every input file one
``jsonl/<id>.jsonl`` file is written (``<id>`` is the input's top-level
``"id"`` value), holding one JSON object per entry of the input's
``"document"`` list.
"""
import json
import pathlib


def hyphenate_date(raw):
    """Return *raw* with hyphens inserted after its 4th and 6th characters.

    Assumes *raw* is a ``YYYYMMDD`` string (TODO confirm against the corpus),
    so ``"20190102"`` becomes ``"2019-01-02"``.
    """
    return f"{raw[:4]}-{raw[4:6]}-{raw[6:]}"


def to_record(doc):
    """Flatten one ``"document"`` entry into the dict written as a JSONL line."""
    metadata = doc["metadata"]
    return {
        "author": metadata["author"],
        "publisher": metadata["publisher"],
        "date": hyphenate_date(metadata["date"]),
        "topic": metadata["topic"],
        # Keep only the "form" value of every paragraph line.
        "paragraph": [line["form"] for line in doc["paragraph"]],
    }


cwd = pathlib.Path.cwd()
glob = cwd.glob('Article/*.json')

# The output directory may not exist yet; without this the first open()
# below fails with FileNotFoundError.
pathlib.Path("jsonl").mkdir(exist_ok=True)

# Initialised to 0 so the summary line is still valid when no input file
# matches; reading the loop variable after an empty loop raised NameError.
converted = 0
for converted, json_path in enumerate(glob, start=1):
    # Close the input as soon as it is parsed; it is not needed while writing.
    with json_path.open(encoding='utf-8') as json_file:
        article = json.load(json_file)
    with open(f"jsonl/{article['id']}.jsonl", 'w', encoding='utf-8') as write_file:
        for doc in article["document"]:
            write_file.write(json.dumps(to_record(doc), ensure_ascii=False) + '\n')
    if converted % 100 == 0:
        print(f"Converted {converted} files")

print(f"Finished: total {converted} files are converted")
26 +
"""Convert each JSON Lines file in ``jsonl/`` to a Parquet file under ``parquet/``.

Run from the directory that contains ``jsonl/``. Output files are grouped into
sub-directories named after the first six characters of the input file's stem:
``parquet/<stem[:6]>/<stem>.parquet``.
"""
import pathlib
from os import makedirs

import pyarrow.parquet as pq
# NOTE: this is pyarrow's JSON reader, not the standard-library ``json`` module.
from pyarrow import json

cwd = pathlib.Path.cwd()

# Initialised to 0 so the summary line is still valid when no input file
# matches; reading the loop variable after an empty loop raised NameError.
converted = 0
for converted, json_path in enumerate(cwd.glob('jsonl/*.jsonl'), start=1):
    out_dir = f"parquet/{json_path.stem[:6]}"
    # Close the input as soon as it is parsed; it is not needed while writing.
    with json_path.open('rb') as json_file:
        table = json.read_json(json_file)
    makedirs(out_dir, exist_ok=True)
    pq.write_table(table, f"{out_dir}/{json_path.stem}.parquet")
    if converted % 100 == 0:
        print(f"Converted {converted} files")

print(f"Finished: total {converted} files are converted")
"""Collect publisher/topic metadata of the ``Article/NWRW18*.json`` files.

Every entry of every file's ``"document"`` list contributes one row with its
``publisher``, ``topic`` and ``original_topic``; all rows are written as a
single JSON array to ``topic_to_predict.json``.
"""
import json
import pathlib

cwd = pathlib.Path.cwd()
glob = cwd.glob('Article/NWRW18*.json')
rows = []
for file_no, article_path in enumerate(glob, start=1):
    with article_path.open(encoding='utf-8') as article_file:
        article = json.load(article_file)
    rows.extend(
        {
            "publisher": entry["metadata"]["publisher"],
            "topic": entry["metadata"]["topic"],
            "original_topic": entry["metadata"]["original_topic"],
        }
        for entry in article["document"]
    )
    # Progress indicator: print the running file count every 100 files.
    if file_no % 100 == 0:
        print(file_no)

with open('topic_to_predict.json', 'w', encoding='utf-8') as dataset:
    json.dump(rows, dataset, ensure_ascii=False)
print("Finished")
...\ No newline at end of file ...\ No newline at end of file
"""Plot article counts per publisher, topic and month for one corpus category.

Reads the Parquet data under ``parquet/<category>`` and shows five figures in
sequence: publisher and topic bar charts, a monthly line chart, and publisher
and topic pie charts.
"""
import pyarrow.parquet as pq
import pandas as pd
import matplotlib.pyplot as plt

category = 'NIRW19'

# Only these four columns are read from the Parquet data.
table = pq.read_pandas(
    f"parquet/{category}",
    columns=['author', 'publisher', 'date', 'topic'],
)
print(table.schema)
df = table.to_pandas()

# Per-group counts of non-null 'author' values.
publisher = df.groupby('publisher').count()['author']
topic = df.groupby('topic').count()['author']
# NOTE(review): grouping by month requires 'date' to be a datetime column - confirm.
month = df.groupby(pd.Grouper(key='date', freq='M')).count()['author']

# NOTE(review): presumably selected so Korean labels render - confirm the font
# is installed on the target machine.
plt.rc('font', family='BATANG')

# Bar chart per publisher, with each bar's count written at its top.
publisher_axes = publisher.plot(kind="bar")
plt.title(f"{category}-Publisher")
for position, count in enumerate(publisher):
    publisher_axes.text(position - 0.25, count, count)
plt.show()

# Bar chart per topic, annotated the same way.
plt.title(f"{category}-Topic")
topic_axes = topic.plot(kind="bar")
for position, count in enumerate(topic):
    topic_axes.text(position - 0.25, count, count)
plt.show()

# Line chart of the monthly counts.
plt.title(f"{category}-Month")
month.plot()
plt.show()

# Pie charts showing each group's share as a percentage.
for label, counts in (("Publisher", publisher), ("Topic", topic)):
    plt.title(f"{category}-{label}")
    counts.plot.pie(autopct='%1.1f%%')
    plt.show()
...\ No newline at end of file ...\ No newline at end of file
This diff is collapsed. Click to expand it.