# corpus2jsonl.py (1.04 KB)
import json
import pathlib

def _to_record(doc):
    """Build the flat JSONL record for one entry of an article's "document" list.

    Args:
        doc: Mapping with a "metadata" mapping (keys "author", "publisher",
            "date", "topic") and a "paragraph" list whose items each carry
            a "form" value.

    Returns:
        A dict with keys "author", "publisher", "date", "topic" and
        "paragraph" (the list of "form" values, in order).

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    metadata = doc["metadata"]
    date = metadata["date"]
    return {
        "author": metadata["author"],
        "publisher": metadata["publisher"],
        # Insert hyphens after the 4th and 6th characters
        # (presumably the source format is YYYYMMDD -- verify against corpus).
        "date": f"{date[:4]}-{date[4:6]}-{date[6:]}",
        "topic": metadata["topic"],
        "paragraph": [line["form"] for line in doc["paragraph"]],
    }


def _convert_article(json_path, out_dir):
    """Convert one article JSON file into ``<out_dir>/<article id>.jsonl``.

    Args:
        json_path: ``pathlib.Path`` of the article JSON file to read.
        out_dir: Existing directory (``pathlib.Path``) to write into.

    Returns:
        The path of the JSONL file that was written.
    """
    # Load fully and close the input before opening the output, so only one
    # file handle is held at a time.
    with json_path.open(encoding='utf-8') as json_file:
        article = json.load(json_file)

    out_path = out_dir / f"{article['id']}.jsonl"
    with out_path.open('w', encoding='utf-8') as write_file:
        for doc in article["document"]:
            write_file.write(json.dumps(_to_record(doc), ensure_ascii=False) + '\n')
    return out_path


def main(base_dir=None):
    """Convert every ``Article/*.json`` under *base_dir* to JSONL files.

    Output files are written to ``<base_dir>/jsonl/``, one per input file,
    named after the article's "id" field. Progress is printed every 100
    files, followed by a final summary line.

    Args:
        base_dir: Directory containing the ``Article`` folder. Defaults to
            the current working directory.

    Returns:
        The number of input files converted (0 when none were found).
    """
    base = pathlib.Path.cwd() if base_dir is None else pathlib.Path(base_dir)
    out_dir = base / "jsonl"
    # Create the output directory up front instead of failing on first write.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Pre-initialise so the summary below is valid even when nothing matches.
    count = 0
    for count, json_path in enumerate(base.glob('Article/*.json'), start=1):
        _convert_article(json_path, out_dir)
        if count % 100 == 0:
            print(f"Converted {count} files")

    print(f"Finished: total {count} files are converted")
    return count


if __name__ == "__main__":
    main()