hhlee

Pronoun resolution

@@ -1,7 +1,8 @@
-from nltk.tokenize import word_tokenize
+from nltk.tokenize import word_tokenize, sent_tokenize
 import nltk
 import re
 from bs4 import BeautifulSoup
+from newspaper import Article
 import requests


@@ -78,3 +79,16 @@ def make_brace_triple(target_str, brace_tags):
     pred = make_be_verb(subj)
     obj = ' '.join([value for value, _ in brace_tags])
     return [subj, pred, obj]
+
+
+url = 'https://en.wikipedia.org/wiki/Korea'
+
+
+def get_bodytext_from_url(url):
+    news = Article(url, language='en')
+    news.download()
+    news.parse()
+    text = news.text
+    pattern = r'\[[^]]*\]'
+    text = re.sub(pattern=pattern, repl='', string=text)
+    return text
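For reference, a minimal usage sketch of the new helper (an editorial illustration, not part of the commit; it assumes the newspaper3k package is installed and NLTK's punkt sentence model has been downloaded, and it reuses the url variable and the sent_tokenize import added above):

    # usage sketch only; not part of the original diff
    body = get_bodytext_from_url(url)    # fetch the article and strip footnote markers such as [1]
    sentences = sent_tokenize(body)      # split the cleaned body text into sentences
    print(len(sentences), sentences[0])  # inspect the sentence count and the first sentence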
1 +package test_package;
2 +import edu.stanford.nlp.coref.CorefCoreAnnotations;
3 +import edu.stanford.nlp.coref.data.CorefChain;
4 +import edu.stanford.nlp.coref.data.Dictionaries;
5 +import edu.stanford.nlp.coref.data.Mention;
6 +import edu.stanford.nlp.ie.util.RelationTriple;
7 +import edu.stanford.nlp.io.IOUtils;
8 +import edu.stanford.nlp.ling.CoreAnnotations;
9 +import edu.stanford.nlp.ling.CoreLabel;
10 +import edu.stanford.nlp.naturalli.NaturalLogicAnnotations;
11 +import edu.stanford.nlp.naturalli.OpenIE;
12 +import edu.stanford.nlp.naturalli.SentenceFragment;
13 +import edu.stanford.nlp.pipeline.Annotation;
14 +import edu.stanford.nlp.pipeline.CoreDocument;
15 +import edu.stanford.nlp.pipeline.CoreEntityMention;
16 +import edu.stanford.nlp.pipeline.StanfordCoreNLP;
17 +import edu.stanford.nlp.semgraph.SemanticGraph;
18 +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
19 +import edu.stanford.nlp.tagger.maxent.MaxentTagger;
20 +import edu.stanford.nlp.util.CoreMap;
21 +import edu.stanford.nlp.util.IntPair;
22 +import edu.stanford.nlp.util.PropertiesUtils;
23 +import org.apache.jena.rdf.model.*;
24 +import org.jsoup.Jsoup;
25 +import org.jsoup.nodes.Document;
26 +import org.jsoup.select.Elements;
27 +import java.io.IOException;
28 +import java.util.*;
29 +import java.util.stream.Collectors;
30 +
31 +public class main {
32 + public static void main(String[] args) throws IOException {
33 +        // Parse the page with Jsoup and collect the body text from the <p> tags
34 + Document doc = Jsoup.connect("https://en.wikipedia.org/wiki/Korea").get();
35 + Elements pTags = doc.getElementsByTag("p");
36 + String bodyText = Jsoup.parse(pTags.toString()).text();
37 +
38 +//        // Tagging with the Stanford maxent POS tagger
39 +// MaxentTagger tagger = new MaxentTagger("taggers/english-left3words-distsim.tagger");
40 +// String tagged = tagger.tagString(bodyText);
41 +//
42 +//        // Extract the core proper noun (the most frequent NNP-tagged word)
43 +// String[] taggedArr = tagged.split(" ");
44 +// List<String> nnp = Arrays.stream(taggedArr).filter(word -> word.contains("_NNP")).collect(Collectors.toList());
45 +// Hashtable<String,Integer> freqOfWordTable = new Hashtable<>();
46 +// for (String word : nnp) {
47 +//            Integer freq = freqOfWordTable.get(word); // look up the current count; word is the key, freq is the value
48 +// freqOfWordTable.put(word, (freq == null) ? 1: freq +1);
49 +// }
50 +// List sortedList = sortByValue(freqOfWordTable);
51 +// String coreNoun = sortedList.get(0).toString();
52 +
53 +//        // Triple extraction process
54 +// String[] sentences = tagged.split("\\._\\.");
55 +// List<String[]> tripples = new ArrayList<>();
56 +// for (String sentence : sentences){
57 +// if (sentence.contains(coreNoun)) {
58 +// String[] words = sentence.split(" ");
59 +// String subject = "";
60 +// String predicate = "";
61 +// String object = "";
62 +// for (String word:words) {
63 +// if(word.equals(coreNoun)) {
64 +// String[] removeTag = word.split("_");
65 +// subject = removeTag[0];
66 +// }else if(word.contains("_VB") && !subject.isEmpty()) {
67 +// String[] removeTag = word.split("_");
68 +// predicate = removeTag[0];
69 +// }else if(word.contains("_NNP") && !predicate.isEmpty()) {
70 +// String[] removeTag = word.split("_");
71 +// object = removeTag[0];
72 +// }
73 +// if(!subject.isEmpty() && !predicate.isEmpty() && !object.isEmpty()){
74 +// String[] tripple = {subject,predicate,object};
75 +// tripples.add(tripple);
76 +// }
77 +// }
78 +// }
79 +// }
80 +
81 +//        // Build an RDF model with Jena
82 +// Model model = ModelFactory.createDefaultModel();
83 +// for(String[] statement : tripples){
84 +// Resource s = model.createResource("http://subject/"+statement[0]);
85 +// Property p = model.createProperty("http://predicate/"+statement[1]);
86 +// RDFNode o = model.createLiteral(statement[2]);
87 +//
88 +// if(s.hasProperty(p)){
89 +// s.addProperty(p,model.createResource().addProperty(p,o));
90 +// }else {
91 +// s.addProperty(p,o);
92 +// }
93 +// }
94 +// model.write(System.out);
95 +//        //RDFDataMgr.write(System.out, model, Lang.NTRIPLES); // write in N-TRIPLES format
96 +
97 +        // Stanford CoreNLP: coreference resolution first, then OpenIE on the resolved text
98 + Properties props = PropertiesUtils.asProperties(
99 + "annotators", "tokenize,ssplit,pos,lemma,ner,parse,coref"
100 + );
101 + StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
102 +
103 + String text;
104 + if (args.length > 0) {
105 + text = IOUtils.slurpFile(args[0]);
106 + } else {
107 + text = "Korea (officially the \"Korean Peninsula\") is a region in East Asia. Since 1945 it has been divided into the two parts which soon became the two sovereign states: North Korea (officially the \"Democratic People's Republic of Korea\") and South Korea (officially the \"Republic of Korea\"). Korea consists of the Korean Peninsula, Jeju Island, and several minor islands near the peninsula. It is bordered by China to the northwest and Russia to the northeast. It is separated from Japan to the east by the Korea Strait and the Sea of Japan (East Sea). During the first half of the 1st millennium, Korea was divided between the three competing states of Goguryeo, Baekje, and Silla, together known as the Three Kingdoms of Korea.";
108 + //text = bodyText;
109 + }
110 +
111 +
112 + Annotation docu = new Annotation(text);
113 + pipeline.annotate(docu);
114 + List<String> sentList = new ArrayList<>();
115 + for (CoreMap sentence : docu.get(CoreAnnotations.SentencesAnnotation.class)) {
116 + sentList.add(sentence.get(CoreAnnotations.TextAnnotation.class));
117 + }
118 +
119 +
120 +
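        // Coreference handling: for each coref chain, rewrite every mention with the
        // chain's first mention, so pronouns in sentList become explicit noun phrases.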
121 + String newText = "";
122 + Collection<CorefChain> values = docu.get(CorefCoreAnnotations.CorefChainAnnotation.class).values();
123 + for (CorefChain cc : values) {
124 + //System.out.println("\t" + cc.getMentionsInTextualOrder());
125 + List<CorefChain.CorefMention> mentionsInTextualOrder = cc.getMentionsInTextualOrder();
126 + String coreWord = "";
127 + for (int i = 0; i < mentionsInTextualOrder.size(); i++){
128 + if (i == 0){
129 +                    coreWord = mentionsInTextualOrder.get(i).mentionSpan; // take the first mention as the representative noun
130 +                }
131 +                String mention = mentionsInTextualOrder.get(i).mentionSpan; // the mention to rewrite (often a pronoun)
132 +                int sentNum = mentionsInTextualOrder.get(i).sentNum - 1; // sentence index (sentNum is 1-based)
133 +                String modiSent = sentList.get(sentNum); // the sentence to be rewritten
134 +                modiSent = modiSent.replace(mention, coreWord); // replace the mention (pronoun) with coreWord; replace() is literal, so mentions containing regex metacharacters are handled safely
135 +                sentList.set(sentNum, modiSent); // store the rewritten sentence back
136 + }
137 + }
138 +
139 + //System.out.println(sentList);
140 +
141 + for (String s : sentList) {
142 + newText += s + " ";
143 + }
144 + System.out.println(text);
145 + System.out.println("--------------------------------------------");
146 + System.out.println(newText);
147 +
148 + System.out.println("\n \n");
149 +
150 +
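        // Re-annotate the pronoun-resolved text with an OpenIE pipeline and print the extracted triples.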
151 + props = PropertiesUtils.asProperties(
152 + "annotators", "tokenize,ssplit,pos,lemma,parse,natlog,openie"
153 + );
154 + props.setProperty("openie.max_entailments_per_clause","100");
155 + props.setProperty("openie.triple.strict","false");
156 + pipeline = new StanfordCoreNLP(props);
157 +
158 + docu = new Annotation(newText);
159 + pipeline.annotate(docu);
160 + int sentNo = 0;
161 + for (CoreMap sentence : docu.get(CoreAnnotations.SentencesAnnotation.class)) {
162 + System.out.println("Sentence #" + ++sentNo + ": " + sentence.get(CoreAnnotations.TextAnnotation.class));
163 +
164 +// // Print SemanticGraph
165 +// System.out.println(sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class).toString(SemanticGraph.OutputFormat.LIST));
166 +
167 + // Get the OpenIE triples for the sentence
168 + Collection<RelationTriple> triples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
169 +
170 + // Print the triples
171 + for (RelationTriple triple : triples) {
172 + System.out.println(triple.confidence + "\t" +
173 + "<"+triple.subjectGloss()+">" + "\t" +
174 + "<"+triple.relationGloss()+">" + "\t" +
175 + "<"+triple.objectGloss()+">");
176 + }
177 + System.out.println("\n");
178 + // Alternately, to only run e.g., the clause splitter:
179 +// List<SentenceFragment> clauses = new OpenIE(props).clausesInSentence(sentence);
180 +// for (SentenceFragment clause : clauses) {
181 +// System.out.println(clause.parseTree.toString(SemanticGraph.OutputFormat.LIST));
182 +// }
183 +// System.out.println();
184 + }
185 +
186 + }
187 +    // sort a map by value in descending order
188 + public static List sortByValue(final Map map) {
189 + List<String> list = new ArrayList();
190 + list.addAll(map.keySet());
191 + Collections.sort(list,new Comparator() {
192 + public int compare(Object o1,Object o2) {
193 + Object v1 = map.get(o1);
194 + Object v2 = map.get(o2);
195 + return ((Comparable) v2).compareTo(v1);
196 + }
197 + });
198 +        //Collections.reverse(list); // uncomment for ascending order
199 + return list;
200 + }
201 +}
1 +model = /u/nlp/data/pos-tagger/models-4.0.0/models/english-left3words-distsim-prod1.tagger
2 +arch = left3words,wordshapes(-1,1),distsim(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1),distsimconjunction(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorUCase),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorCNumber),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorDash),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorLetterDigitDash),rareExtractor(edu.stanford.nlp.tagger.maxent.CompanyNameDetector),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorAllCapitalized),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorUpperDigitDash),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorStartSentenceCap),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorMidSentenceCapC),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorMidSentenceCap),prefix(10),suffix(10),unicodeshapes(0),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorNonAlphanumeric)
3 +wordFunction = edu.stanford.nlp.process.AmericanizeFunction
4 +trainFile = /u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/craft-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/ewt-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/questionbank-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/train-currency.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/handparsed-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/ontonotes-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/wsj-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/train-tech-english.txt
5 +closedClassTags =
6 +closedClassTagThreshold = 40
7 +curWordMinFeatureThresh = 2
8 +debug = false
9 +debugPrefix =
10 +tagSeparator = _
11 +encoding = UTF-8
12 +iterations = 100
13 +lang = english
14 +learnClosedClassTags = false
15 +minFeatureThresh = 2
16 +openClassTags =
17 +rareWordMinFeatureThresh = 5
18 +rareWordThresh = 5
19 +search = owlqn
20 +sgml = false
21 +sigmaSquared = 0.5
22 +regL1 = 0.75
23 +tagInside =
24 +tokenize = true
25 +tokenizerFactory =
26 +tokenizerOptions =
27 +verbose = false
28 +verboseResults = true
29 +veryCommonWordThresh = 250
30 +xmlInput =
31 +outputFile =
32 +outputFormat = slashTags
33 +outputFormatOptions =
34 +nthreads = 1
35 +minWordsLockTags = 1
1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<module type="JAVA_MODULE" version="4">
3 + <component name="NewModuleRootManager" inherit-compiler-output="true">
4 + <exclude-output />
5 + <content url="file://$MODULE_DIR$">
6 + <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
7 + </content>
8 + <orderEntry type="inheritedJdk" />
9 + <orderEntry type="sourceFolder" forTests="false" />
10 + <orderEntry type="module-library" exported="">
11 + <library>
12 + <CLASSES>
13 + <root url="jar://$USER_HOME$/Desktop/stanford-postagger-full-2020-11-17/stanford-postagger-4.2.0.jar!/" />
14 + </CLASSES>
15 + <JAVADOC />
16 + <SOURCES />
17 + </library>
18 + </orderEntry>
19 + <orderEntry type="module-library" exported="">
20 + <library>
21 + <CLASSES>
22 + <root url="jar://$USER_HOME$/Desktop/stanford-postagger-full-2020-11-17/jsoup-1.13.1.jar!/" />
23 + </CLASSES>
24 + <JAVADOC />
25 + <SOURCES />
26 + </library>
27 + </orderEntry>
28 + <orderEntry type="library" exported="" name="lib" level="project" />
29 + </component>
30 +</module>
\ No newline at end of file