
Project Diary) Crawling Feature Integration + Multiple Queries


Problem 1)

import re
import requests
from bs4 import BeautifulSoup
import numpy as np
 
# get_news_data() stays the same as before
 
titles, contents = get_news_data()
 
# convert contents to a NumPy array
contents_array = np.array(contents)
 
print("titlesbuildingRegister:", titles[0])
print("titlesregisteredCopy:", titles[1])
print("titlesidentification:", titles[2])
print("titlespowerOfAttorney:", titles[3])
print("titlesCertificateThatDoesNotExist:", titles[4])
 
print("contentsbuildingRegister:", contents_array[0])
print("contentsregisteredCopy:", contents_array[1])
print("contentsidentification:", contents_array[2])
print("contentspowerOfAttorney:", contents_array[3])
print("contentsCertificateThatDoesNotExist:", contents_array[4])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[85], line 7
      4 import numpy as np
      6 # convert contents to a NumPy array
----> 7 contents_array = np.array(contents)
      9 print("titlesbuildingRegister:", titles[0])
     10 print("titlesregisteredCopy:", titles[1])
 
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.

 


Problem 2)

import re
import requests
from bs4 import BeautifulSoup
import numpy as np
from konlpy.tag import Okt
from textrank import KeysentenceSummarizer
 
# get_news_data() stays the same as before
titles, contents = get_news_data()
 
# pad every element of contents to the length of the longest one
max_len = max(len(c) for c in contents)
contents_padded = [np.pad(c, (0, max_len - len(c)), 'constant') for c in contents]
 
# convert contents_padded to a NumPy array
contents_array = np.array(contents_padded)
 
# define the tokenizer
def okt_tokenizer(sent):
    okt = Okt()
    words = okt.nouns(sent)
    return words
 
# create the KeysentenceSummarizer
summarizer = KeysentenceSummarizer(tokenize=okt_tokenizer, min_sim=0.3, verbose=False)
 
# generate the summaries
summaries = []
for content in contents_array:
    summary = summarizer.summarize([content], topk=3)
    summaries.append(summary)
 
 
# print the summary results
for i, summary in enumerate(summaries):
    print(f"Summary for text {i + 1}:")
    for sentence in summary:
        print(sentence)
    print()
 
 
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[98], line 30
     28 summaries = []
     29 for content in contents_array:
---> 30     summary = summarizer.summarize([content], topk=3)
     31     summaries.append(summary)
     34 # 요약 결과 출력
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\summarizer.py:194, in KeysentenceSummarizer.summarize(self, sents, topk, bias)
    191 elif bias is not None:
    192     raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))
--> 194 self.train_textrank(sents, bias)
    195 idxs = self.R.argsort()[-topk:]
    196 keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\summarizer.py:157, in KeysentenceSummarizer.train_textrank(self, sents, bias)
    143 def train_textrank(self, sents, bias=None):
    144     """
    145     Arguments
    146     ---------
   (...)
    155     None
    156     """
--> 157     g = sent_graph(sents, self.tokenize, self.min_count,
    158         self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
    159     self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
    160     if self.verbose:
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\sentence.py:42, in sent_graph(sents, tokenize, min_count, min_sim, similarity, vocab_to_idx, verbose)
     14 """
     15 Arguments
     16 ---------
   (...)
     38     shape = (n sents, n sents)
     39 """
     41 if vocab_to_idx is None:
---> 42     idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
     43 else:
     44     idx_to_vocab = [vocab for vocab, _ in sorted(vocab_to_idx.items(), key=lambda x:x[1])]
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\utils.py:24, in scan_vocabulary(sents, tokenize, min_count)
      6 def scan_vocabulary(sents, tokenize=None, min_count=2):
      7     """
      8     Arguments
      9     ---------
   (...)
     22         Vocabulary to index mapper.
     23     """
---> 24     counter = Counter(w for sent in sents for w in tokenize(sent))
     25     counter = {w:c for w,c in counter.items() if c >= min_count}
     26     idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x:-x[1])]
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\collections\__init__.py:552, in Counter.__init__(self, iterable, **kwds)
    541 '''Create a new, empty Counter object.  And if given, count elements
    542 from an input iterable.  Or, initialize the count from another mapping
    543 of elements to their counts.
   (...)
    549 
    550 '''
    551 super(Counter, self).__init__()
--> 552 self.update(iterable, **kwds)
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\collections\__init__.py:637, in Counter.update(self, iterable, **kwds)
    635             super(Counter, self).update(iterable) # fast path when counter is empty
    636     else:
--> 637         _count_elements(self, iterable)
    638 if kwds:
    639     self.update(kwds)
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\utils.py:24, in <genexpr>(.0)
      6 def scan_vocabulary(sents, tokenize=None, min_count=2):
      7     """
      8     Arguments
      9     ---------
   (...)
     22         Vocabulary to index mapper.
     23     """
---> 24     counter = Counter(w for sent in sents for w in tokenize(sent))
     25     counter = {w:c for w,c in counter.items() if c >= min_count}
     26     idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x:-x[1])]
 
Cell In[98], line 21, in okt_tokenizer(sent)
     19 def okt_tokenizer(sent):
     20     okt = Okt()
---> 21     words = okt.nouns(sent)
     22     return words
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\konlpy\tag\_okt.py:83, in Okt.nouns(self, phrase)
     80 def nouns(self, phrase):
     81     """Noun extractor."""
---> 83     tagged = self.pos(phrase)
     84     return [s for s, t in tagged if t == 'Noun']
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\konlpy\tag\_okt.py:69, in Okt.pos(self, phrase, norm, stem, join)
     58 def pos(self, phrase, norm=False, stem=False, join=False):
     59     """POS tagger.
     60     In contrast to other classes in this subpackage,
     61     this POS tagger doesn't have a `flatten` option,
   (...)
     67     :param join: If True, returns joined sets of morph and tag.
     68     """
---> 69     validate_phrase_inputs(phrase)
     71     tokens = self.jki.tokenize(
     72                 phrase,
     73                 jpype.java.lang.Boolean(norm),
     74                 jpype.java.lang.Boolean(stem)).toArray()
     75     if join:
 
File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\konlpy\tag\_common.py:20, in validate_phrase_inputs(phrase)
     14 """validate if phrase input is provided in str format
     15 
     16 Args:
     17     phrase (str): phrase input
     18 """
     19 msg = "phrase input should be string, not %s" % type(phrase)
---> 20 assert isinstance(phrase, basestring), msg
 
AssertionError: phrase input should be string, not <class 'numpy.ndarray'>

 


Problem 3)

# MySQL connection info
HOSTNAME = 'project-db-stu.ddns.net'
PORT = 3307
DATABASE = 'smhrd_A_5'
USERNAME = 'smhrd_A_5'
PASSWORD = 'smhrd5'
list_of_lists = [buildingRegister, registeredCopy, identification, powerOfAttorney, CertificateThatDoesNotExist]
 
# create the MySQL engine
CONNECTION_STRING = f'mysql+pymysql://{USERNAME}:{PASSWORD}@{HOSTNAME}:{PORT}/{DATABASE}?charset=utf8mb4'
engine = create_engine(CONNECTION_STRING, echo=False)
 
# convert to DataFrames
for i in range(list_of_lists):
if buildingRegister == list_of_lists:
    buildingRegister = pd.DataFrame({'buildingRegister': results}, index=[f"file_{i}" for i in buildingRegister])
    buildingRegister.to_sql('newsSummary', con=engine, if_exists='replace',index=False)
elif registeredCopy == list_of_lists:
    registeredCopy = pd.DataFrame({'identification': results}, index=[f"file_{i}" for i in registeredCopy])
    registeredCopy.to_sql('newsSummary', con=engine, if_exists='replace',index=False)
elif identification == list_of_lists:
    identification = pd.DataFrame({'identification': results}, index=[f"file_{i}" for i in identification])
    identification.to_sql('newsSummary', con=engine, if_exists='replace',index=False)
elif powerOfAttorney == list_of_lists:
    powerOfAttorney = pd.DataFrame({'identification': results}, index=[f"file_{i}" for i in powerOfAttorney])
    powerOfAttorney.to_sql('newsSummary', con=engine, if_exists='replace',index=False)
elif CertificateThatDoesNotExist == list_of_lists:
    CertificateThatDoesNotExist = pd.DataFrame({'identification': results}, index=[f"file_{i}" for i in CertificateThatDoesNotExist])
    CertificateThatDoesNotExist.to_sql('newsSummary', con=engine, if_exists='replace',index=False)
 
Cell In[201], line 15
    if buildingRegister == list_of_lists:
    ^
IndentationError: expected an indented block

Indentation error: the statements under the for header are written at the same level as the for itself, so Python raises "IndentationError: expected an indented block". (Even with the indentation fixed, range(list_of_lists) would still fail, since range() needs an integer such as len(list_of_lists).)
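
For reference, a minimal sketch of the rule the parser enforces (the toy variables are illustrative):

items = [1, 2, 3]
for i in range(len(items)):
    # every statement that belongs to the loop body must be
    # indented one level deeper than the for header
    if items[i] > 1:
        print(items[i])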



Problem 4)

def web_crawler(search_terms, start_page, end_page):
    # import the libraries needed for crawling
    from bs4 import BeautifulSoup
    import requests
    import time
    import schedule
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from webdriver_manager.chrome import ChromeDriverManager
    import urllib.parse
    
    # webdriver settings
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    
    # lists to hold the results for each search term
    buildingRegister = []
    registeredCopy = []
    identification = []
    powerOfAttorney = []
    CertificateThatDoesNotExist = []
    search_results = []
 
    search_results_dict = {}
 
    def makeUrl(search, page, page2):
        search_urls = []
        base_url = "https://search.naver.com/search.naver?where=news&query="
        search = urllib.parse.quote(search)
        for p in range(page, page2 + 1):
            start = (p - 1) * 10 + 1
            search_url = base_url + search + "&start=" + str(start)
            search_urls.append(search_url)
        return search_urls
    # (building-register fraud, certified-copy-of-register fraud, ID-card fraud,
    #  power-of-attorney fraud, certificate-of-no-guardianship-registration fraud)
    # search terms
 
    search_terms = [
        "부동산 건축물대장 사기피해",
        "부동산 등기부등본 사기피해",
        "부동산 신분증 사기피해",
        "부동산 위임장 사기피해",
        "후견등기사항 부존재증명서 피해",
    ]
 
    start_page = 1
    end_page = 2
 
    def job(search):
        
        
        # first page to crawl
        page = int(1)
        #page = int(input("\nEnter the first page to crawl, e.g. 1 (numbers only): "))
        print("\nFirst page to crawl:", page)
        # last page to crawl
    
        page2 = int(2)
        #page2 = int(input("\nEnter the last page to crawl, e.g. 1 (numbers only): "))
        print("\nLast page to crawl:", page2)
    
        # build the Naver search URLs
        search_urls = makeUrl(search, page, page2)
    
        ## pull only Naver News articles with selenium ##
        # use the Chrome browser installed on the OS, regardless of version
        driver = webdriver.Chrome(ChromeDriverManager().install())
        driver.implicitly_wait(3)
    
        # load each search page with selenium #
        naver_urls = []
    
        for i in search_urls:
            driver.get(i)
            time.sleep(1)  # wait time can be adjusted
    
            # click each Naver article to grab its title and body #
            # collect the CSS selectors of the article links
            a = driver.find_elements(By.CSS_SELECTOR, 'a.info')
    
            # click each collected element to get the article URL
            for i in a:
                i.click()
    
                # switch to the newly opened tab
                driver.switch_to.window(driver.window_handles[1])
                time.sleep(3)  # wait time can be adjusted
    
                # keep only Naver News URLs
                url = driver.current_url
                print(url)
    
                if "news.naver.com" in url:
                    naver_urls.append(url)
                else:
                    pass
                # article-scraping code goes here
                
                # close the current tab
                driver.close()
    
                # switch back to the first tab (very important!!!)
                driver.switch_to.window(driver.window_handles[0])
    
        search_results_dict[search] = naver_urls
    
    
    # call 'job' once for each search term
    for search in search_terms:
        job(search)
 
web_crawler(search_terms, start_page, end_page)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[262], line 1
----> 1 web_crawler(search_terms, start_page, end_page)
 
NameError: name 'start_page' is not defined
 
 


Solution 1)

import re
import requests
from bs4 import BeautifulSoup
import numpy as np
 
# get_news_data() stays the same as before
titles, contents = get_news_data()
 
# pad every element of contents to the length of the longest one
max_len = max(len(c) for c in contents)
contents_padded = [np.pad(c, (0, max_len - len(c)), 'constant') for c in contents]
 
# convert contents_padded to a NumPy array
contents_array = np.array(contents_padded)
 
 
print("contentsbuildingRegister:", contents_array[0])
print("contentsregisteredCopy:", contents_array[1])
print("contentsidentification:", contents_array[2])
print("contentspowerOfAttorney:", contents_array[3])
print("contentsCertificateThatDoesNotExist:", contents_array[4])
 

A NumPy array requires a homogeneous shape, so converting a list whose elements have different lengths raises a ValueError. The elements of the contents list therefore have to be brought to the same length first: pad every element to match the longest one, then convert the result to an array.
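
For reference, a minimal sketch that reproduces the error on toy data and then fixes it with padding (the variable names are illustrative):

import numpy as np

rows = [[1, 2, 3], [1, 2]]  # elements of different lengths
# np.array(rows) would raise the same ValueError (inhomogeneous shape)

# pad every element to the length of the longest one, then convert
max_len = max(len(r) for r in rows)
padded = [r + [0] * (max_len - len(r)) for r in rows]
arr = np.array(padded)  # homogeneous shape (2, 3), no error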

Solution 2)


import numpy as np
from konlpy.tag import Okt
from textrank import KeysentenceSummarizer
from gensim.summarization import summarize  # requires gensim < 4.0 (removed in 4.x)
 
def summarize_contents(contents_list_of_lists):
    # convert each list of contents to a NumPy array
    results_array = [np.array(contents) for contents in contents_list_of_lists]
 
    # generate the summaries
    all_summaries = []
    for contents in results_array:
        summaries = []
        for content in contents:
            try:
                summary = summarize(content, ratio=0.5)  # summarize with the gensim library
                summary = summary[:300]  # truncate the summary to 300 characters
                summaries.append(summary)
            except Exception as e:
                print(f"Error summarizing text: {e}")
                summaries.append(None)
        all_summaries.append(summaries)
 
    buildingRegister, registeredCopy, identification, powerOfAttorney, CertificateThatDoesNotExist = all_summaries
 
    # print the summary results
    for i, summaries in enumerate(all_summaries):
        print(f"Summaries for contents_list_of_lists {i + 1}:")
        for j, summary in enumerate(summaries):
            if summary:
                print(f"  Gensim Summary for text {j + 1}:")
                print(f"    {summary}")
                print()
 
    return buildingRegister, registeredCopy, identification, powerOfAttorney, CertificateThatDoesNotExist
 
# example run
buildingRegister, registeredCopy, identification, powerOfAttorney, CertificateThatDoesNotExist = summarize_contents(contents_list_of_lists)
 
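For context, the AssertionError in Problem 2 is raised by konlpy: Okt.pos() asserts that its input is a plain Python str, and after np.pad each content is a NumPy array rather than a string. If you wanted to keep the textrank approach instead of switching to gensim, a minimal sketch would be to feed the original strings and cast defensively inside the tokenizer (an assumed alternative fix, not the post's final code):

from konlpy.tag import Okt
from textrank import KeysentenceSummarizer

okt = Okt()

def okt_tokenizer(sent):
    # konlpy asserts isinstance(sent, str), so make sure it gets one
    return okt.nouns(str(sent))

summarizer = KeysentenceSummarizer(tokenize=okt_tokenizer, min_sim=0.3, verbose=False)

# pass the original string contents, not the np.pad()-ed arrays
for content in contents:
    print(summarizer.summarize([content], topk=3))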


Solution 3)

import pymysql
import pandas as pd
from sqlalchemy import create_engine
 
# check the list lengths and pad them all to the same length
max_length = max(len(buildingRegister), len(registeredCopy), len(identification), len(powerOfAttorney), len(CertificateThatDoesNotExist))
 
buildingRegister.extend([None] * (max_length - len(buildingRegister)))
registeredCopy.extend([None] * (max_length - len(registeredCopy)))
identification.extend([None] * (max_length - len(identification)))
powerOfAttorney.extend([None] * (max_length - len(powerOfAttorney)))
CertificateThatDoesNotExist.extend([None] * (max_length - len(CertificateThatDoesNotExist)))
 
# MySQL connection info
HOSTNAME = 'project-db-stu.ddns.net'
PORT = 3307
DATABASE = 'smhrd_A_5'
USERNAME = 'smhrd_A_5'
PASSWORD = 'smhrd5'
 
 
# create the MySQL engine
CONNECTION_STRING = f'mysql+pymysql://{USERNAME}:{PASSWORD}@{HOSTNAME}:{PORT}/{DATABASE}?charset=utf8mb4'
engine = create_engine(CONNECTION_STRING, echo=False)
 
# build the DataFrame
data = {'buildingRegister': buildingRegister, 'registeredCopy': registeredCopy,
        'identification': identification, 'powerOfAttorney': powerOfAttorney,
        'CertificateThatDoesNotExist': CertificateThatDoesNotExist}
df = pd.DataFrame(data)
 
# save the DataFrame to MySQL
df.to_sql('newsSummary', con=engine, if_exists='replace', index=False)
 

"All arrays must be of the same length"는 판다스 데이터프레임을 생성할 때 모든 배열의 길이가 동일해야 함을 의미합니다. 서로 다른 길이의 배열을 사용하여 데이터프레임을 생성하려고 하면 ValueError가 발생
모든 배열의 길이를 최대 길이로 맞추기 위해 결측값(None)을 추가 후 실행
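
A minimal sketch of the same padding idea on toy data (the column names are illustrative):

import pandas as pd

cols = {'a': [1, 2, 3], 'b': [4]}
# pd.DataFrame(cols) would raise "ValueError: All arrays must be of the same length"

max_len = max(len(v) for v in cols.values())
# pad the shorter lists with None so every column has max_len elements
padded = {k: v + [None] * (max_len - len(v)) for k, v in cols.items()}
df = pd.DataFrame(padded)  # column 'b' becomes [4, None, None]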

 


Solution 4)

web_crawler(search_terms, 1, 2)

The NameError occurs because 'start_page' is not defined at the point of the call. At first glance 'start_page' and 'end_page' do appear to be defined, but those assignments sit inside the body of web_crawler, so they are local to the function and do not exist in the global scope where web_crawler(...) is invoked. The problem is in how the call is made: either pass literals directly, as above, or define the variables in the calling scope first.
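
A minimal sketch of the scoping fix (note that search_terms must also exist in the calling scope; the two-item list below is illustrative):

# define the arguments in the calling (global) scope before the call;
# the assignments inside web_crawler are local and invisible out here
search_terms = [
    "부동산 건축물대장 사기피해",
    "부동산 등기부등본 사기피해",
]
start_page, end_page = 1, 2

web_crawler(search_terms, start_page, end_page)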