Problem 1)
```python
import re
import requests
from bs4 import BeautifulSoup
import numpy as np

# get_news_data() is unchanged from before
titles, contents = get_news_data()

# Convert contents to a NumPy array
contents_array = np.array(contents)

print("titlesbuildingRegister:", titles[0])
print("titlesregisteredCopy:", titles[1])
print("titlesidentification:", titles[2])
print("titlespowerOfAttorney:", titles[3])
print("titlesCertificateThatDoesNotExist:", titles[4])

print("contentsbuildingRegister:", contents_array[0])
print("contentsregisteredCopy:", contents_array[1])
print("contentsidentification:", contents_array[2])
print("contentspowerOfAttorney:", contents_array[3])
print("contentsCertificateThatDoesNotExist:", contents_array[4])
```

```
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[85], line 7
      4 import numpy as np
      6 # Convert contents to a NumPy array
----> 7 contents_array = np.array(contents)
      9 print("titlesbuildingRegister:", titles[0])
     10 print("titlesregisteredCopy:", titles[1])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.
```
Problem 2)
```python
import re
import requests
from bs4 import BeautifulSoup
import numpy as np
from konlpy.tag import Okt
from textrank import KeysentenceSummarizer

# get_news_data() is unchanged from before
titles, contents = get_news_data()

# Pad every element of contents to the length of the longest element
max_len = max(len(c) for c in contents)
contents_padded = [np.pad(c, (0, max_len - len(c)), 'constant') for c in contents]

# Convert contents_padded to a NumPy array
contents_array = np.array(contents_padded)

# Tokenizer
def okt_tokenizer(sent):
    okt = Okt()
    words = okt.nouns(sent)
    return words

# Create the KeysentenceSummarizer
summarizer = KeysentenceSummarizer(tokenize=okt_tokenizer, min_sim=0.3, verbose=False)

# Generate summaries
summaries = []
for content in contents_array:
    summary = summarizer.summarize([content], topk=3)
    summaries.append(summary)

# Print the summaries
for i, summary in enumerate(summaries):
    print(f"Summary for text {i + 1}:")
    for sentence in summary:
        print(sentence)
    print()
```

```
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[98], line 30
     28 summaries = []
     29 for content in contents_array:
---> 30     summary = summarizer.summarize([content], topk=3)
     31     summaries.append(summary)
     34 # Print the summaries

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\summarizer.py:194, in KeysentenceSummarizer.summarize(self, sents, topk, bias)
    191 elif bias is not None:
    192     raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))
--> 194 self.train_textrank(sents, bias)
    195 idxs = self.R.argsort()[-topk:]
    196 keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\summarizer.py:157, in KeysentenceSummarizer.train_textrank(self, sents, bias)
    143 def train_textrank(self, sents, bias=None):
    144     """
    145     Arguments
    146     ---------
    (...)
    155     None
    156     """
--> 157     g = sent_graph(sents, self.tokenize, self.min_count,
    158         self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
    159     self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
    160     if self.verbose:

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\sentence.py:42, in sent_graph(sents, tokenize, min_count, min_sim, similarity, vocab_to_idx, verbose)
     14 """
     15 Arguments
     16 ---------
    (...)
     38     shape = (n sents, n sents)
     39 """
     41 if vocab_to_idx is None:
---> 42     idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
     43 else:
     44     idx_to_vocab = [vocab for vocab, _ in sorted(vocab_to_idx.items(), key=lambda x:x[1])]

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\utils.py:24, in scan_vocabulary(sents, tokenize, min_count)
      6 def scan_vocabulary(sents, tokenize=None, min_count=2):
      7     """
      8     Arguments
      9     ---------
    (...)
     22         Vocabulary to index mapper.
     23     """
---> 24     counter = Counter(w for sent in sents for w in tokenize(sent))
     25     counter = {w:c for w,c in counter.items() if c >= min_count}
     26     idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x:-x[1])]

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\collections\__init__.py:552, in Counter.__init__(self, iterable, **kwds)
    541 '''Create a new, empty Counter object.  And if given, count elements
    542 from an input iterable.  Or, initialize the count from another mapping
    543 of elements to their counts.
    (...)
    549
    550 '''
    551 super(Counter, self).__init__()
--> 552 self.update(iterable, **kwds)

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\collections\__init__.py:637, in Counter.update(self, iterable, **kwds)
    635         super(Counter, self).update(iterable)  # fast path when counter is empty
    636     else:
--> 637         _count_elements(self, iterable)
    638 if kwds:
    639     self.update(kwds)

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\textrank\utils.py:24, in <genexpr>(.0)
      6 def scan_vocabulary(sents, tokenize=None, min_count=2):
      7     """
      8     Arguments
      9     ---------
    (...)
     22         Vocabulary to index mapper.
     23     """
---> 24     counter = Counter(w for sent in sents for w in tokenize(sent))
     25     counter = {w:c for w,c in counter.items() if c >= min_count}
     26     idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x:-x[1])]

Cell In[98], line 21, in okt_tokenizer(sent)
     19 def okt_tokenizer(sent):
     20     okt = Okt()
---> 21     words = okt.nouns(sent)
     22     return words

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\konlpy\tag\_okt.py:83, in Okt.nouns(self, phrase)
     80 def nouns(self, phrase):
     81     """Noun extractor."""
---> 83     tagged = self.pos(phrase)
     84     return [s for s, t in tagged if t == 'Noun']

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\konlpy\tag\_okt.py:69, in Okt.pos(self, phrase, norm, stem, join)
     58 def pos(self, phrase, norm=False, stem=False, join=False):
     59     """POS tagger.
     60     In contrast to other classes in this subpackage,
     61     this POS tagger doesn't have a `flatten` option,
    (...)
     67     :param join: If True, returns joined sets of morph and tag.
     68     """
---> 69 validate_phrase_inputs(phrase)
     71 tokens = self.jki.tokenize(
     72     phrase,
     73     jpype.java.lang.Boolean(norm),
     74     jpype.java.lang.Boolean(stem)).toArray()
     75 if join:

File c:\Users\smhrd\anaconda3\envs\p38_yolov5\lib\site-packages\konlpy\tag\_common.py:20, in validate_phrase_inputs(phrase)
     14 """validate if phrase input is provided in str format
     15
     16 Args:
     17     phrase (str): phrase input
     18 """
     19 msg = "phrase input should be string, not %s" % type(phrase)
---> 20 assert isinstance(phrase, basestring), msg

AssertionError: phrase input should be string, not <class 'numpy.ndarray'>
```
Problem 3)
```python
# MySQL connection info
HOSTNAME = 'project-db-stu.ddns.net'
PORT = 3307
DATABASE = 'smhrd_A_5'
USERNAME = 'smhrd_A_5'
PASSWORD = 'smhrd5'

list_of_lists = [buildingRegister, registeredCopy, identification,
                 powerOfAttorney, CertificateThatDoesNotExist]

# Create the MySQL engine
CONNECTION_STRING = f'mysql+pymysql://{USERNAME}:{PASSWORD}@{HOSTNAME}:{PORT}/{DATABASE}?charset=utf8mb4'
engine = create_engine(CONNECTION_STRING, echo=False)

# Convert to DataFrames
for i in range(list_of_lists):
if buildingRegister == list_of_lists:
    buildingRegister = pd.DataFrame({'buildingRegister': results}, index=[f"file_{i}" for i in buildingRegister])
    buildingRegister.to_sql('newsSummary', con=engine, if_exists='replace', index=False)
elif registeredCopy == list_of_lists:
    registeredCopy = pd.DataFrame({'identification': results}, index=[f"file_{i}" for i in registeredCopy])
    registeredCopy.to_sql('newsSummary', con=engine, if_exists='replace', index=False)
elif identification == list_of_lists:
    identification = pd.DataFrame({'identification': results}, index=[f"file_{i}" for i in identification])
    identification.to_sql('newsSummary', con=engine, if_exists='replace', index=False)
elif powerOfAttorney == list_of_lists:
    powerOfAttorney = pd.DataFrame({'identification': results}, index=[f"file_{i}" for i in powerOfAttorney])
    powerOfAttorney.to_sql('newsSummary', con=engine, if_exists='replace', index=False)
elif CertificateThatDoesNotExist == list_of_lists:
    CertificateThatDoesNotExist = pd.DataFrame({'identification': results}, index=[f"file_{i}" for i in CertificateThatDoesNotExist])
    CertificateThatDoesNotExist.to_sql('newsSummary', con=engine, if_exists='replace', index=False)
```

```
  Cell In[201], line 15
    if buildingRegister == list_of_lists:
    ^
IndentationError: expected an indented block
```
An indentation error: the if statement that follows the for loop header is not indented, so Python stops at parse time with "expected an indented block". A corrected skeleton follows.
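For reference, every header that ends in a colon must be followed by an indented block. A minimal corrected sketch of the loop, which also swaps range(list_of_lists) (itself a bug, since range needs an integer, not a list) for enumerate; the two stand-in lists here are hypothetical:

```python
# Hypothetical stand-ins for the five result lists
buildingRegister, registeredCopy = ["a"], ["b"]
list_of_lists = [buildingRegister, registeredCopy]

for i, lst in enumerate(list_of_lists):   # enumerate, not range(list_of_lists)
    if lst is buildingRegister:           # body indented under both for and if
        print(f"file_{i}: building register")
    else:
        print(f"file_{i}: other document")
```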
Problem 4)
```python
def web_crawler(search_terms, start_page, end_page):
    # Libraries needed for crawling
    from bs4 import BeautifulSoup
    import requests
    import time
    import schedule
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from webdriver_manager.chrome import ChromeDriverManager
    import urllib.parse

    # WebDriver options
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    # Lists to hold the results for each search term
    buildingRegister = []
    registeredCopy = []
    identification = []
    powerOfAttorney = []
    CertificateThatDoesNotExist = []
    search_results = []
    search_results_dict = {}

    def makeUrl(search, page, page2):
        search_urls = []
        base_url = "https://search.naver.com/search.naver?where=news&query="
        search = urllib.parse.quote(search)
        for p in range(page, page2 + 1):
            start = (p - 1) * 10 + 1
            search_url = base_url + search + "&start=" + str(start)
            search_urls.append(search_url)
        return search_urls

    # Search terms: (부동산 건축물대장 사기피해, 부동산 등기부등본 사기피해, 부동산 신분증 사기피해,
    #                부동산 위임장 사기피해, 후견등기사항 부존재증명서 피해)
    search_terms = [
        "부동산 건축물대장 사기피해",
        "부동산 등기부등본 사기피해",
        "부동산 신분증 사기피해",
        "부동산 위임장 사기피해",
        "후견등기사항 부존재증명서 피해",
    ]
    start_page = 1
    end_page = 2

    def job(search):
        # Page to start crawling from
        page = int(1)
        # page = int(input("\n크롤링할 시작 페이지를 입력해주세요. ex)1(숫자만입력):"))
        print("\n크롤링할 시작 페이지: ", page, "페이지")

        # Page to stop crawling at
        page2 = int(2)
        # page2 = int(input("\n크롤링할 종료 페이지를 입력해주세요. ex)1(숫자만입력):"))
        print("\n크롤링할 종료 페이지: ", page2, "페이지")

        # Build the Naver search URLs
        search_urls = makeUrl(search, page, page2)

        ## Collect only Naver News articles with Selenium ##
        # Use the Chrome browser installed on the OS, regardless of version
        driver = webdriver.Chrome(ChromeDriverManager().install())
        driver.implicitly_wait(3)

        # Load each search page with Selenium
        # naver_urls=[]
        for i in search_urls:
            driver.get(i)
            time.sleep(1)  # wait time, adjustable

            # Click each article to collect its title and body #
            # Gather the CSS selectors for Naver News articles
            a = driver.find_elements(By.CSS_SELECTOR, 'a.info')

            # Click each collected element to get the article URL
            for i in a:
                i.click()

                # Switch to the current tab
                driver.switch_to.window(driver.window_handles[1])
                time.sleep(3)  # wait time, adjustable

                # Keep only news.naver.com URLs
                url = driver.current_url
                print(url)

                if "news.naver.com" in url:
                    naver_urls.append(url)
                else:
                    pass

                # (article-scraping code goes here)

                # Close the current tab
                driver.close()

                # Switch back to the first tab (very important!!!)
                driver.switch_to.window(driver.window_handles[0])

        search_results_dict[search] = naver_urls

    # Call 'job' once for each term and set up the schedule
    for search in search_terms:
        job(search)

web_crawler(search_terms, start_page, end_page)
```

```
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[262], line 1
----> 1 web_crawler(search_terms, start_page, end_page)

NameError: name 'start_page' is not defined
```
Solution 1)
```python
import re
import requests
from bs4 import BeautifulSoup
import numpy as np

# get_news_data() is unchanged from before
titles, contents = get_news_data()

# Pad every element of contents to the length of the longest element
max_len = max(len(c) for c in contents)
contents_padded = [np.pad(c, (0, max_len - len(c)), 'constant') for c in contents]

# Convert contents_padded to a NumPy array
contents_array = np.array(contents_padded)

print("contentsbuildingRegister:", contents_array[0])
print("contentsregisteredCopy:", contents_array[1])
print("contentsidentification:", contents_array[2])
print("contentspowerOfAttorney:", contents_array[3])
print("contentsCertificateThatDoesNotExist:", contents_array[4])
```
A NumPy array must have a homogeneous shape, so every element of the list being converted has to be a sequence of the same length. Here the five scraped contents entries have different lengths, which is why np.array(contents) raises the ValueError ("inhomogeneous shape after 1 dimensions"). The fix is to pad every element out to the length of the longest one and only then convert the padded list to an array; a minimal repro follows.
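A minimal sketch of both the failure and the padding fix; the small nested lists here are made-up stand-ins for the scraped contents:

```python
import numpy as np

ragged = [[1, 2, 3], [4, 5]]   # elements of different lengths
# np.array(ragged)             # raises ValueError: inhomogeneous shape after 1 dimensions
                               # (an error since NumPy 1.24; older versions warned instead)

max_len = max(len(c) for c in ragged)
padded = [np.pad(c, (0, max_len - len(c)), 'constant') for c in ragged]
print(np.array(padded))        # shape (2, 3): [[1 2 3] [4 5 0]]
```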
Solution 2)
```python
import numpy as np
from gensim.summarization import summarize  # needed for summarize() below; the original snippet omitted this import

def summarize_contents(contents_list_of_lists):
    # Convert each list of article bodies to a NumPy array
    results_array = [np.array(contents) for contents in contents_list_of_lists]

    # Generate summaries
    all_summaries = []
    for contents in results_array:
        summaries = []
        for content in contents:
            try:
                summary = summarize(content, ratio=0.5)  # extractive summary via gensim
                summary = summary[:300]                  # truncate the summary to 300 characters
                summaries.append(summary)
            except Exception as e:
                print(f"Error summarizing text: {e}")
                summaries.append(None)
        all_summaries.append(summaries)

    buildingRegister, registeredCopy, identification, powerOfAttorney, CertificateThatDoesNotExist = all_summaries

    # Print the summaries
    for i, summaries in enumerate(all_summaries):
        print(f"Summaries for contents_list_of_lists {i + 1}:")
        for j, summary in enumerate(summaries):
            if summary:
                print(f"  Gensim Summary for text {j + 1}:")
                print(f"  {summary}")
        print()

    return buildingRegister, registeredCopy, identification, powerOfAttorney, CertificateThatDoesNotExist

# Example run (contents_list_of_lists comes from the earlier crawling step)
buildingRegister, registeredCopy, identification, powerOfAttorney, CertificateThatDoesNotExist = summarize_contents(contents_list_of_lists)
```
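For context, the AssertionError in Problem 2 was raised by konlpy's input check ("phrase input should be string"): after padding, each row of contents_array is a NumPy array rather than a plain Python str, and Okt.nouns() refuses anything that is not a string. Solution 2 sidesteps konlpy and textrank entirely and uses gensim's extractive summarize(), which takes a plain string. A minimal usage sketch with made-up text; note that gensim.summarization was removed in gensim 4.0, so this requires gensim < 4.0:

```python
from gensim.summarization import summarize  # gensim < 4.0 only

# Hypothetical multi-sentence input; summarize() ranks sentences, so it needs
# several to choose from. Very short inputs trigger a gensim warning and can
# come back empty; real article bodies are long enough in practice.
text = (
    "Real estate fraud cases often involve forged documents. "
    "Checking the building register can reveal unauthorized alterations. "
    "A certified copy of the register should match the landlord's ID. "
    "Victims frequently skipped one of these verification steps."
)
print(summarize(text, ratio=0.5))  # returns roughly half of the sentences
```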
Solution 3)
```python
import pandas as pd  # the original snippet used pd without importing it
import pymysql
from sqlalchemy import create_engine

# Check the list lengths and pad them all to the same length
max_length = max(len(buildingRegister), len(registeredCopy), len(identification),
                 len(powerOfAttorney), len(CertificateThatDoesNotExist))

buildingRegister.extend([None] * (max_length - len(buildingRegister)))
registeredCopy.extend([None] * (max_length - len(registeredCopy)))
identification.extend([None] * (max_length - len(identification)))
powerOfAttorney.extend([None] * (max_length - len(powerOfAttorney)))
CertificateThatDoesNotExist.extend([None] * (max_length - len(CertificateThatDoesNotExist)))

# MySQL connection info
HOSTNAME = 'project-db-stu.ddns.net'
PORT = 3307
DATABASE = 'smhrd_A_5'
USERNAME = 'smhrd_A_5'
PASSWORD = 'smhrd5'

# Create the MySQL engine
CONNECTION_STRING = f'mysql+pymysql://{USERNAME}:{PASSWORD}@{HOSTNAME}:{PORT}/{DATABASE}?charset=utf8mb4'
engine = create_engine(CONNECTION_STRING, echo=False)

# Build the DataFrame
data = {'buildingRegister': buildingRegister,
        'registeredCopy': registeredCopy,
        'identification': identification,
        'powerOfAttorney': powerOfAttorney,
        'CertificateThatDoesNotExist': CertificateThatDoesNotExist}
df = pd.DataFrame(data)

# Save the DataFrame to MySQL
df.to_sql('newsSummary', con=engine, if_exists='replace', index=False)
```
"All arrays must be of the same length"는 판다스 데이터프레임을 생성할 때 모든 배열의 길이가 동일해야 함을 의미합니다. 서로 다른 길이의 배열을 사용하여 데이터프레임을 생성하려고 하면 ValueError가 발생
모든 배열의 길이를 최대 길이로 맞추기 위해 결측값(None)을 추가 후 실행
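A minimal sketch of the failure and the padding fix; the two short lists are hypothetical:

```python
import pandas as pd

a = ["x", "y", "z"]
b = ["u"]
# pd.DataFrame({"a": a, "b": b})       # ValueError: All arrays must be of the same length

max_len = max(len(a), len(b))
b.extend([None] * (max_len - len(b)))  # pad the shorter column with None
df = pd.DataFrame({"a": a, "b": b})    # 3 rows; padded cells become missing values
print(df)
```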
Solution 4)
The NameError raised by web_crawler(search_terms, start_page, end_page) means that start_page is not defined at the point of the call. The code in Problem 4 does assign start_page and end_page, but only inside the body of web_crawler, so those names are local to the function and do not exist in the cell that makes the call (search_terms evidently survived from an earlier cell, which is why the error names start_page first). Defining the variables in the calling scope, or passing literals such as web_crawler(search_terms, 1, 2), resolves the error, as sketched below.
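A minimal sketch of the fix, assuming the function body stays as in Problem 4: define the arguments in the calling scope before the call is evaluated.

```python
# Define the arguments at module level, outside web_crawler,
# so the names exist when the call is evaluated.
search_terms = [
    "부동산 건축물대장 사기피해",
    "부동산 등기부등본 사기피해",
    "부동산 신분증 사기피해",
    "부동산 위임장 사기피해",
    "후견등기사항 부존재증명서 피해",
]
start_page = 1
end_page = 2

web_crawler(search_terms, start_page, end_page)  # equivalently: web_crawler(search_terms, 1, 2)
```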