Generation

generate function · Wed, 08 Feb 2023

def get_needed_posts(query):
    """Collect search results for every search term in *query*.

    For each term, the page at ``parseurl + 'search/'`` is requested with the
    term as the ``q`` parameter, and each
    ``<article class="tm-articles-list__item">`` element is turned into one
    row. Articles whose title, date or link cannot be extracted are skipped.

    No de-duplication is performed: an article matched by several terms
    appears once per match.

    Args:
        query: Iterable of search strings; one HTTP request is made per item.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date``, ``title`` and
        ``link`` (links prefixed with ``https://habr.com``), one row per
        parsed article in the order encountered, indexed 0..n-1. The columns
        are present even when nothing was found.
    """
    # NOTE: `parseurl` is a module-level name defined outside this snippet.
    search_url = parseurl + 'search/'
    rows = []
    for q in query:
        req = requests.get(search_url, params={'q': q})
        time.sleep(0.3)  # short pause after each request (presumably rate limiting)
        soup = BeautifulSoup(req.text)
        for article in soup.find_all('article', class_='tm-articles-list__item'):
            heading = article.find('h2', class_='tm-article')
            try:
                title = heading.text
                date = article.find('span', class_='tm-article').text.strip()
                link = heading.find('a').get('href')
            except AttributeError:
                # A find() call returned None, so this article lacks an
                # expected element. Skip it rather than fall through and
                # re-append the previous article's values.
                continue
            if link is None:
                # Anchor without an href: there is nothing to link to.
                continue
            rows.append({
                'date': date,
                'title': title,
                'link': 'https://habr.com' + link,
            })
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=['date', 'title', 'link'])


res = get_needed_posts(query)
res

# Modify the Python function above so that if it finds a repeated title or link, it will not add it to the return

def get_needed_posts(query): site = pd.DataFrame() for q in query: URL = parseurl+'search/' params = { 'q': q } req = requests.get(URL, params=params) time.sleep(0.3) soup = BeautifulSoup(req.text) articles = soup.find_all('article', class_='tm-articles-list__item') for article in articles: try: title = article.find('h2', class_='tm-article').text date = article.find('span', class_='tm-article').text.strip() link = article.find('h2', class_='tm-article').find('a').get('href') if title not in site.title.values and link not in site.link.values: row = {'date': date, 'title': title, 'link': 'https://habr.com'+link} site = pd.concat([site

Questions about programming? Chat with your personal AI assistant