Generation

generate function — Wed, 08 Feb 2023

def get_needed_posts(query): for article in articles: try: title = article.find('h2', class_='tm-article').text date = article.find('span', class_='tm-article').text.strip() link = article.find('h2', class_='tm-article').find('a').get('href') except: pass row = {'date': date, 'title': title, 'link': 'https://habr.com'+link} site = pd.concat([site, pd.DataFrame([row])]) return site.reset_index(drop=True) Modify the Python function above so that an article whose title or link has already been seen is skipped and not added to the returned DataFrame.

def get_needed_posts(query):
    """Search Habr posts for ``query`` and return the unique hits.

    Fetches the Habr post-search page for ``query``, reads every
    ``<article class="post post_preview">`` inside the ``content-list``
    container and collects its date, title and absolute link.

    An article is skipped when:

    * its title heading, time span, anchor or ``href`` is missing
      (previously this either raised ``NameError`` or silently re-added
      the previous article's data), or
    * its title or its link has already been collected.

    Args:
        query: Search text; it is passed as the ``q`` URL parameter and
            URL-encoded by ``requests``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date``, ``title`` and
        ``link`` and a fresh 0..n-1 index. The frame is empty (but keeps
        its columns) when the page has no ``content-list`` container or
        no usable articles.
    """
    columns = ['date', 'title', 'link']

    # Let requests build the query string so characters such as '&' or
    # '#' in ``query`` cannot corrupt the URL.
    response = requests.get(
        'https://habr.com/ru/search/',
        params={'target_type': 'posts', 'q': query},
    )
    soup = BeautifulSoup(response.text, 'lxml')

    content = soup.find('div', class_='content-list')
    if content is None:
        # No result container on the page: nothing to report.
        return pd.DataFrame(columns=columns)

    rows = []
    seen_titles = set()
    seen_links = set()
    for article in content.find_all('article', class_='post post_preview'):
        heading = article.find('h2', class_='post__title')
        timestamp = article.find('span', class_='post__time')
        anchor = heading.find('a') if heading is not None else None
        href = anchor.get('href') if anchor is not None else None
        if timestamp is None or href is None:
            # Incomplete markup: skip instead of reusing stale values.
            continue

        title = heading.text
        link = 'https://habr.com' + href
        if title in seen_titles or link in seen_links:
            # Repeated title or link: keep only the first occurrence.
            continue
        seen_titles.add(title)
        seen_links.add(link)
        rows.append({
            'date': timestamp.text.strip(),
            'title': title,
            'link': link,
        })

    # Build the frame once; concatenating inside the loop is quadratic.
    return pd.DataFrame(rows, columns=columns)

Questions about programming? Chat with your personal AI assistant