class Designer(Employee):
def __init__(self, name, seniority, awards=2):
super().__init__(name, seniority)
self.intlawards = awards
def check_if_it_is_time_for_upgrade(self):
# for each accreditation, increase the counter by 1
# for now we assume that all designers are accredited
self.seniority += 1
# condition for promoting an employee from the presentation
        if (self.seniority + self.intlawards * 2) % 7 == 0:
self.grade_up()
# publication of the results
return self.publish_grade()
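The Employee base class is not included in this snippet; below is a minimal sketch of what it is assumed to provide (name, seniority, a grade counter, grade_up and publish_grade) so the Designer example above can actually run.
# Assumed minimal Employee base class (not shown in the original snippet).
class Employee:
    def __init__(self, name, seniority):
        self.name = name
        self.seniority = seniority
        self.grade = 1
    def grade_up(self):
        # raise the employee's grade by one step
        self.grade += 1
    def publish_grade(self):
        # report the employee's current grade
        print(self.name, self.grade)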
def find_duplicates(lst):
new_lst = []
for i in lst:
if i[0] not in new_lst:
new_lst.append(i[0])
else:
i[0] = "9090"
return new_lst
find_duplicates(lst)
df_new['incident_type'].value_counts()[df_new['incident_type'].value_counts() >= 5].sort_values(ascending = False)
def checkio(str):
cnt = 0
for i in str.split():
cnt = cnt + 1 if i.isalpha() else 0
if cnt >= 3:
return True
return False
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv('data.csv')
df.sort_values('perc_of_5star', inplace=True)
df.reset_index(inplace=True)
plt.hlines(y=df.decade, xmin=0, xmax=df.perc_of_5star, color='skyblue')
plt.gca().invert_yaxis()
plt.show()
import numpy as np
import pandas as pd
df=pd.DataFrame(data={'rate_group':['A%','B%', 'C']})
df['rate_group'] = df['rate_group'].str.replace('%', '')
df['rate_group']=df['rate_group'].replace(r'^\s*$', np.nan, regex=True)
print(df['rate_group'])
def validate_usr(username):
import re
if len(username) < 4 or len(username) > 16:
return False
else:
return bool(re.match("^[a-z0-9_]*$", username))
import pandas as pd
sl = [0.05, 0.4, 0.5, 0.95]
sw = [0.7, 0.8, 0.3, 0.9]
data = {"sl": sl, "sw": sw}
def name_list(group, dct):
lst = []
for key, value in dct.items():
if value[-1] == group:
lst.append(value[0] + ' ' + value[1] + ' ' + value[2])
lst.sort()
for i in range(len(lst)):
print('{}. {}'.format(i + 1, lst[i]))
name_list('BST161', dct)
def replace(students, stnums, replacing_num):
for student in students:
if student[0] in stnums:
student[0] = replacing_num
return students
from math import sqrt
def conf_interval(n, mean, sig, conf):
z_value = abs(conf - mean)
sq_n = sqrt(n)
interval = z_value / sq_n
return interval
conf_interval(3, 1, 2, 1)
# Solution 1
df.groupby(['userId'])['rating'].sum().sort_values(ascending = False).head()
# Solution 2
df.groupby(['userId'])['rating'].sum().sort_values(ascending = False).head().reset_index()
def goes_after(word, first, second):
for i in range(len(word) - 1):
if word[i] == first and word[i+1] == second:
return True
return False
import numpy as np
def system_solver(a, b):
return np.linalg.solve(a, b)
a = np.array([[4, 2, 1], [1, 3, 0], [0, 5, 4]])
b = np.array([4, 12, -3])
system_solver(a, b)
def fill_na(df, col_name='rectal_temp', range_thresh=0.2):
col_vals = df[col_name]
col_vals_notnull_ind = col_vals.notnull()
col_vals_notnull_ind = col_vals_notnull_ind.values[:, np.newaxis]
col_vals_notnull = col_vals.loc[col_vals_notnull_ind].values
col_vals_notnull_idx = col_vals_notnull_ind.nonzero()[0]
for i in range(0, col_vals.shape[0]):
if not col_vals_notnull_ind[i]:
for j in range(0, col_vals_notnull.shape[0]):
if (col_vals_notnull[j] - col_vals_notnull[j] * range_thresh) < col_vals[i] < (col_vals_notnull[j] + col_vals_notnull[j] * range_
def convert_to_dict(students):
students_dict = {}
for student in students:
students_dict[student[0]] = student[1:]
return students_dict
convert_to_dict(students)
def group_by_rating(df):
grouped = df.groupby('decade')
result = {}
for decade, group in grouped:
result[decade] = (group.rating == 5.0).sum() / len(group)
return result
def max_value(dct):
value = 0
for k, v in dct.items():
if v["Value"] > value:
key = k
value = v["Value"]
return key
max_value(dct)
def find_non_numbers(data, column):
return data[~pd.to_numeric(data[column], errors='coerce').notnull()]
#OR
import re
def find_non_numbers(data, column):
return data[~data[column].map(lambda x: bool(re.search(r'[a-zA-Z]', x)))]
def replace_sec(stnums, students, replacing_num):
for stnum in stnums:
num = 0
for student in students:
if stnum == student[0]:
num += 1
if num > 1:
student[0] = replacing_num
return students
replace_sec(stnums, students, replacing_num)
# Function that takes a pandas DataFrame, drops duplicate names, and returns it sorted by name
import pandas as pd
def merge_df(df):
df.drop_duplicates(subset = ["name"],
keep = 'first', inplace = True)
return df.sort_values(by = 'name')
df = pd.read_csv('test_data.csv')
merge_df(df)
def hotpo(n):
steps = 0
while n > 1:
if n % 2 == 0:
n /= 2
else:
n = 3 * n + 1
steps += 1
return steps
def add_777999_555333_11110_77_7_00_0(a):
if a == 777:
return 200
elif a == 999:
return 100
elif a == 555:
return 50
elif a == 333:
return 15
elif a == 111:
return 10
elif a%10 == 7 and a//10%10 == 7:
return 5
elif a%10 == 7:
return 3
elif a//10%10 == 0 and a//100%10 == 0:
return 2
elif a//10%10 == 0:
return 1
else:
return 0
def find_score(num_trials):
money = 0
successes = 0
for i in range(num_trials):
money += add_777999_555333_11110_77_7_00_0(np.random.randint(1, 1000))
successes += 1
return money, successes
def sale_hotdogs(n):
return (n < 5) * (n * 100) + (n >= 5 and n < 10) * (n * 95) + (n >= 10) * (n * 90)
def add(a, b):
return a + b
add(1, 2)
def guess_blue(blue_start, red_start, blue_pulled, red_pulled):
return (blue_start - blue_pulled) / (blue_start - blue_pulled + red_start - red_pulled)
def get_needed_posts(query):
site = pd.DataFrame(columns=['date', 'title', 'link'])
for q in query:
URL = parseurl+'search/'
params = {
'q': q
}
req = requests.get(URL, params=params)
soup = BeautifulSoup(req.text)
articles = soup.find_all('article', class_='tm-articles')
for article in articles:
try:
title = article.find('h2', class_='tm-article').text
date = article.find('span', class_='tm-article').text.strip()
link = article.find('h2', class_='tm-article').find('a').get('href')
            except AttributeError:
                continue
            if link not in site.link.values:
                row = {'date': date, 'title': title, 'link': 'https://habr.com'+link}
                site = pd.concat([site, pd.DataFrame([row])], ignore_index=True)
    return site
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
gdpdiff = gdpdiff.dropna(subset=['GDP per capita'])
gdpdiff = gdpdiff[gdpdiff['GDP per capita'] > 0]
gdpdiff = gdpdiff.sort_values(by='GDP per capita')
gdpdiff['Country or region'] = gdpdiff['Country or region'].replace({
    'United States': 'USA',
    'United Kingdom': 'UK'
})
gdpdiff.plot(x='Country or region', y='GDP per capita', kind='bar')
import plotly.express as px
fig = px.bar(y = label, x = counts)
fig.show()
import pandas as pd
ratings = pd.read_csv('ratings.csv')
print(ratings.head())
# Create a function which return the average lifetime of users
def lifetime(group):
return group.max() - group.min()
lifetime_users = ratings.groupby('userId').agg(lifetime)
print(lifetime_users)
average_lifetime = lifetime_users['timestamp'].mean()
print(average_lifetime)
# output:
# userId movieId rating timestamp
# 0 1 31 2.5 1260759144
# 1 1 1029 3.0 1260759179
# 2 1 1061 3.0 1260759182
# 3 1 1129 2.0 1260759185
# 4 1 1172 4.0 1260759205
# timestamp
# userId
# 1 203560
# 2 866607
# 3 8
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv(filename)
df.plot(kind='bar',x='performer',y='number_of_hits')
plt.show()
import pandas as pd
df = pd.read_csv('artist_song_chart_debut.csv')
df['chart_debut'] = df['chart_debut'].apply(lambda x: str(x)[:4])
df
df_hot_years = df_ru[df_ru["av_temp"] > 12]["year"]
print(df_hot_years)
def sum_of_differences(arr):
if len(arr) <= 1:
return 0
arr.sort(reverse=True)
return sum(arr[i] - arr[i + 1] for i in range(len(arr) - 1))
from scipy.stats import norm

def check_sample_size(conf=0.95, delta=0.05, sigsqr=225):
    """
    conf - confidence level
    delta - margin of error
    sigsqr - variance
    """
p = 1 - ((1 - conf) / 2)
z = norm.ppf(p)
n = (z ** 2) * sigsqr / delta ** 2
return int(n)
print(check_sample_size())
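For the default arguments this should come out to roughly n ≈ 1.96² · 225 / 0.05² ≈ 345,700 (using z = norm.ppf(0.975) ≈ 1.96), which is the order of magnitude the print above is expected to show.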
import re
def rate_group(int_rate):
int_rate = re.sub('%', '', int_rate)
int_rate = float(int_rate)
if int_rate > 15.0:
return '>15'
elif 10.0 < int_rate <= 15.0:
return '10-15'
else:
return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
from scipy.stats import norm

def interval(n, mean, sig, conf):
    z = norm.ppf((1 + conf) / 2)
    h = sig / (n ** (1/2)) * z
return round(h*2)
fig, ax = plt.subplots()
df.plot(kind='scatter',
x='User_Score',
y='Global_Sales',
ax=ax)
df.plot(kind='scatter',
x='Critic_Score',
y='Global_Sales',
ax=ax)
plt.ylim(0, 40)
df.groupby(level=0).cumcount()
def function(x, y, z):
return x + y
function(1, 2, 3)
def glue_list(lst):
glue_string = ''
for i in lst:
glue_string += str(i)
return glue_string
print(glue_list([0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1]))
def money_add(lst):
    money = 0
    for i in lst:
        if i >= 100:
            money += 1
    return money
def date_range(start_date, end_date):
lst = []
if start_date > end_date:
return []
else:
try:
start = (dt.strptime(start_date, '%Y-%m-%d'))
end = (dt.strptime(end_date, '%Y-%m-%d'))
while start <= end:
lst.append(start.strftime('%Y-%m-%d'))
start += td(days=1)
return lst
except:
return []
date_range('2022-12-01', '2022-12-05')
def group_list(dct, gr):
group_list = []
for k, v in dct.items():
if gr in v:
group_list.append(' '.join(dct[k][0:3]))
group_list.sort()
return group_list
group_list(dct, 'BST161')
def bar_plot(question_text, title, y_title, name):
question = df[question_text].value_counts()
label = question.index
counts = question.values
fig = px.bar(x=counts, y=label, orientation='h')
fig.update_layout(title_text=title)
fig.update_yaxes(title_text=y_title)
fig.update_xaxes(title_text=name)
fig.show()
question_text = 'What is the most preferred working environment for you.'
title = 'Какая рабочая среда для вас наиболее предпочтительна?'
y_title = 'Ответы'
name = 'Количество ответов'
bar_plot(question_text, title, y_title, name)
def year_leaders(df):
df = df.copy()
    df.drop(columns=['time_on_chart', 'consecutive_weeks', 'decade'], inplace=True)
years = df.chart_debut.unique()
year_leaders_df = pd.DataFrame()
for year in years:
year_df = df[df.chart_debut == year]
year_max_row = year_df.loc[year_df.num_of_hits.idxmax()]
        year_leaders_df = pd.concat([year_leaders_df, year_max_row.to_frame().T], ignore_index=True)
return year_leaders_df
for i in winnums:
    print(i)
    i += 1
    print(i)
paid['App'].groupby(paid['Type']).sum()\
.plot(kind='pie',
figsize=(5, 6),
autopct='%1.1f%%',
startangle=90,
title='Процент бесплатных и платных приложений',
legend=True)
def year_leaders(dfp):
dfp_copy = dfp.copy()
dfp_copy.sort_values(by=['chart_debut','num_of_hits'], ascending=False, inplace=True)
dfp_copy.drop_duplicates(subset='chart_debut', keep='first', inplace=True)
dfp_copy.set_index('chart_debut', inplace=True)
return dfp_copy
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
df_scaled = std_scaler.fit_transform(df)
print(df_scaled)
def total_ingredients(cook_book):
    dish = 'salad'
    portions = 5
    for ingredient in cook_book[dish]:
        grams = ingredient['quantity'] * portions
        print(grams)
cook_book = {'salad': [{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'g'},
{'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pcs'},
{'ingridient_name': 'cucumbers', 'quantity': 100, 'measure': 'g'}]}
total_ingredients(cook_book)
from datetime import datetime as dt
from datetime import timedelta as td
def date_range(start_date, end_date):
lst = []
if start_date > end_date:
return []
else:
try:
start = (dt.strptime(start_date, '%Y-%m-%d'))
end = (dt.strptime(end_date, '%Y-%m-%d'))
interval = end - start
for i in range(interval.days + 1):
lst.append(start.strftime('%Y-%m-%d'))
start = start + td(days=1)
return lst
except:
return 'Incorrect date format'
date_range('2022-12-01', '2022-12-20')
def rate_group(value):
if value > 15.00:
return '>15'
elif 10.00 <= value <= 15.00:
return '10-15'
else:
return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
df
def max_dct(dct, key):
return max(dct, key=lambda key: dct[key]['Value'])
dct = {
'value1': {'Value': 3},
'value2': {'Value': 1},
'value3': {'Value': 2},
}
max_dct(dct, 'Value') # returns 'value1'
import math
def sample_size(delta, sigsqr, conf):
return math.ceil( (sigsqr * (math.log(1 / (1 - conf)) / delta)**2) / 2 )
sample_size(0.05, 0.02, 0.95)
def year_leaders(df):
df = df.groupby('chart_debut').apply(lambda x: x.nlargest(1, 'num_of_hits'))
df = df.reset_index()
df = df.drop(columns = ['level_1'])
return df
from sklearn.preprocessing import StandardScaler
def standardize(df):
return df.apply(StandardScaler().fit_transform)
standardize(df)
def years():
return list(range(1950, 2011))
years()
def multiply_dict(key):
grams = 0
cook_book = {'salad': [{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'gr'},
{'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pct'},
{'ingridient_name': 'pepper', 'quantity': 20, 'measure': 'гр'}]}
if key == 'salad':
grams = cook_book.get('salad')[0].get('quantity') * 5
print(grams)
multiply_dict('salad')
def replace_stnum(stnums, students, replacing_num):
for student in students:
for stnum in stnums:
if stnum in student:
student[0] = replacing_num
return students
print(replace_stnum(stnums, students, replacing_num))
import pandas as pd
# create a new dataframe
dfs = df[['song','performer','chart_debut','peak_position','worst_position','time_on_chart','consecutive_weeks']]
# convert the chart_debut to string
dfs['chart_debut'] = dfs['chart_debut'].astype(str)
# get the date
date = dfs.chart_debut.str.split("-", n = 1, expand = True)
# drop the chart_debut
dfs.drop(columns =['chart_debut'], inplace = True)
# insert the date
dfs.insert(2,'chart_debut',date[0])
# show the top 5 rows
print(dfs.head())
def df_to_float(df):
try:
df = df.astype(float)
return df
except:
pass
def cook_book():
portions = 5
for key, value in cook_book.items():
for sub_dict in value:
for v in sub_dict.values():
ingridient_name = v[0]
quantity = v[1]
measure = v[2]
grams = quantity * portions
print(grams)
return grams
def line_graph(dataframe):
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.plot(dataframe.columns, dataframe.iloc[0])
ax.set_xlabel('years')
ax.set_ylabel('columns')
ax.set_title('line graph')
df['profit_perc'] = df['profit'] / df['revenue']
def change_shelf(data, docnum, shelf):
    if shelf not in data:
        return 'ERROR NO SUCH KEY'
    for val in data.values():
        if docnum in val:
            val.remove(docnum)
            data[shelf].append(docnum)
            print('OK')
            return data
    return 'ERROR NO SUCH VALUE'
directories = {
'1': ['2207 876234', '11-2'],
'2': ['10006'],
'3': []
}
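A quick usage sketch of change_shelf with the directories dict above (illustrative call, not part of the original snippet):
# Move document '11-2' from shelf '1' to shelf '3'.
result = change_shelf(directories, '11-2', '3')
# Expected afterwards: directories['1'] == ['2207 876234'] and directories['3'] == ['11-2'].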
def compare_gdp_and_happiness(df1, df2):
df_merge = df1.merge(df2, how='inner', left_on='Country or region', right_on='entity')
df_merge = df_merge[['Country or region', 'GDP per capita', 'happinessScore']]
df_merge['GDP per capita'] = df_merge['GDP per capita'].apply(pd.to_numeric)
df_merge['happinessScore'] = df_merge['happinessScore'].apply(pd.to_numeric)
df_merge_sort = df_merge.sort_values(by='happinessScore', ascending=False)
#df_merge_sort
df_merge_top_1 = df_merge_sort.head(20)[0:1]
    df_merge_top_1 = df_merge_top_1.rename(columns={'Country or region': 'Top1', 'GDP per capita': 'Top1 GDP', 'happinessScore': 'Top1 happinessScore'})
    return df_merge_top_1
def guessBlue(blue_start, red_start, blue_pulled, red_pulled):
return (blue_start - blue_pulled)/(blue_start - blue_pulled + red_start - red_pulled)
def get_year(date):
return int(date[:4])
def get_month(date):
return int(date[5:7])
def get_day(date):
return int(date[8:])
df['year'] = df['date'].apply(get_year)
df['month'] = df['date'].apply(get_month)
df['day'] = df['date'].apply(get_day)
df[df['peak_position'] == 1].drop_duplicates(subset = 'song', keep = 'first')
def try_to_int(df):
for col in df.columns:
try:
df[col] = df[col].astype('Int64')
except:
pass
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame( {'place': [1, 2, 3, 4, 5],
'decade': ['2010-2020', '1900-1910', '1970-1980', '2000-2010', '1960-1970'],
'perc_of_5star': [2.3, 1.379, 1.179, 1.176, 1.133]})
df.sort_values(by='perc_of_5star', ascending=True).plot(kind='barh', x='decade', y='perc_of_5star', legend=False)
plt.show()
import pandas as pd
import numpy as np
df = pd.DataFrame({'name': ['Taylor Swift', 'Imagine Dragons', 'Ed Sheeran'],
'year': [2008, 2012, 2014]})
def decade_equals(year):
return year // 10 * 10
df.groupby(decade_equals)['name'].apply(list)
stnums = ['4004']
students = [
['0001', 'Antonov', 'Anton', 'Igorevich', '20.08.2009', 'BST161'],
["1102", "Bogov", "Artem", "Igorevich", "25.01.2010", "BST162"]
["0333", "Glagoleva", "Anastasiya", "Nikolaevna", "11.07.2009", "BST163"]
["4004", "Stepanova", "Natalia", "Aleksandrovna", "13.02.2008", "BST161"]
["0045", "Bokov", "Igor", "Kharitonovich", "02.06.2009", "BST161"],
["0096", "Vasil'kov", "Valentin", "Sergeevich", "20.03.2009", "BST164"],
["0607", "Siropova", "Violetta", "Eduardovna", "28.05.2010", "BST162"],
["4004
def split_func(a):
import json
import re
a = json.loads(a.strip())
purchases = {}
for i, line in enumerate(a):
line = re.split(r",|:", str(line))
keys = line[1]
values = line[3]
purchases[keys] = values
return purchases
split_func(a)
def multiple_of_index(arr):
    return [num for i, num in enumerate(arr) if i != 0 and num % i == 0]
def value_counts(df, year):
    rated_5 = df[df['rating'] == 5.0]
    if not rated_5.empty:
        return rated_5['year'].value_counts()
    else:
        return "no movie with 5.0 rating in this year"
import pandas as pd
news = pd.read_csv('https://raw.githubusercontent.com/ml-mipt/ml-mipt/basic/homeworks/Lab1_python_basics/news_sample.csv')
news[news['news_title'].str.contains('[0-9]{8}-[a-z]+')]
# First solution
def transpose(matrix):
new_matrix = [[] for _ in range(len(matrix[0]))]
for i in range(len(matrix[0])):
for j in range(len(matrix)):
new_matrix[i].append(matrix[j][i])
return new_matrix
def NAWE(country):
NA = ['Canada', 'United States']
WE = ['United Kingdom', 'Germany', 'Netherlands']
if country in NA or country in WE:
return False
return True
df = df[df['birth_country'].apply(NAWE)]
def guessBlue(blueStart, redStart, bluePulled, redPulled):
return (blueStart - bluePulled) / (blueStart - bluePulled + redStart - redPulled)
guessBlue(5, 5, 2, 3)
def load_json(line):
import json
from json.decoder import JSONDecodeError
try:
line = json.loads(line)
except JSONDecodeError:
line = {}
return line
f = open('purchase_log.txt', encoding='utf-8')
purchases = {}
for i, line in enumerate(f):
line = line.strip().split(',')
keys = line[1]
values = line[3]
purchases[keys] = values
def rate_group(x):
    x = float(x.replace('%', ''))
    if x > 15.00:
        return '>15'
    if (x >= 10.00) and (x <= 15.00):
        return '10-15'
    return '<10'
df['rate_group'] = df['int_rate'].apply(lambda x: rate_group(x))
from functools import reduce
def find_difference(a, b):
return abs(reduce(lambda x,y: x*y, a) - reduce(lambda x,y: x*y, b))
def year_leaders(dfp):
dfp_copy = dfp.copy()
dfp_copy.sort_values(by=['chart_debut','num_of_hits'], ascending=False, inplace=True)
dfp_copy.drop_duplicates(subset='chart_debut', keep='first', inplace=True)
dfp_copy.set_index('chart_debut', inplace=True)
return dfp_copy
df_new.groupby(['route', 'incident_type']).size().sort_values(ascending=False).head()
def group_by_performer(df):
# make a copy of df
df_copy = df.copy()
# sort by performer
df_copy.sort_values(by='performer', inplace=True)
# group by song and take the unique performers
df_copy['hits'] = df_copy.groupby('song')['performer'].transform(lambda x: ','.join(x.unique()))
# drop duplicates and reset index
df_copy.drop_duplicates(subset='performer', inplace=True)
df_copy.reset_index(inplace=True)
return df_copy
group_by_performer(df)
import numpy as np
def sum_matrix(N):
my_matrix = np.diag(np.arange(N, 0, -1))
return my_matrix.sum()
sum_matrix(5)
sum_matrix(10)
sum_matrix(15)
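As a reference for the calls above: np.diag(np.arange(N, 0, -1)) puts N, N-1, ..., 1 on the main diagonal, so the sum is N * (N + 1) / 2.
# Expected results: sum_matrix(5) -> 15, sum_matrix(10) -> 55, sum_matrix(15) -> 120.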
def create_scaled_data(sl, sw):
scaled_data_1 = (sl - min(sl))/(max(sl) - min(sl))
scaled_data_2 = (sw - min(sw))/(max(sw) - min(sw))
scaled_data = pd.DataFrame({'sl': scaled_data_1, 'sw': scaled_data_2})
return scaled_data
scaled_data = create_scaled_data(sl, sw)
question6 = 'What is the most preferred working environment for you.'
question6 = df[question6].value_counts()
label = question6.index
counts = question6.values
fig = px.bar(x=label, y=counts, orientation='h')
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
fig.show()
df.sort_values('name', ascending=False)
def sort_df(df):
return df.groupby('decade').rating.value_counts().sort_index()
def add(arr):
res = 0
for num in arr:
res += num
return res
add([2, 1, 10, 5])
def vowel_2_index(inp):
for i in range(len(inp)):
if type(inp[i]) == int:
if inp[i] == 97:
inp[i] = "a"
elif inp[i] == 118:
inp[i] = "u"
elif inp[i] == 105:
inp[i] = "i"
elif inp[i] == 111:
inp[i] = "o"
elif inp[i] == 101:
inp[i] = "e"
return inp
inp = [118, "u",120,121,"u",98,122,"a",120,106,104,116,113,114,113,120,106 ]
vowel_2_index(inp)
#df_new.route.value_counts()
def df_count(dataframe, column):
return dataframe[column].value_counts()
df_count(df_new, 'route')
def get_ingredients(dish, portions):
recipes = cook_book.get(dish, [])
if not recipes:
print('No such dish')
return
print(f'You need to cook {dish} for {portions} portions')
for recipe in recipes:
print(f'{recipe["ingridient_name"]}: {recipe["quantity"] * portions} {recipe["measure"]}')
def remove_percent(row):
if row.endswith("%"):
return float(row.replace("%", ""))
else:
return float(row)
df.rate_group = df.rate_group.map(remove_percent)
import pandas
df.sort_values(by=['time_on_chart', 'max'], ascending=False).head(20)
import pandas as pd
df = pd.DataFrame(data, columns=['performer', 'hits', 'chart_debut'])
def divide_decade(df, decade):
return df[(df['chart_debut'] >= int(decade[0:4])) & (df['chart_debut'] <= int(decade[5:9]))]
divide_decade(df, '1980-1990')['performer']
def goes_after(word, first, second):
    for i in range(len(word) - 1):
        if word[i] == first and word[i+1] == second:
            return True
    return False
import numpy as np
def create_array(N):
return np.arange(N-1, -1, -1)
create_array(10)
def get_movie_rating():
df = pd.read_csv(MOVIE_DATA_FILE_PATH)
df2 = df.groupby(['title'])['rating'].mean().round(1)
return df2
get_movie_rating()
def search_and_replace(lst, new_id):
for i in lst:
if i[0] == new_id:
i[0] = '9090'
print(lst)
search_and_replace(lst, '4004')
from math import sqrt
def distance(a, b):
return sqrt((a.x - b.x)**2 + (a.y - b.y)**2)
import math
class Point:
def __init__(self, x, y):
self.x = x
self.y = y
def distance(a, b):
return round(math.sqrt((b.x - a.x)**2 + (b.y - a.y)**2), 6)
def upgrade(num):
    money = 0
    successes = 0
    if num == 777: money += 200; successes += 1
elif num == 999: money += 100; successes += 1
elif num == 555: money += 50; successes += 1
elif num == 333: money += 15; successes += 1
elif num == 111: money += 10; successes += 1
elif ends77(num) == True: money += 5; successes += 1
elif ends7(num) == True: money += 3; successes += 1
elif ends00(num) == True: money += 2; successes += 1
elif ends0(num) == True: money += 1; successes += 1
else: money -= 1
return money, successes
import pandas as pd
df = pd.read_csv('https://stepik.org/media/attachments/course/4852/accountancy.csv')
def show_all_yandex(df):
return df.loc[df['Executor'] == 'Yandex']
def func(delta, sigsqr, conf):
return round(sigsqr * conf ** 2 / delta ** 2)
def distinct(seq):
return list(set(seq))
def make_dict(my_list):
result = {}
if len(my_list) > 2:
result[my_list[0]] = make_dict(my_list[1:])
else:
result[my_list[0]] = my_list[1]
return result
my_list = ['a', 'b', 'c']
print(make_dict(my_list))
def distance(a, b):
return math.sqrt((a.x - b.x) ** 2 + (a.y - b.y) ** 2)
def decade_of_the_movie(movie_rating):
if movie_rating > 5:
print("Most movies were in the decade")
elif movie_rating <= 4:
print("Most movies were in the decade")
else:
print("No movies were found")
decade_of_the_movie(5)
import pandas as pd
from pandas import DataFrame
sl = [[-0.90068117],
[-1.14301691],
[-1.38535265],
[-1.50652052],
[-1.02184904],
[-0.53717756],
[-1.50652052],
[-1.02184904],
[-1.74885626],
[-1.14301691]]
sw = [[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.]]
data = {'SL': sl, 'SW': sw}
print(pd.DataFrame(data))
def del_col(data_frame):
data_frame.drop(['spi_rank', 'country'], axis=1, inplace=True)
return data_frame
def duplicates(data):
#your code here
result = duplicates(data)
def group_list(grnum):
group_list = []
for key in dct:
        if grnum in dct[key]:
group_list.append(' '.join(dct[key][0:3]))
group_list.sort()
return group_list
group_list('BST161')
df.loc[lambda x: (x["revenue"] > 40000) & (x["total_rooms"] < 30)]
def check_id(lst):
    count = 0
    for i in set(lst):
        if lst.count(i) > 1:
            print(i)
            count += 1
    if count == 0:
        print("Non-repeating elements")
df.plot(kind='box', subplots=True, layout=(4,2), sharex=False, sharey=False, figsize=(10, 10))
plt.show()
def find_questions(df):
df2 = df[df == '?']
return df2
find_questions(data)
data = [['2010-05-21',100000], ['2011-01-30',50000], ['2011-03-16',100000], ['2011-04-11',5000000], ['2011-04-16',50000], ['2011-04-18',50000], ['2011-05-12',100], ['2011-06-23',1000], ['2011-06-26',5000], ['2011-06-29',1000000], ['2011-07-10',5000000], ['2011-09-20',50000], ['2011-09-22',1000000]]
df = pd.DataFrame(data, columns = ['Last Updated', 'Installs'])
def date_to_year(a):
return pd.to_datetime(a).year
df['year'] = df.apply(lambda x: date_to_year(x['Last Updated']), axis=1)
# import pandas
import pandas as pd
# read csv file
df_new = pd.read_csv('file.csv')
# calculate mean of the "date" column for each "operator"
df_new.groupby('operator')['date'].mean()
gdpdiff[['Country or region', 'GDP per capita']].plot(kind='bar', x='Country or region', y='GDP per capita', color='purple')
def grouped_operators(df):
return df.groupby(["operator"])["date"].sum()
import pandas as pd
df = pd.read_csv('studentscores.csv')
def clean_data(df):
for index, row in df.iterrows():
for i, item in enumerate(row):
if type(item) == str:
if '%' in item:
df.iloc[index, i] = item.replace('%', '').replace(',', '')
else:
df.iloc[index, i] = float('NaN')
return df
clean_data(df)
def how_much_water(water, load, clothes):
return water * 1.1 ** (clothes - load)
def sum_all(arr):
return sum([i+j for i in arr for j in arr if i < j])
sum_all(arr)
def find_multiples(integer, limit):
return [integer*i for i in range(1, limit+1) if integer*i <= limit]
def words():
query = []
while True:
word = input("Enter word: ")
if word == "end":
print("Ending")
break
else:
query.append(word)
print(" ".join(query))
words()
df.loc[lambda x: (x['revenue'] > 40000) & (x['total_rooms'] < 30)]
import pandas as pd
import re
s = pd.Series(['10','78','54','GOOD','64','23'])
def find_non_numbers(s):
    return s[~s.str.match(r'^\d+$')]
find_non_numbers(s)
def sale_hotdogs(n):
if n < 5:
return n * 100
elif n < 10:
return n * 95
else:
return n * 90
df = pd.read_csv('customers.csv')
df.sort_values(by='name', inplace=True)
#df[(df['name'].duplicated(keep=False))].sort_values(by='name', inplace=True)
df[(df['name'].duplicated(keep=False))].sort_values(by='name').head(10)
df=pd.read_clipboard()
df2 = df[df=='?']
print("the original dataframe:")
print(df)
print("the new dataframe:")
print(df2)
def mean_str_len(data, column1, column2):
# data - a dataframe
# column1 - a string, the name of a column
# column2 - a string, the name of another column
# YOUR CODE HERE
...
def get_needed_posts(query):
    site = pd.DataFrame(columns=['date', 'title', 'link'])
for q in query:
URL = parseurl+'search/'
params = {
'q': q
}
req = requests.get(URL, params=params)
time.sleep(0.3)
soup = BeautifulSoup(req.text)
articles = soup.find_all('article', class_='tm-articles-list__item')
for article in articles:
try:
title = article.find('h2', class_='tm-article').text
date = article.find('span', class_='tm-article').text.strip()
link = article.find('h2', class_='tm-article').find('a').get('href')
                if title not in site.title.values and link not in site.link.values:
                    row = {'date': date, 'title': title, 'link': 'https://habr.com'+link}
                    site = pd.concat([site, pd.DataFrame([row])], ignore_index=True)
            except AttributeError:
                continue
    return site
from scipy.stats import norm
def confidence_interval(n, mean, sig, conf):
sig = sig / (n ** (1/2))
left = round(mean - sig * norm.ppf((1+conf)/2))
right = round(mean + sig * norm.ppf((1+conf)/2))
return right - left
from scipy.stats import t
def interval(n, mean, sig, conf):
h = sig * t.ppf((1 + conf) / 2, n - 1)
return round(mean - h), round(mean + h), round(mean)
interval(n=20, mean=3.3, sig=2.4, conf=0.95)
import pandas as pd
df = pd.DataFrame({'time_on_chart': [1,1,1,1],
'max': [11,10,1,20]}, index=['"Groove" Holmes', '"Little" Jimmy Dickens', '"Pookie" Hudson', '"Weird Al" Yankovic'])
df.sort_values(['time_on_chart', 'max'], ascending=False)
import numpy as np
def dia(N):
my_matrix = np.diag(np.arange(N), k=-1)
return np.trace(my_matrix)
print(dia(4))
print(dia(12))
print(dia(15))
def fill_na_by_corr(df, col_name='rectal_temp'):
df_temp = df[df.columns[df.isna().any()].tolist()].drop(['outcome'], axis = 1)
df_temp = df_temp.dropna(subset = [col_name])
cor = df_temp.corr()[[col_name]].drop([col_name])
for i in cor.index:
if cor.loc[i, col_name] > 0:
df[i] = df[i].fillna(value = df[i].mean() * cor.loc[i, col_name])
else:
df[i] = df[i].fillna(value = df[i].mean() / cor.loc[i, col_name])
return df
def add(a, b):
return a + b
add(1, 2)
def gen_id(row):
return str(row.id) + '-' + str(row.hour)
df['id'] = df.apply(gen_id, axis=1)
def rounder(x):
if x < 1:
return 1
if x > 4:
return 4
else:
return round(x)
df2['respiratory_rate'] = df2['respiratory_rate'].apply(rounder)
def is77(x):
str_x = str(x)
if str_x[-2:] == '77' and str_x != '777':
return True
else:
return False
is77(77)
def replace_non_numbers(df, col):
df[col] = df[col].replace(df[col][df[col].str.contains('[^0-9]')], np.nan)
def collatz(number):
if number % 2 == 0:
print(number // 2)
return number // 2
elif number % 2 == 1:
result = 3 * number + 1
print(result)
return result
n = input("Enter number: ")
while n != 1:
n = collatz(int(n))
import re
def convert_size(size):
if size[-1] == 'M':
return int(float(size[:-1]) * 1000000)
elif size[-1] == 'k':
return int(float(size[:-1]) * 1000)
else:
return int(size)
replaces = lambda x: convert_size(x)
data['Size'] = data['Size'].apply(convert_size)
df[df['text'].str.contains('\d{8}-')]
def my_fun(df, var0, var1, var2):
return df.groupby([var0, var1])[var2].max().reset_index()
my_fun(df, 'traffic_source', 'region', 'source_type')
df = pd.DataFrame(data = [[1, 31, 2.5, 1260759144], [1, 1029, 3.0, 1260759179], [1, 1061, 3.0, 1260759182], [1, 1129, 2.0, 1260759185], [1, 1172, 4.0, 1260759205]],
columns = ['userId', 'movieId', 'rating', 'timestamp'])
def average_lifetime(df):
'''
df: input dataframe
'''
df_max = df.groupby(['userId']).max()
df_min = df.groupby(['userId']).min()
df_final = pd.merge(df_max, df_min, on = ['userId'], suffixes = ('_max', '_min'))
df_final['average_lifetime'] = df_final['timestamp_max'] - df_final['timestamp_min']
return df_final
df = average_lifetime(df)
def round(a):
return int(a + 0.5)
round(2.5)
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Initialize the figure
f, ax = plt.subplots(figsize=(6.5, 6.5))
# Load the example car crash dataset
crashes = sns.load_dataset("car_crashes").sort_values("total", ascending=False)
# Plot the total crashes
sns.set_color_codes("pastel")
sns.barplot(x="total", y="abbrev", data=crashes,
label="Total", color="b")
# Plot the crashes where alcohol was involved
sns.set_color_codes("muted")
sns.barplot(x="alcohol", y="abbrev", data=crashes,
label="Alcohol-involved", color="b")
# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, 24), ylabel="",
       xlabel="Automobile collisions per billion miles")
sns.despine(left=True, bottom=True)
df['country'] = df['country'].astype(str)
def get_dataframe():
sl = [1, 2, 3, 4, 5]
sw = [6, 7, 8, 9, 10]
import pandas as pd
return pd.DataFrame(data = [sl, sw], index = ['sl', 'sw']).transpose()
get_dataframe()
def find_difference(a, b):
return max(a) * max(b)
find_difference([1, 2, 3], [4, 5, 6])
plt.ylim(3,5)
def year_leaders(dfp):
dfp_copy = dfp.copy()
dfp_copy.sort_values(by=['chart_debut','num_of_hits'], ascending=False, inplace=True)
dfp_copy.drop_duplicates(subset='chart_debut', keep='first', inplace=True)
dfp_copy.set_index('chart_debut', inplace=True)
return dfp_copy
year_leaders(dfp)
# df2 Pandas dataframe is given
def fillna_median(df2):
df2['rectal_temp'] = df2['rectal_temp'].fillna(df2['rectal_temp'].median())
return df2
fillna_median(df2)
def colmax(df):
maxvalues = []
for col in df.columns:
maxvalues.append(df[col].max())
return pd.DataFrame(maxvalues, index=df.columns).T
from datetime import datetime as dt
from datetime import timedelta as td
def date_range(start_date, end_date):
    result = []
    if start_date > end_date:
        return result
    start = dt.strptime(start_date, '%Y-%m-%d')
    end = dt.strptime(end_date, '%Y-%m-%d')
    while start <= end:
        result.append(start.strftime('%Y-%m-%d'))
        start += td(days=1)
    return result
date_range('1992-09-01', '1992-09-04') # ['1992-09-01', '1992-09-02', '1992-09-03', '1992-09-04']
from math import factorial
def wilson_primes(p):
if (factorial(p-1)+1)%(p*p) == 0 and p>1:
return True
return False
df.groupby('song').first().reset_index()
import pandas as pd
import re
df = pd.DataFrame({'loan_amnt': [5000, 2500, 2400, 10000, 3000, 5000, 7000], 'int_rate': ['10.65%', '15.27%', '15.96%', '13.49%', '12.69%', '7.90%', '15.96%']})
df['rate_group'] = None
def rate_group(a):
if a > 15.00:
return '>15'
elif a >= 10.00 and a <= 15.00:
return '10-15'
else:
return '<10'
df['rate_group'] = df['int_rate'].apply(lambda x: (re.sub('%', '', x)))
df['rate_group'] = df['rate_group'].apply(lambda x: float(x))
df['rate_group'] = df['rate_group'].apply(rate_group)
display(df)
my_string = "How much is this going to cost?"
# returns false
"%" in my_string
# returns true
"%" in "Price: $100%"
def howManyLightSabersDoYouOwn(name="anyone else"):
if name == "Zach":
return 18
else:
return 0
def average_temperature(df):
    avg_temp_by_country = df.groupby(['country', 'decade'])['av_temp'].mean().reset_index()
coldest_countries = avg_temp_by_country.sort_values('av_temp', ascending = True).head(20)
coldest_countries_list = list(coldest_countries['country'])
return avg_temp_by_country, coldest_countries_list
import uuid
df_new['id'] = df_new.apply(lambda row: uuid.uuid4(), axis=1)
def max_key(dct, key):
#todo
from math import sqrt
def interval(n, mean, sig, conf):
t = 1.96 # for conf = 0.95
h = t * sig/sqrt(n)
return round(h)
interval(100, 6, 2, 0.95)
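Checking the call above by hand: h = 1.96 * 2 / sqrt(100) = 0.392, so interval(100, 6, 2, 0.95) returns round(0.392) = 0.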
import pandas as pd
df = pd.read_csv('ratings.csv')
grouped = df.groupby('userId')
top_users = grouped.filter(lambda x: len(x) >= 100)
def multiply_cook_book(portions):
new_book = {}
for key, value in cook_book.items():
new_book[key] = []
for d in value:
new_book[key].append({'ingridient_name': d['ingridient_name'],
'quantity': d['quantity'] * portions,
'measure': d['measure']})
return new_book
print(multiply_cook_book(2))
import re
def get_year(row):
    match = re.search(r'\((\d{4})\)', row['title'])
    return match.group(1) if match else None
df.apply(get_year, axis=1)
import pandas as pd
def find_us(val):
if val.startswith('agg'):
val = 'aggregators'
elif val.startswith('vk_adv') or val.startswith('facebook_adv') or val.startswith('instagram_adv') or val.startswith('telegram_adv'):
val = 'social'
return val
df = pd.read_csv('/datasets/how_find_us.csv')
df['how_find_us'] = df['how_find_us'].apply(find_us)
df['how_find_us'].value_counts()
def year_leaders(df):
leders = df.groupby(by=['chart_debut']).agg({'num_of_hits': 'max'}).reset_index()
print(leders)
return df.merge(leders, left_on=['chart_debut', 'num_of_hits'], right_on=['chart_debut', 'num_of_hits'], how='inner')
year_leaders(df)
def group_by_performer(df):
pass
data = loadmovies()
pdata = pd.DataFrame(data)
grouped = pdata.groupby('userId')['timestamp'].agg(['max', 'min'])
grouped['diff'] = grouped['max'] - grouped['min']
grouped[pdata.groupby('userId')['rating'].count() > 100].mean()
def add_x_axis_labels(fig, x, labels):
fig.update_layout(
xaxis = dict(
tickmode = 'array',
tickvals = x,
ticktext = labels
)
)
return fig
add_x_axis_labels(fig, counts, label)
def show_bar_plot(x, y, title):
fig = px.bar(x=x, y=y, orientation='h')
fig.update_layout(title_text=title)
fig.show()
show_bar_plot(counts, label, 'Your title')
def get_id(x):
return x['user_id'] + '_' + x['item_id']
df['id'] = df.apply(get_id, axis=1)
def get_year(string):
return string.split()[-1].strip(')')
get_year('Pulp Fiction (1994)')
def is_numeric(s):
return bool(re.search(r'^(\d+)$', s))
df1 = pd.DataFrame({"Reviews": ["1", "2", "3", "4", "5", "apple", "orange"]})
df1[~df1["Reviews"].apply(is_numeric)]
import math
class Point:
def __init__(self, x, y):
self.x = x
self.y = y
def distance(a, b):
return math.sqrt((a.x - b.x)**2 + (a.y - b.y)**2)
test.assert_approx_equals(distance(Point(0,0), Point(0,1)), 1)
test.assert_approx_equals(distance(Point(0,0), Point(1,0)), 1)
test.assert_approx_equals(distance(Point(0,0), Point(3,4)), 5)
def how_much_water(water, load, clothes):
if clothes <= load:
return water
else:
return water * 1.1 ** (clothes - load)
def find_duplicates(lst):
stnums = []
cnt = {}
for el in lst:
stnums.append(el[0])
for i in stnums:
if i in cnt:
cnt[i] += 1
else:
cnt[i] = 1
return cnt
def change_duplicates(lst, replacing_num):
    duplicates = find_duplicates(lst)
    stnums = [k for k, v in duplicates.items() if v > 1]
    for student in lst:
        if student[0] in stnums:
            student[0] = replacing_num
    return lst
def guess_blue(blue_start, red_start, blue_pulled, red_pulled):
total_start = blue_start + red_start
total_pulled = blue_pulled + red_pulled
return (blue_start - blue_pulled) / (total_start - total_pulled)
def first(seq, n=1):
if n == 0:
return []
else:
return seq[:n]
lst = [['a', 1, 3], ['b', 2, 4], ['c', 5, 6]]
{el[0]: el[1:] for el in lst}
fig = px.bar(x=counts, y=label, orientation='h')
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
fig.show()
df_new.groupby(['route', 'incident_type']).size()
def filter_1980(df):
    return df[df["year"] > 1980]
def avg_temp(df):
return df["av_temp"].mean()
def list_coldest(df, n=20):
    df = df.sort_values(by="av_temp")
    return list(df.head(n)["country"])
new_df = filter_1980(df)
avg_temp(new_df)
list_coldest(new_df)
def add_Columns(df):
df.columns = ['A', 'B', 'C', 'D']
return df
add_Columns(df)
import matplotlib.pyplot as plt
fig, axes = plt.subplots(nrows=1, ncols=2)
df.plot(kind='scatter',
x='User_Score',
y='Global_Sales',
ax=axes[0])
df.plot(kind='scatter',
x='Critic_Score',
y='Global_Sales',
ax=axes[1])
plt.show()
def replace_non_numbers(df, column):
df[column] = df[column].astype(str).str.replace('[^0-9]', '')
return df[column].astype(float)
#replace_non_numbers(df, 'loan_amnt')
# pandas dataframe
df = pd.DataFrame({'name': ['Ksenia Rodionova',
'Ulyana Selezneva',
'Konstantin Prokhorov',
'Petrov Vladimir',
'Arina Selivanova',
'Artur Petrov',
'Ivan Sidorov',
'Ksenia Rodionova',
'Ksenia Rodionova'],
'date': ['2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01']})
df.head(10)
df_new = df.loc[df.duplicated(subset=['name'], keep=False), :]
df_new
def multiple_of_index(arr):
    if len(arr) <= 1:
        return []
    else:
        new_arr = []
        for i in range(1, len(arr)):
            if arr[i] % i == 0:
                new_arr.append(arr[i])
        return new_arr
def vertical_bar_chart(question):
question = data[question].value_counts()
label = question.index
counts = question.values
colors = ['gold', 'lightgreen']
fig = go.Figure(data=[go.Bar(x=label, y=counts, marker_color=colors)])
fig.update_layout(title_text=question)
fig.show()
vertical_bar_chart(question6)
import json
def normal_split(data):
return data.split(',')
purchases = {}
for i, line in enumerate(f):
line = json.loads(line.strip())
keys = line['user_id']
values = line['category']
purchases[keys] = values
def replace_number(lst):
for row in lst:
if row[0] == "4004":
row[0] = "9090"
return lst
def decade(year):
if year >= 1900 and year <= 1910:
return "1900-1910"
elif year > 1910 and year <= 1920:
return "1910-1920"
elif year > 1920 and year <= 1930:
return "1920-1930"
elif year > 1930 and year <= 1940:
return "1930-1940"
elif year > 1940 and year <= 1950:
return "1940-1950"
elif year > 1950 and year <= 1960:
return "1950-1960"
elif year > 1960 and year <= 1970:
return "1960-1970"
elif year > 1970 and year <= 1980:
return "1970-1980"
elif year > 1980 and year <= 1990:
return "1980-1990"
elif year > 1990 and year <= 2000:
return "1990-2000"
elif year > 2000 and year <= 2010:
return "2000-2010"
elif year > 2010 and year <= 2020:
return "2010-2020
def round_to_nearest(a):
return round(a)
round_to_nearest(2.5)
def rate_group(int_rate):
int_rate = re.sub('%', '', int_rate)
int_rate = float(int_rate)
if int_rate > 15.0:
return '>15'
elif 10.0 < int_rate <= 15.0:
return '10-15'
else:
return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
def is_opposite(s1, s2):
if not s1 and not s2:
return False
if s1 and s2:
return s1.swapcase() == s2
return False
def rand_seq(length):
import random
seq = ""
while True:
if "3" in seq and "7" in seq:
break
seq = "".join([str(random.randint(0, 9)) for num in range(length)])
return seq
rand_seq(15)
def plot_difference_by_hotel(df):
sns.factorplot(
x='date', y='difference', col='hotel',
data=df, kind='bar')
plot_difference_by_hotel(hotels_rev)
def num_of_hits(df):
df['num_of_hits'] = df.hits.apply(lambda x: len(x.split(', ')))
return df.sort_values(by='num_of_hits', ascending=False)
num_of_hits(df)
def find_max(dictionary):
"""
Dictionary -> String
:param dictionary: example {'AUD': {'ID': 'R01010',
'NumCode': '036',
'CharCode': 'AUD',
'Nominal': 1,
'Name': 'Australian Dollar',
'Value': '46.9983,
'Previous': 45.9496},
'AZN': {'ID': 'R01020A',
'NumCode': '944',
'CharCode': 'AZN',
'Nominal': 1,
'Name': 'AZN',
'Value': 41.4856,
'Previous': 40.5904},
'GBP': {'ID': 'R01035',
'NumCode': '826',
'CharCode': 'GBP',
'Nominal': 1,
'Name': 'Pound Sterling United Kingdom',
                'Value': 85. ...}}
    """
    return max(dictionary, key=lambda k: dictionary[k]['Value'])
def dish(dish, portions):
cook_book = {
'пицца': [
{'ingridient_name': 'сыр', 'quantity': 20, 'measure': 'гр'},
{'ingridient_name': 'колбаса', 'quantity': 30, 'measure': 'гр'},
{'ingridient_name': 'бекон', 'quantity': 30, 'measure': 'гр'},
{'ingridient_name': 'оливки', 'quantity': 10, 'measure': 'гр'},
],
'лимонад': [
{'ingridient_name': 'лимон', 'quantity': 1, 'measure': 'шт'},
{'ingridient_name': 'вода', 'quantity': 200, 'measure': 'мл'},
{'ing
def find_obj_str(df):
obj_str = df.select_dtypes(['object', 'str'])
return obj_str
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
ax[0].scatter(df['User_Score'], df['Global_Sales'])
ax[1].scatter(df['Critic_Score'], df['Global_Sales'])
import pandas as pd
df = pd.DataFrame(
{'id': [4728, 35638, 21445, 40291, 29462, 46978, 42931, 38670, 25506, 43989],
'title': ['Bad Education (2019)', 'Palooka (1934)', 'High Moon (2019)', 'Saint Maud (2019)',
'Mad at the Moon (1992)', 'The Butterfly Ball (1977)',
'Snowboarďáci (2004)', 'Recon 2020: The Caprini Massacre (2004)',
'Karagoez catalogo 9,5 (1983)', 'Starting a Skyscraper (1902)'],
'rating': [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
'decade': ['2010-2020', '1934', '2010-2020', '2010-2020',
'1990-2000', '1970-1980', '
def group(df):
return df.groupby(['performer'])['song'].apply(', '.join).reset_index()
def get_source_type(traffic_source, region):
return df[(df.traffic_source == traffic_source) & (df.region == region)].source_type.values
get_source_type('twitter', 'east')
def make_numeric(df):
for col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce')
return df
def sort_by(df, column, ascending=True):
return df.sort_values(column, ascending=ascending)
def multiply(dictionary, key, portions):
    for i in dictionary[key]:
        i['quantity'] = i['quantity'] * portions
    return dictionary
multiply(cook_book, 'salad', 5)
year_leaders = df.groupby('chart_debut').num_of_hits.transform(max) == df['num_of_hits']
df[year_leaders][['performer', 'num_of_hits']]
def to_binary(n):
if n < 2:
return str(n)
else:
return to_binary(n // 2) + str(n % 2)
import pandas as pd
df_ru = pd.read_csv('../data/ft_ru.csv', parse_dates=[0], index_col=[0], dayfirst=True)
def hot_years(df):
df['year'] = df.index.year
hot_years = df.resample('A').mean()
hot_years = hot_years[hot_years['av_temp'] > 15]
return hot_years
hot_years(df_ru)
from sklearn.preprocessing import StandardScaler
def scale(sl, sw):
scaler = StandardScaler()
return scaler.fit_transform(sl, sw)
scale(sl, sw)
def replace(stnum, students, replacing_num):
result = students.copy()
for elem in students:
if elem[0] == stnum:
for i in range(1, len(elem)):
elem[i] = replacing_num
return result
stnums = ['4004']
students = [
['0001', 'Antonov', 'Anton', 'Igorevich', '20.08.2009', 'BST161'],
["1102", "Bogov", "Artem", "Igorevich", "25.01.2010", "BST162"]
["0333", "Glagoleva", "Anastasiya", "Nikolaevna", "11.07.2009", "BST163"]
["4004", "Stepanova", "Natalia", "Aleksandrovna", "13.02.2008", "BST161"]
["0045", "Bokov", "Igor", "Kharitonovich", "02.06.2009", "BST161
def avgrating(df):
#your code here
return df
def same_case(a, b):
if a.isalpha() and b.isalpha():
if a.islower() and b.islower():
return 1
elif a.isupper() and b.isupper():
return 1
else:
return 0
else:
return -1
same_case('a', 'g') # 1
same_case('A', 'C') # 1
same_case('b', 'G') # 0
same_case('B', 'g') # 0
same_case('0', '?') # -1
def hot_years(df):
hot_years=df.groupby('year').mean()
hot_years=hot_years.loc[hot_years.av_temp>15]
return hot_years
hot_years(df_ru)
def duplicates(df):
return df[df.name.duplicated(keep=False)].sort_values('name')
duplicates(df)
def search_goods():
global directories
input_number = input('Input directory number: ')
for key, value in directories.items():
for item in value:
if input_number == item:
return key
return 'Nothing found'
import pandas as pd
import numpy as np
r_cols = ['userId','movieId','rating','timestamp']
ratings = pd.read_csv('ml-latest-small/ratings.csv',usecols=r_cols)
#your code here
ratings
def convert_to_year(x):
try:
date = pd.to_datetime(x)
return date.year
except:
return None
df['year'] = df.date.apply(convert_to_year)
def to_binary(n):
#your code here
return bin(n)
def count_list(list):
count_list = {}
for i in list:
if i in count_list:
count_list[i] += 1
else:
count_list[i] = 1
return count_list
count_list(a)
def transpose(matrix):
'''
Transpose a matrix
'''
matrix_t = []
for i in range(len(matrix[0])):
row = []
for j in range(len(matrix)):
row.append(matrix[j][i])
matrix_t.append(row)
return matrix_t
def change_quantity(ingridient, portions):
return ingridient['quantity'] * portions
change_quantity(ingridient, portions)
import plotly.express as px
def horizontal_bar(question, title):
question = df[question].value_counts()
label = question.index
counts = question.values
fig = px.bar(x=label, y=counts, orientation='h')
fig.update_layout(title_text=title)
fig.show()
horizontal_bar('What is the most preferred working environment for you?', 'Какая рабочая среда для вас наиболее предпочтительна?')
data = data.sort_values(by = ['max'], ascending = False).head(20)
data
import pandas as pd
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols,encoding='latin-1')
ratings = ratings.drop('unix_timestamp', axis=1)
ratings.head()
import re
def rate_group(int_rate):
int_rate = re.sub('%', '', int_rate)
int_rate = float(int_rate)
if int_rate > 15.0:
return '>15'
elif 10.0 < int_rate <= 15.0:
return '10-15'
else:
return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
def change_shelf(data):
    docnum = '11-2'
    shelf = '3'
    for key, val in data.items():
        if docnum in val:
            data[shelf].append(docnum)
            val.remove(docnum)
            print('OK')
            return data
    print('Error')
    return data
from scipy import stats
from math import sqrt
def interval(n, mean, sig, conf):
h = sig * stats.norm.ppf((1 + conf) / 2) / sqrt(n)
return int(2 * h)
def sum_list(lst):
sum = 0
for i in range(len(lst)):
for j in range(len(lst)):
if i != j:
sum += lst[i] + lst[j]
return sum
sum_list([2, 1, 10, 5])
df[df['loan_amnt'].str.contains('[a-z]', flags=re.IGNORECASE, regex=True)]
df[['performer', 'time_on_chart']].groupby('performer').agg({'min', 'max'}).sort_values('max', ascending = False)
class Point:
def __init__(self, x, y):
self.x = x
self.y = y
def distance(a, b):
return math.sqrt((a.x - b.x)**2 + (a.y - b.y)**2)
df.plot(kind='box', subplots=True, layout=(4,2), sharex=False, sharey=False)
plt.rcParams.update({'font.size': 14})
plt.rcParams["axes.linewidth"] = 3
plt.show()
import matplotlib.pyplot as plt
def linegraph(df):
plt.plot(df.index, df['2015'], label = "2015")
plt.plot(df.index, df['2016'], label = "2016")
plt.plot(df.index, df['2017'], label = "2017")
plt.plot(df.index, df['2018'], label = "2018")
plt.plot(df.index, df['2019'], label = "2019")
plt.legend()
plt.show()
linegraph(df)
from sklearn.metrics import f1_score
y_pred = lda_model.predict(X_val)
f1_score(y_val, y_pred)
import pandas as pd
import datetime
import numpy as np
df_ratings = pd.read_csv('C:/Users/User/Downloads/ml-latest-small/ratings.csv')
def diff_pd(x):
return x.max() - x.min()
df_ratings.groupby('userId')['timestamp'].agg([diff_pd]).mean()
def count5_decade(df):
df = df[df.rating == 5.0]
return df.decade.value_counts()
count5_decade(ratings)
def move(directories, docnum, shelf):
    # move the document docnum to the given shelf
    dic = directories
    if shelf not in dic:
        dic[shelf] = []
    for key in dic:
        if docnum in dic[key]:
            dic[key].remove(docnum)
            dic[shelf].append(docnum)
            return dic
    return 'ERROR NO SUCH VALUE'
print(move(directories, '11-2', '3'))
import pandas as pd
df = pd.read_csv('data/charts.csv')
dfs = df.copy()
def chart_debut_format(chart_debut):
return chart_debut[:4]
dfs['chart_debut'] = dfs['chart_debut'].apply(chart_debut_format)
dfs.head()
def multiple_of_index(arr):
    return [num for i, num in enumerate(arr) if i != 0 and num % i == 0]
multiple_of_index([22, -6, 32, 82, 9, 25])
def merge_df(df1, df2):
# Combine the dataframes on client_id
return df1.merge(df2, on='client_id')
merge_df(rzd, auto)
df_new.id = df_new.id.astype(int)
def date_range(start_date, end_date):
lst = []
if start_date > end_date:
return []
else:
try:
start = (dt.strptime(start_date, '%Y-%m-%d'))
end = (dt.strptime(end_date, '%Y-%m-%d'))
            while start <= end:
                lst.append(start.strftime('%Y-%m-%d'))
                start += td(days=1)
            return lst
except:
return []
import math
class Point(object):
def __init__(self, x=0, y=0):
self.x = x
self.y = y
def distance(self, other):
return math.sqrt(math.pow((other.x - self.x),2) + math.pow((other.y - self.y),2))
installs.plot(kind='bar',
title='Топ 10 издателей по продажам видеоигр',
xlabel='Издатель',
ylabel='Количество игр',
logy=True)
def count(df):
return df[df == '?'].count()
def sum_all(arr):
return sum([x+y for x in arr for y in arr])
sum_all([2, 1, 10, 5])
def extract_year(title):
return title.split('(')[1].split(')')[0]
df['year'] = df['title'].apply(extract_year)
def make_list(direct):
return ','.join(list(direct.keys()))
make_list(directories)
def is_acceptable_password(password: str) -> bool:
return len(password) > 6 and any(i.isdigit() for i in password) and not any(j.isdigit() for j in password[-9:])
def sales_df(regions, na_sales, eu_sales, jp_sales, oth_sales):
return df
sales_df(regions, na_sales, eu_sales, jp_sales, oth_sales)
df.groupby('userId').filter(lambda x: len(x) >= 100)
df.groupby('userId').mean()
# import pandas
import pandas as pd
# import the data
data = pd.read_csv('https://s3.amazonaws.com/assets.datacamp.com/production/course_2023/datasets/imdb_1000.csv')
# check the data
data
# check the shape of the data
data.shape
# drop the rows with null values
data.dropna(inplace=True)
# check the shape of the data
data.shape
# find the best decade
data.groupby('decade')['rating'].mean().sort_values(ascending=False)
import math
def sample_size(delta, sigma, conf):
z = 0.5 * (1 + conf)
return math.ceil(2 * (z ** 2) * (sigma ** 2) / (delta ** 2))
sample_size(0.1, 0.3, 0.95)
def decade(x):
if x.isdigit():
decade = int(x)
decade = decade/10
decade = int(decade)
decade = decade*10
return str(decade) + "-" + str(decade+10)
else:
return float('NaN')
df['Decade of Release'] = df['Year'].apply(decade)
df
def year_leaders(dfp):
dfp_copy = dfp.copy()
dfp_copy.sort_values(by=['chart_debut','num_of_hits'], ascending=False, inplace=True)
dfp_copy.set_index('chart_debut', inplace=True)
return dfp_copy
year_leaders(dfp)
def object_finder(dataframe, column):
return dataframe[dataframe[column].apply(lambda x: type(x) == object)]
object_finder(data, 'values')
from plotly.subplots import make_subplots
fig = make_subplots(rows=1, cols=2)
fig.add_trace(px.bar(x=label, y=counts, orientation='v'), 1, 1)
fig.add_trace(px.bar(x=label, y=counts, orientation='v'), 1, 2)
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?',
showlegend=False)
fig.show()
def year_leaders(df):
df2 = df[["performer", "hits"]].groupby(df['chart_debut']).max()
return df2
def magic(arr):
result = 0
for i in range(len(arr)):
for j in range(i+1, len(arr)):
result = result + (arr[i] + arr[j])
return result
magic(arr)
def year_leaders(dfp):
dfp_copy = dfp.copy()
dfp_copy.sort_values(by=['chart_debut','num_of_hits'], ascending=False, inplace=True)
dfp_copy.drop_duplicates(subset='chart_debut', keep='first', inplace=True)
dfp_copy.set_index('chart_debut', inplace=True)
return dfp_copy
year_leaders(dfp)
def calculate_respiratory_rate(pulse, respiratory_rate):
if respiratory_rate == respiratory_rate:
return respiratory_rate
else:
return pulse/pulse + 0.5
calculate_respiratory_rate(60, np.nan)
calculate_respiratory_rate(60, 12)
# Update x axis
fig.update_xaxes(
title_text="Количество ответов",
tickvals=counts,
ticktext=label
)
# Set y-axis title
fig.update_yaxes(title_text="Ваш выбор")
fig.show()
def year_leaders(dfp):
dfp_copy = dfp.copy()
dfp_copy.sort_values(by=['chart_debut','num_of_hits'], ascending=False, inplace=True)
dfp_copy.drop_duplicates(subset='chart_debut', keep='first', inplace=True)
dfp_copy.set_index('chart_debut', inplace=True)
print(dfp_copy)
return dfp_copy
def replace_numbers(stnum, stus, repnum):
for num, stu in enumerate(stus):
if stu[0] == stnum:
stu[0] = repnum
return stus
replace_numbers('4004', students, '9090')
def check_password(string):
if "password" in string:
return True
else:
return False
check_password("asdfasdfpasswordasdfasdf")
def checkio(delta, sigsqr, conf):
# Your code here
# It's main function. Don't remove this function
# It's using for auto-testing and must return a result for check.
import scipy.stats as st
import numpy as np
n = ((st.norm.ppf((1+conf)/2))**2*sigsqr)/(delta**2)
return int(n)
# Some hints
# You can use stats.norm.ppf function for the normal distribution
#These "asserts" using only for self-checking and not necessary for auto-testing
if __name__ == '__main__':
assert checkio(0.02, 0.04, 0.95) == 491
assert checkio(0.02, 0.005, 0.95) == 4127
print("Coding complete? Click 'Check' to earn cool rewards!")
def year_leaders(df):
pass
def total_ingridients():
dish = 'salad'
portions = 5
cook_book = {'salad': [{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'g'},
{'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pcs'},
{'ingridient_name': 'cucumbers', 'quantity': 100, 'measure': 'g'}]}
print(f'{dish}')
for ing in cook_book[dish]:
print(f'{ing["ingridient_name"]}: {ing["quantity"]*portions}{ing["measure"]}')
def group_by_two_columns(dataframe, col1, col2, col3):
    return dataframe.groupby([col1, col2])[col3].value_counts()
def year_leaders(df):
return df.groupby(['chart_debut'])['num_of_hits'].max()
def is_acceptable_password(password: str) -> bool:
if len(password) > 6:
if len(password) > 9:
return True
if password.isdigit():
return False
else:
return True
else:
return False
import numpy as np
def sum_matrix(N):
my_matrix = np.diag(np.arange(N-1, -1, -1), k=0)
return sum(my_matrix.diagonal())
print(sum_matrix(5))
print(sum_matrix(10))
print(sum_matrix(15))
def highlight(df):
    styled = df.style.highlight_max(axis=1,
        props='color:white;font-weight:bold;background-color:green;')
    styled = styled.highlight_min(axis=1,
        props='color:white;font-weight:bold;background-color:brown;')
    return styled
#your code here
def new_doc_add():
docnum = input('Enter the number of your document: ')
doctype = input('Enter the type of your document: ')
docowner = input('Enter the owner of your document: ')
shelf = input('Enter the shelf number: ')
documents.append({'type': doctype, 'number': docnum, 'name': docowner})
if shelf in directories:
directories[shelf].append(docnum)
else:
directories[shelf] = [docnum]
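new_doc_add appends to module-level documents and directories objects that are not defined next to it; a minimal illustrative sketch of the assumed globals (placeholder values only, the directories dict matches the one shown earlier) is:
# Assumed global state for new_doc_add (illustrative placeholders).
documents = [
    {'type': 'passport', 'number': '2207 876234', 'name': 'Example Owner'},
]
directories = {'1': ['2207 876234', '11-2'], '2': ['10006'], '3': []}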
song performer chart_debut peak_position worst_position time_on_chart consecutive_weeks hits
Stupid Cupid Connie Francis 1958-08-02 17 72 12 11.0 Stupid Cupid
Chantilly Lace Big Bopper 1958-08-02 6 40 18 17.0 Chantilly Lace
Chantilly Lace Big Bopper 1958-08-02 6 40 19 18.0 Chantilly Lace
Chantilly Lace Big Bopper 1958-08-02 6 40 20 19.0 Chantilly Lace
Chantilly Lace Big Bopper 1958-08-02 6 40 21 20.0 Chantilly Lace
def create_sales_df(df):
return pd.DataFrame({'regions': ['North America', 'Europe', 'Japan', 'Other'],
'sales': [df['NA_Sales'].sum(), df['EU_Sales'].sum(), df['JP_Sales'].sum(), df['Other_Sales'].sum()]})
create_sales_df(df)
import re
def remove_quotes(d):
    # collect results in a new dict; adding keys to d while iterating over it raises RuntimeError
    cleaned = {}
    for k, v in d.items():
        k = re.sub(r"\"", "", k).strip()
        v = re.sub(r"\"", "", v).strip()
        cleaned[k] = v
    return cleaned
remove_quotes({' "user_id"': ' "category"}', ' "1840e0b9d4"': ' "Products"}'})
from datetime import datetime as dt, timedelta as td
def date_range(start_date, end_date):
lst = []
if start_date > end_date:
return []
else:
try:
start = (dt.strptime(start_date, '%Y-%m-%d'))
end = (dt.strptime(end_date, '%Y-%m-%d'))
while start <= end:
lst.append(start.strftime('%Y-%m-%d'))
start += td(days=1)
return lst
except:
return []
def millionaire(df):
rich = df[df.annual_inc >= 1000000]
return rich
millionaire(df)
def divide_hotels(df):
df['big_hotels'] = df.apply(lambda x: x['profit'] if x['total_rooms'] > 30 else 0, axis=1)
df['medium_hotels'] = df.apply(lambda x: x['profit'] if x['total_rooms'] <= 30 and x['total_rooms'] > 20 else 0, axis=1)
df['small_hotels'] = df.apply(lambda x: x['profit'] if x['total_rooms'] <= 20 and x['total_rooms'] > 10 else 0, axis=1)
return df
df = divide_hotels(df)
df.head()
geo_data = {'Center': ['Moscow', 'Tula', 'Yaroslavl'], 'Northwest': ['Petersburg', 'Pskov', 'Murmansk'], 'Far East': ['Vladivostok', 'Sakhalin', 'Khabarovsk']}
def geo_class(city):
for region in geo_data:
if city in geo_data[region]:
return region
geo_class('Pskov')
def my_evaluation(x_train, y_train, x_test, y_test):
# your code
return f1, precision, recall
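# A sketch of one way to fill in my_evaluation, assuming scikit-learn is available;
# the LogisticRegression model here is an assumption made for illustration, not part of the original.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
def my_evaluation_sketch(x_train, y_train, x_test, y_test):
    model = LogisticRegression(max_iter=1000)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # macro averaging so the metrics also make sense for multi-class targets
    f1 = f1_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    return f1, precision, recall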
# your code here
def count_sources_per_region(df):
return df.groupby(['region'])['traffic_source'].count()
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
df2.groupby('pulse')['respiratory_rate'].transform('median'))
# a groupby selection cannot be assigned to; round the filled column instead
df2['respiratory_rate'] = df2['respiratory_rate'].round()
def replace_num(stnums, students, replacing_num):
for num in stnums:
for st in students:
if st[0] == num:
st[1] = replacing_num
return students
replace_num(stnums, students, replacing_num)
def glue(x):
str_ = ''
for i in x:
str_+=str(i)
return str_
glue(lst)
def transpose_matrix(A):
rows = len(A)
cols = len(A[0])
B = [[0 for i in range(0, rows)] for j in range(0, cols)]
for i in range(0, rows):
for j in range(0, cols):
B[j][i] = A[i][j]
return B
def to_csv_string(array):
return '\n'.join(','.join(map(str, x)) for x in array)
class Designer(Employee):
def __init__(self, name, seniority, awards=2):
super().__init__(name, seniority)
self.intlawards = awards
def check_if_it_is_time_for_upgrade(self):
# for each accreditation, increase the counter by 1
# for now we assume that all designers are accredited
self.seniority += 1
# condition for promoting an employee from the presentation
if (self.seniority + self.intlawards) % 7 == 0:
self.grade_up()
# publication of the results
return self.publish_grade()
def find_values(df):
column = df['values']
return column.str.contains('a')
def bar_graphs(df1, df2):
plt.figure(figsize=(15,7))
plt.subplot(2,2,1)
sns.barplot(df1['city'].value_counts().index, df1['city'].value_counts().values)
plt.xticks(rotation=40)
plt.title('City')
plt.subplot(2,2,2)
sns.barplot(df2['hotel'].value_counts().index, df2['hotel'].value_counts().values)
plt.xticks(rotation=40)
plt.title('Hotel')
plt.subplot(2,2,3)
sns.barplot(df2['date'].value_counts().index, df2['date'].value_counts().values)
plt.xticks(rotation=40)
plt.title('Date')
plt.show()
def round_to_four(a, b):
return round(a + b)
round_to_four(1.2, 3.1)
def year_leaders(df):
df=df.groupby(df.chart_debut).apply(lambda x: x.sort_values('num_of_hits',ascending=False)).reset_index(drop=True)
df=df.groupby('chart_debut').head(1)
return df
year_leaders(df)
# Add a helper column for the check
df['duplicate_name'] = df['name'].duplicated()
# Inspect the column and print the flagged rows
df[df['duplicate_name'] == True]
def group_by_title(df):
return df.groupby('title')['rating'].mean()
def powers_of_two(n):
return [2**x for x in range(n+1)]
def change_col_type(df, col_type):
    try:
        return df.astype(col_type)
    except (ValueError, TypeError):
        # if the cast fails, return the frame unchanged instead of None
        return df
df2 = change_col_type(df, 'float')
def geo_class(row):
    geo_data = {'center': ['Moscow', 'Tula', 'Yaroslavl'],
                'Northwest': ['petersburg', 'pskov', 'murmansk'],
                'Far East': ['vladivostok', 'sakhalin', 'khabarovsk']}
    # iterate over key/value pairs together; walking keys and values separately
    # never matches a city to its region
    for key, cities in geo_data.items():
        if row in cities:
            return key
    return 'undefined'
df4['loyal_profit'] = df4.apply(lambda x: (x.profit / x.ocup_rooms), axis=1)
# lists trimmed to a common length of six so the frame can be constructed
df = pd.DataFrame({'grade': ['A','B','C','D','E','F'], 'rate_group': ['7-8%', '10-11%', '12-13%', '15-17%', '17-25%', '17-25%'], 'id': [1077501, 1077430, 1077175, 1076863, 1075358, 1075269]})
df.set_index(['id', 'rate_group'], inplace=True)
df.unstack(level='rate_group')
def get_quantity(cook_book, key, portions):
grams = 0
if key == 'salad':
grams = cook_book[key][0]['quantity'] * portions
print(f"{cook_book[key][0]['ingridient_name']}: {grams} {cook_book[key][0]['measure']}")
grams = cook_book[key][1]['quantity'] * portions
print(f"{cook_book[key][1]['ingridient_name']}: {grams} {cook_book[key][1]['measure']}")
grams = cook_book[key][2]['quantity'] * portions
print(f"{cook_book[key][2]['ingridient_name']}: {grams} {cook_book[key][2]['measure']}")
import pandas as pd
df = pd.DataFrame({
'user_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
'name': ['Ksenia Rodionova', 'Ulyana Selezneva', 'Konstantin Prokhorov', 'Petrov Vladimir', 'Arina Selivanova', 'Svetlana Kuznecova', 'Evgeniy Laptev', 'Ivan Ryzhkov', 'Sidorov Nikolay', 'Nikolay Ivanov', 'Natalya Volkova', 'Maksim Petrov', 'Maksim Petrov', 'Viktor Fomichev', 'Ulyana Selezneva'],
'date': ['2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-02', '2021-07-02', '2021-07-02', '
import math
def round_up(x):
if x - math.floor(x) < 0.5:
return math.floor(x)
return math.ceil(x)
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
df2.groupby('pulse')['respiratory_rate'].transform(round_up))
def get_year(title):
return int(title.split()[-1][1:-1])
df['year'] = df['title'].apply(get_year)
from functools import reduce
def find_difference(a, b):
return abs(reduce(lambda x, y: x*y, a) - reduce(lambda x, y: x*y, b))
import math
def nearest_sq(n):
return round(math.sqrt(n)) ** 2
def top20(df):
gdpdiff = pd.DataFrame({'Top1': df[['Country or region', 'GDP per capita']]\
.sort_values(by='GDP per capita', ascending=False).head(20)[0:1],
'Top20': df[['Country or region', 'GDP per capita']]\
.sort_values(by='GDP per capita', ascending=False).head(20)[19:20]})
return gdpdiff
top20(df19)
def data_clean(df):
df['chart_debut'] = df['chart_debut'].str.split('-').str[0]
return df
data_clean(df)
mean_c.sort_values(ascending=False)
import pandas as pd
ratings = pd.read_csv('ratings.csv')
def aver_lifetime(data):
data['date'] = pd.to_datetime(data['timestamp'], unit='s').dt.date
lifetimes = data.groupby('userId')['date'].agg(['min', 'max'])
lifetimes['lifetime'] = lifetimes['max'] - lifetimes['min']
return lifetimes['lifetime'].mean()
aver_lifetime(ratings)
def assignDecade(x):
    # x[:3] slicing assumes the year is stored as a string like '1985';
    # anything that is not a string becomes NaN
    if type(x) is not str:
        return np.nan
    else:
        return (str(int(x[:3])) + "0-") + (str(int(x[:3]) + 1) + "0")
df['Decade of Release'] = df['Year'].map(assignDecade)
df
def round_nearest_int(x):
return round(x)
def sample_size(error, sigsqr, conf):
    # n = z**2 * sigma**2 / error**2, with z = 1.96 for a 95% confidence level
    return (1.96**2) * sigsqr / (error**2)
def highlight_min_max(df1):
    # chain highlight_max and highlight_min on the same Styler
    return df1.style.highlight_max(axis=1,
                                   props='color:white;font-weight:bold;background-color:green;') \
                    .highlight_min(axis=1,
                                   props='color:white;font-weight:bold;background-color:brown;')
highlight_min_max(df2)
def replace_nums(students, stnums, replacing_num):
for st in students:
for stnum in stnums:
if stnum in st:
st.remove(stnum)
st.append(replacing_num)
return students
print(replace_nums(students, stnums, replacing_num))
def f(dframe):
years = dframe.columns.get_level_values(0).get_level_values(0).unique()
fig = plt.figure()
ax = fig.add_subplot(111)
for col in dframe.columns:
dframe[col].plot(kind='line', ax=ax)
plt.xticks(years)
plt.show()
f(df)
import pandas as pd
# Create DataFrame
df = pd.DataFrame({'spi_rank' : [21,19,20], 'country': ['Australia', 'Canada', 'Chile']})
def function(df):
del df['spi_rank']
del df['country']
return df
function(df)
regions = ['North America', 'Europe', 'Japan', 'Other']
sales = [4402.62, 2424.67, 1297.43, 791.34]
plt.pie(sales, labels=regions, autopct='%1.1f%%')
plt.show()
from sklearn import datasets
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def load_iris():
iris = datasets.load_iris()
return iris
iris = load_iris()
#create a list
arr = [2, 1, 10, 5]
def sum_from_list(arr):
result = []
for idx, num in enumerate(arr):
for idx_2, num_2 in enumerate(arr):
if idx_2 > idx:
result.append(num + num_2)
return result
sum_from_list(arr)
def interval(n, mean, sig, conf):
h = sig * stats.norm.ppf((1 + conf) / 2) / (n ** 0.5)
return int(round(h * 2))
import pandas as pd
import numpy as np
df = pd.DataFrame({'rate_group': ['new', 'new%', 'old', 'old%', 'new%']})
print(df)
def change_dtype(df):
for col in df.columns:
try:
df[col] = df[col].astype(float)
except:
pass
return df
df = change_dtype(df)
import matplotlib.pyplot as plt
df.plot(x='year')
plt.show()
df = pd.DataFrame([['a', 1, 2], ['a', 2, 3], ['b', 2, 3]], columns = ['performer', 'min', 'max'])
def sort_by_max(df, groupby_column, column_to_sort):
    # take the per-group maximum and sort by it
    return df[[groupby_column, column_to_sort]].groupby(groupby_column).max().sort_values(by=column_to_sort)
sort_by_max(df, 'performer', 'max')
def get_shop_list_by_dishes(dishes, person_count):
cook_book = {
'salad': [
{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'gr'},
{'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pct'},
{'ingridient_name': 'pepper', 'quantity': 20, 'measure': 'gr'}],
'cucumbers': [
{'ingridient_name': 'cucumbers', 'quantity': 20, 'measure': 'gr'},
{'ingridient_name': 'pepper', 'quantity': 8, 'measure': 'gr'},
{'ingridient_name': 'olives', 'quantity': 8, 'measure': 'gr'},
{'ingridient_name': 'olive oil', 'quantity': 30, 'measure': 'ml'}],
'olives': [
def av_revenue(df):
df['av_revenue'] = df.groupby('hotel')['revenue'].transform('mean')
return df
def difference(df):
df['difference'] = df['av_revenue'] - df['revenue']
return df
def in_percent(df):
df['in_percent'] = 100 * df['difference'] / df['av_revenue']
return df
import pandas as pd
def decade(year):
if year < 1910:
return str(year - year%10) + '-' + str(year - year%10 + 10)
elif year > 2009:
return str(year - year%10) + '-' + str(year - year%10 + 10)
else:
return str(year - year%10) + '-' + str(year - year%10 + 9)
df['Decade of Release'] = df['Year'].map(decade)
import pandas as pd
list = [['/world/'],
['/latest/'],
['/?updated=top'],
['/politics/36188461-s-marta-zhizn-rossiyan-suschestvenno-izmenitsya-iz-za-novyh-zakonov/'],
['/world/36007585-tramp-pridumal-kak-reshit-ukrainskiy-vopros/'],
['/science/36157853-nasa-sobiraet-ekstrennuyu-press-konferentsiyu-na-temu-vnezemnoy-zhizni/'],
['/video/36001498-poyavilis-pervye-podrobnosti-gibeli-natali-melamed/'],
['/world/36007585-tramp-pridumal-kak-reshit-ukrainskiy-vopros/?smi2=1'],
['/science/
def year_leaders(dfp):
dfp_copy = dfp.copy()
dfp_copy.sort_values(by=['chart_debut','num_of_hits'], ascending=False, inplace=True)
dfp_copy.drop_duplicates(subset='chart_debut', keep='first', inplace=True)
dfp_copy.set_index('chart_debut', inplace=True)
return dfp_copy
year_leaders(dfp)
def rate_group(rate):
if rate > 15.00:
return '>15'
elif rate <= 15.00 and rate > 10.00:
return '10-15'
import requests
import pandas as pd
from bs4 import BeautifulSoup
def search(query):
query = query.lower()
site = pd.DataFrame()
for page in range(0, 10):
if page == 0:
url = 'https://habr.com/ru/all/'
else:
url = 'https://habr.com/ru/all/page' + str(page) + '/'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
articles = soup.find_all('li', class_='content-list__item_post')
site = pd.concat([site, get_needed_posts(query, articles)])
return site.reset_index(drop=True)
def get_needed_posts(query, articles):
site = pd.DataFrame()
for article in articles:
title, date, link = get_article_info(article)
if not title in list(site['title']) and not link in list(site['link']):
row = {'
import pandas as pd
ratings = pd.read_csv('ratings.csv')
filtered_ratings = ratings[['title', 'rating']]
filtered_ratings = filtered_ratings.groupby('title').mean()
filtered_ratings
import pandas as pd
df = pd.read_csv('ratings.csv')
df.groupby('userId').size().to_frame('size').reset_index()
import numpy as np
def sum_matrix(N):
my_matrix = np.diag(np.arange(N-1, -1, -1), k=0)
return np.trace(my_matrix)
print(sum_matrix(5))
print(sum_matrix(10))
print(sum_matrix(15))
df = df[df.duplicated(subset=["name"], keep=False)].sort_values("name")
def create_plot(question, title, figure_template):
# question = 'What is the most preferred working environment for you.'
question6 = df[question].value_counts()
label = question6.index
counts = question6.values
fig = figure_template(x=label, y=counts)
fig.update_layout(title_text=title)
fig.show()
create_plot('What is the most preferred working environment for you.', 'Какая рабочая среда для вас наиболее предпочтительна?', px.bar)
def top(df, column_name):
return df.sort_values(by=column_name, ascending=False)[:20]
top(df, 'av_temp')
def is_month_end(date):
# Your code goes here.
is_month_end(date)
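# One possible way to fill in the stub above (a sketch; assumes `date` is a
# 'YYYY-MM-DD' string). pd.Timestamp.is_month_end also handles months that do
# not end on the 31st, unlike the string-based versions further down:
import pandas as pd
def is_month_end_sketch(date):
    return int(pd.Timestamp(date).is_month_end)
is_month_end_sketch('2021-07-31')  # -> 1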
import pandas as pd
df = pd.read_csv('news.csv', delimiter='\t')
def filter_news(news_title):
if news_title.startswith('/') and news_title.count('/')==2 and news_title[-1].isdigit() and not news_title.endswith('/'):
return True
else:
return False
df.news_title.apply(filter_news)
def panda_function(df):
    # testing a list with `in` against a string raises TypeError; check whether
    # how_find_us is one of the two loyal-customer channels instead
    df['loyal_profit'] = df.apply(lambda x: (x.profit / x.ocup_rooms) if x.how_find_us in ['regular_customer', 'by_recommendation'] else None, axis=1)
question6 = 'What is the most preferred working environment for you.'
question6 = df[question6].value_counts()
label = question6.index
counts = question6.values
fig = px.bar(x=label, y=counts)
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
fig.show()
def find_us(df):
df.loc[df.hotel=='Alpina', 'how_find_us'].value_counts()
df.loc[df.hotel=='Alpina', 'how_find_us'] = df.loc[df.hotel=='Alpina', 'how_find_us'].map(lambda x: 'aggregators' if 'agg' in x else x)
df.loc[df.hotel=='Alpina', 'how_find_us'] = df.loc[df.hotel=='Alpina', 'how_find_us'].map(lambda x: 'social' if 'facebook' in x or 'vk' in x or 'instagram' in x or 'telegram' in x else x)
return df
df_new.groupby(['route', 'incident_type']).count()['id']
df_new.groupby(['route', 'incident_type'])['id'].count()
df_new.groupby(['route', 'incident_type'])['id'].size()
df_new[df_new['incident_type'] >= 5].groupby(['route', 'incident_type']).count()['id']
df_new[df_new['incident_type'] >= 5].groupby(['route', 'incident_type']).size()
df_new[df_new['incident_type'] >= 5].groupby(['route', 'incident_type'])['id'].size()
df_new[df_new['incident_type'] >= 5].groupby(['route', 'incident_type'])['id'].size().sort_values(ascending=False)
def decade_of_release(year):
    if type(year) == int:
        if year < 1900:
            return "1800-1900"
        if year < 2010:
            start = year // 10 * 10
            return "{}-{}".format(start, start + 10)
import matplotlib.pyplot as plt
df.plot()
plt.show()
df=pd.DataFrame({'userId':[1,1,1,1,1],
'movieId':[31,1029,1061,1129,1172],
'rating':[2.5,3,3,2,4],
'timestamp':[1260759144,1260759179,1260759182,1260759185,1260759205]
})
df
# write your code here
class Managers(Employee):
def __init__(self, name, seniority, awards):
super().__init__(name, seniority)
self.intlawards = awards
def check_if_it_is_time_for_upgrade(self):
# for each accreditation, increase the counter by 1
# for now we assume that all of the developers pass the accreditation
self.seniority += 1
# condition of promoting an employee from the presentation
if (self.seniority + self.intlawards * 2) % 7 == 0:
self.grade_up()
# publication of the results
return self.publish_grade()
def add(a, b):
return a + b
add(1, 2)
def merge_arrays(arr1, arr2):
arr1.extend(arr2)
arr1.sort()
new_set = set(arr1)
new_list = list(new_set)
return new_list
def to_binary(n):
return bin(n).replace("0b", "")
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
df2.groupby('pulse')['respiratory_rate'].transform(round))
import pandas as pd
df = pd.read_csv("https://www.dropbox.com/s/jr9c7rwhi8hvuk7/performers.csv?dl=1")
df.sort_values(by=['time_on_chart','max'], ascending=False).head(20)
performer min max time_on_chart
9 "Weird Al" Yankovic 1 20 21
0 "Groove" Holmes 1 11 12
1 "Little" Jimmy Dickens 1 10 11
2 "Pookie" Hudson 1 1 2
import seaborn as sns
import matplotlib.pyplot as plt
def bar_graph(data):
plt.figure(figsize=(18, 6))
sns.barplot(data.index, data.values, alpha=0.8)
plt.title(str(data.name))
plt.ylabel('Count', fontsize=12)
plt.xlabel('Name', fontsize=12)
plt.show()
class Designer(Employee):
def __init__(self, name, seniority, awards=2):
super().__init__(name, seniority)
self.intlawards = awards
def check_if_it_is_time_for_upgrade(self):
self.seniority += 1
self.seniority += self.intlawards
if self.seniority % 7 == 0:
self.grade_up()
return self.publish_grade()
def fill_na(df, column_name):
corr = df.corr()[column_name]
new_value = corr.mean() * df[column_name].mean()
df[column_name] = df[column_name].fillna(new_value)
return df
fill_na(df2, column_name='rectal_temp')
def df_gdp_diff(df):
df19 = df[df['Year'] == 2019]
gdpdiff = pd.DataFrame({'Top1': df19[['Country or region', 'GDP per capita']]\
.sort_values(by='GDP per capita', ascending=False).head(20)[0:1],
'Top20': df19[['Country or region', 'GDP per capita']]\
.sort_values(by='GDP per capita', ascending=False).head(20)[19:20]})
return gdpdiff
df["operator"].apply(len).mean()
def df_incident_type(df):
df = df.groupby(['route', 'operator', 'group_name', 'incident_type'])[['incident_type']].count()
df = df.sort_values('incident_type', ascending=False)
df.reset_index(inplace=True)
df = df.drop_duplicates(subset=['route'], keep='first')
return df
for i in winnums:
money += 1
print(money)
sns.barplot(x = 'hotel', y ='difference', hue = 'date', data = hotels_rev)
plt.show()
def expression_matter(a, b, c):
return max([a * b * c, a * (b + c), (a + b) * c, a + b + c])
class Ball():
def __init__(self, ball_type="regular"):
self.ball_type = ball_type
def replace_in_column(df, column, old_value, new_value):
df[column].replace(old_value, new_value, inplace=True)
import pandas as pd
df = pd.DataFrame([
[1, "Ksenia Rodionova", "2021-07-01", "Alpina", 1639.000000, "by_recommendation", 48, 3.0],
[2, "Ulyana Selezneva", "2021-07-01", "AquaMania", 930.000000, "by_airbnb.com", 97, 4.0],
[3, "Konstantin Prokhorov", "2021-07-01", "Breeze", 1057.720000, "agg_trivago.com", 173, 4.0],
[4, "Petrov Vladimir", "2021-07-01", "Moreon", 1403.000000, "agg_onlinetours.ru", 229, 4.0],
[5, "Arina Selivanova", "2021-07-01", "Alpina", 1639.000000, "agg_sutochno.ru", 63, 4.0],
[6
# `if 'regular_customer' or ...` is always truthy; test membership explicitly
df4['loyal_profit'] = df4.apply(lambda x: (x.profit / x.ocup_rooms) if x.how_find_us in ('regular_customer', 'by_recommendation') else None, axis=1)
def high_rating(df):
for i in df['rating']:
if i > 6.5:
print("The years with most high rating movies are",df['decade']) #used for
def groupby_cnt(df):
df = pd.DataFrame(df.groupby("userId")["rating"].count())
df.columns = ['ratings_cnt']
return df
def square_or_square_root(arr):
new_arr = []
for i in arr:
if int(i**(1/2)) == i**(1/2):
new_arr.append(int(i**(1/2)))
else:
new_arr.append(i**2)
return new_arr
def replace(stnums, students, replacing_num):
for index, item in enumerate(students):
if item[0] in stnums:
students[index][0] = replacing_num
return students
replace(stnums, students, replacing_num)
def plot_barchart(df):
df = df.sort_values('perc_of_5star', ascending = False).head(10)
return df.plot.barh(x = 'decade', y = 'perc_of_5star', title = '% 5-star ratings by decade');
plot_barchart(df)
import pandas as pd
performer = ['Glee Cast', 'Taylor Swift', 'Drake', 'YoungBoy Never Broke Again', 'Aretha Franklin', 'The Beatles']
hits = ['Somebody To Love', 'Friday', 'Loser Like Me', 'Baby', 'I Want You Back', 'Kacey Talk', 'Put It On Me', 'Dirty Iyanna', 'Lil Top', 'London Boy', 'Teardrops On My Guitar', 'Fifteen', 'Summer Sixteen', 'The Language', 'Weston Road Flow', 'Sgt. Pepper\'s Lonely Hearts Club Band/With A Little Help From My Friends']
chart_debut = [2009, 2008, 2016, 2020, 1967, 1978]
time_on_chart = [290, 14299, 7449, 1012, 3490, 3548]
consecutive_weeks = [47.0, 11880.0, 6441.0, 625.0, 2921.0, 2798.0]
decade = ['2000-2010', '2000-2010', '2010-2020', '2020-2030
import pandas as pd
import numpy as np
def group(df):
df = df.pivot_table(index='rate_group', columns='grade', values='id', aggfunc=np.sum)
return df
df.plot(kind='box', subplots=True, layout=(4,2), sharex=False, sharey=False, fontsize=14, linewidth=3)
plt.show()
df['occupancy_rate'] = df.ocup_rooms / df.total_rooms
def unique(data: pd.DataFrame) -> pd.DataFrame:
    pass
# Task: modify the code below to draw a vertical bar chart instead of a pie chart
# (plotly.express library); a possible rewrite follows the original block.
question6 = "How likely would you work for a company whose mission is not bringing social impact ?"
question6 = data[question6].value_counts()
label = question6.index
counts = question6.values
colors = ['gold','lightgreen']
fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text='How likely would you work for a company whose mission is not bringing social impact?')
fig.update_traces(hoverinfo='label+value', textinfo='percent', textfont_size=30,
marker=dict(colors=colors, line=dict(color='black', width=3)))
fig.show()
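# A possible rewrite of the block above as a vertical bar chart with plotly.express,
# as the task asks (a sketch; assumes `data` and the px import are available):
import plotly.express as px
question6 = "How likely would you work for a company whose mission is not bringing social impact ?"
question6 = data[question6].value_counts()
label = question6.index
counts = question6.values
fig = px.bar(x=label, y=counts)  # categories on x, counts on y -> vertical bars
fig.update_layout(title_text='How likely would you work for a company whose mission is not bringing social impact?')
fig.show()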
def chart_peak(df):
df.sort_values(by=['song', 'peak_position'], inplace=True)
df.drop_duplicates('song', keep='first', inplace=True)
return df
def max_key(dct):
return max(dct, key=lambda key: dct[key]['Value'])
max_key(dct)
def distinct(seq):
return list(dict.fromkeys(seq))
distinct([1, 2, 2, 3, 4, 4, 5])
def size(delta, sigsqr, conf):
return ((sigsqr * 1.645 ** 2) / (delta ** 2))
def switch_elements(arr):
return [arr[-1]] + arr[1:-1] + [arr[0]]
def quadratic(x1, x2):
return (1, -x1 - x2, x1 * x2)
def group_list(dct, gr):
group_list = []
for k, v in dct.items():
if gr in v:
group_list.append(' '.join(dct[k][0:3]))
group_list.sort()
for i, n in enumerate(group_list):
print('{}. {}'.format(i+1, n))
group_list(dct, 'BST161')
# output
1. A. García de Leon
2. A. Martínez Martínez
3. A. Romero de la Fuente
4. C. Ramírez de Cartagena
5. E. González Gómez
6. F. García León
7. H. Solís Ortíz
8. J. Carlos
9. J. Fernández
10. J. Muñoz Solís
11. L. González Gómez
def highlight_min_max(df):
    # return the chained Styler; the original discarded both styling calls and returned the raw frame
    return df.style.highlight_max(axis=1,
                                  props='color:white;font-weight:bold;background-color:green;') \
                   .highlight_min(axis=1,
                                  props='color:white;font-weight:bold;background-color:brown;')
df3.loc[df3.how_find_us.str.contains('yandex') == True, ['how_find_us']]
def occupancy_rate(total_rooms, ocup_rooms):
return ocup_rooms / total_rooms
df['occupancy_rate'] = occupancy_rate(df['total_rooms'], df['ocup_rooms'])
def create_bar_chart(x, y, title):
fig = px.bar(x=x, y=y, orientation='h')
fig.update_layout(title_text=title)
fig.show()
def five_star_decade_value_counts(df):
df = df.loc[df.rating == 5.0]
return df.decade.value_counts()
def add(a, b):
return a + b
add(1, 2)
def interval(n, mean, sig, conf):
h = 2*sig*math.sqrt(n)*norm.ppf(conf)/math.sqrt(n)
return h
def sort_df(df):
return df.sort_values(ascending=False)
sort_df(mean_c)
def round_floats(df, col):
return df[col].apply(np.round)
def bar_x_axis(df, column_name, title):
#prepare the data
question = df[column_name].value_counts()
label = question.index
counts = question.values
#create the figure
fig = px.bar(x=counts, y=label, orientation='h')
fig.update_layout(title_text=title)
fig.show()
bar_x_axis(df,'What is the most preferred working environment for you.','Какая рабочая среда для вас наиболее предпочтительна?')
def type_checker(variable, expected_type):
    # a parameter named `type` would shadow the built-in; use isinstance for the check
    return isinstance(variable, expected_type)
ratings = pd.read_csv("ratings.csv")
ratings.head()
def generate_unique_id(df):
df['unique_id'] = range(1, len(df) + 1)
def is_letter_in_Series(row):
if row["Series"] != row["Series"]:
return False
if any(x.isalpha() for x in row["Series"]):
return True
return False
import pandas as pd
dfs = pd.read_csv('https://raw.githubusercontent.com/coding-blocks-archives/ML-Noida-2019-June-Two/master/datasets/hot-100.csv', parse_dates=['chart_debut'])
dfs['chart_debut'] = dfs['chart_debut'].dt.year
dfs.head(10)
def join(rzd, auto):
# YOUR CODE HERE
joined = rzd.join(auto, how = 'outer')
return joined
def unique_id(df):
return [df['line_id'][i] for i in range(df.shape[0])]
unique_id(df)
def plot_num_of_hits(df):
df = df.sort_values("num_of_hits", ascending=False)
# make sure you have a matplotlib
# import matplotlib.pyplot as plt
plt.bar(df.performer, df.num_of_hits)
plt.show()
hotels_rev = df1[['date', 'hotel', 'revenue', 'av_revenue', 'difference', 'in_percent']].sort_values(by=['hotel', 'date'])
plt.bar(hotels_rev['date'], hotels_rev['av_revenue'], color='red')
plt.xlabel('Date')
plt.ylabel('Average revenues')
plt.title('Average revenues per hotel')
plt.xticks(rotation=90)
plt.show()
def move(directories, doc, shelf):
    if shelf not in directories:
        return "ERROR NO SUCH KEY"
    if doc in directories[shelf]:
        return "ERROR VALUE ALREADY EXISTS"
    # find the shelf that currently holds the document, move it and stop
    for docs in directories.values():
        if doc in docs:
            docs.remove(doc)
            directories[shelf].append(doc)
            return directories
    return "ERROR NO SUCH VALUE"
directories = {
'1': ['2207 876234', '11-2'],
'2': ['10006'],
'3': []
}
doc = '11-2'
shelf = '3'
move(directories, doc, shelf)
# Use index as a unique identifier
df.index
# Use a column as the unique identifier
df['year'].values
def total_ingridients(cook_book):
dish = input("Enter dish: ")
ingridients = cook_book[dish]
for i in ingridients:
for value in i.values():
print(value)
total_ingridients(cook_book)
def func_name(group_number, dct):
students = []
for key, value in dct.items():
if value[4] == group_number:
students.append(value)
students.sort(key=lambda x: x[0])
for index, student in enumerate(students):
print(index, student[0], student[1], student[2])
func_name('BST162', dct)
def merge_cols(df, col_list):
    # collect the source_type values of each group into a tuple
    return df.groupby(col_list)['source_type'].apply(tuple).reset_index(name='source_type')
merge_cols(df, ['traffic_source', 'region'])
def get_sample_size(z=1.96, conf=0.95, sigsqr=1, delta=0.5):
return (z*z*sigsqr)/(delta*delta)
hotels_rev.boxplot(column='av_revenue', by='hotel')
def how_much_water(water, load, clothes):
if clothes == load:
return water
if clothes < load:
return water
return how_much_water(water, load, clothes - 1) * 1.1
# incorrect
def fig(x=counts, y=label):
fig = px.bar(x=counts, y=label, orientation='h')
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
fig.show()
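# A possible correction for the snippet flagged above: its default arguments are
# evaluated from the global counts/label at definition time, and it shadows `fig`.
# A sketch that takes the data explicitly (assumes plotly.express as px):
def plot_horizontal_bar(counts, label, title):
    fig = px.bar(x=counts, y=label, orientation='h')
    fig.update_layout(title_text=title)
    fig.show()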
def interval(n, mean, sig, conf):
h = stats.norm.interval(conf, loc=mean, scale=sig / np.sqrt(n))[1] - stats.norm.interval(conf, loc=mean, scale=sig / np.sqrt(n))[0]
return round(h)
def move(directories, v, k):
    if k not in directories:
        print('ERROR NO SUCH KEY')
        return directories
    elif v not in [doc for docs in directories.values() for doc in docs]:
        print('ERROR NO SUCH VALUE')
        return directories
    else:
        # remove the document from its current shelf first, then place it on the target shelf
        for value in directories.values():
            if v in value:
                value.remove(v)
        directories[k].append(v)
        return directories
directories = {
'1': ['2207 876234', '11-2'],
'2': ['10006'],
'3': []
}
move(directories, '11-2', '3')
df[df[['name']].duplicated(keep=False)]
df['name'].sort_values()
def html_escape(text):
    # escape & first so the other entities are not double-escaped
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    text = text.replace('"', '&quot;')
    return text
def group_lst(num):
for i in dct[num]:
print(i)
return
import math
import scipy
from scipy import stats
def sample_size(delta, sigsqr, conf):
z = stats.norm.ppf(conf)
n = math.ceil((2*z*z*sigsqr)/(delta**2))
return n
sample_size(10, 100, 0.95)
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
fig.show()
import math
class Point(object):
def __init__(self, x=0, y=0):
self.x = x
self.y = y
# TODO Write a function calculating distance between Point a and Point b.
def distance(a, b):
c = math.sqrt((a.x - b.x)**2 + (a.y - b.y)**2)
return c
a = Point(1,1)
b = Point(1,2)
print(distance(a,b))
df_new['unique_id'] = pd.Series(range(1, df_new.shape[0]+1))
df_new.head()
sl = [0.067,0.067,0.067,0.067,0.067,0.067,0.067,0.067,0.067,0.067]
sw = [0.050,0.050,0.050,0.050,0.050,0.050,0.050,0.050,0.050,0.050]
# build the dict that the DataFrame below expects from the two scaled lists
scaled_data = {'sl': sl, 'sw': sw}
df = pd.DataFrame(scaled_data, columns=['sl', 'sw'])
def decade(df):
df = df[df.rating == 5.0]
return df.decade.value_counts()
from sklearn.metrics import f1_score
y_pred = lda.predict(X_test)
f1_score(y_test, y_pred)
def year_leaders(df):
return df.groupby('chart_debut')['num_of_hits'].max()
def group_movies(df):
def is_month_end(date):
if date[-2:] == '31':
return 1
else:
return 0
df['Date'].apply(is_month_end)
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
le = LabelEncoder()
le.fit(data_class)
data_class = le.transform(data_class)
X_train, X_test, y_train, y_test = train_test_split(data, data_class, random_state=42)
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro') # average='macro'
print('F1:', f1)
#confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
def add(a, b):
return a + b
add(1, 2)
def how_much_water(water, load, clothes):
return water * (1.1 ** (clothes - load))
how_much_water(5, 10, 14)
region direct yandex google
0 Russia 1 4 0
1 Germany 0 1 0
2 USA 0 0 1
3 Italy 0 1 0
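# A sketch of how a table like the one above could be built from a long-format log
# of visits; the `visits` frame and its column names are assumptions for illustration:
import pandas as pd
visits = pd.DataFrame({
    'region': ['Russia', 'Russia', 'Russia', 'Russia', 'Russia', 'Germany', 'USA', 'Italy'],
    'traffic_source': ['direct', 'yandex', 'yandex', 'yandex', 'yandex', 'yandex', 'google', 'yandex'],
})
pd.crosstab(visits['region'], visits['traffic_source'])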
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
from math import sqrt
def distance(a, b):
x_diff_sq = (a.x - b.x)**2
y_diff_sq = (a.y - b.y)**2
return round(sqrt(x_diff_sq + y_diff_sq), 2)
from math import sqrt
def distance(a, b):
return sqrt((a.x - b.x)**2 + (a.y - b.y)**2)
def replace_student(lst):
result = []
for student in lst:
if student[0] == student[-2]:
student[-2] = '9090'
result.append(student)
return result
print(replace_student(lst))
df['is_loyal'] = df.duplicated(subset='name', keep=False).apply(lambda x: 'True' if x else 'False')
# define the dict before building the generator over its quantities
y = {'Marlboro': [3, 13, 6, 66, 13, 7, 13]}
out = (x * 5 for x in y.get('Marlboro'))  # scale each quantity by 5 portions
for el in out:
    print(el)
import pandas as pd
def get_dataframe(sl, sw):
df = pd.DataFrame({
'sl': sl,
'sw': sw
})
return df
sl = [1, 2, 3, 4, 5]
sw = [1, 2, 3, 4, 5]
df = get_dataframe(sl, sw)
df
def find_index(lst, ind):
return lst[ind % len(lst)]
find_index(["a", "b", "c", "d"], 1)
df = pd.DataFrame({'chart_debut': ['2012', '2012', '2012', '2014', '2017'], 'num_of_hits': [1,2,3,4,5]})
def year_leaders(df):
return df.groupby('chart_debut').max()
year_leaders(df)
def goes_after(word, first, second):
    # both letters must be present, and `second` must come right after `first`
    s = word.find(first)
    if s != -1 and word.find(second) != -1:
        return s + 1 < len(word) and word[s + 1] == second
    return False
goes_after("world", "o", "r")
def prepare_dish(dish, portions):
for ingridient in dish:
ingridient['quantity'] = ingridient['quantity'] * portions
return dish
prepare_dish(dish, portions)
def year_leaders_all(dfp):
dfp_copy = dfp.copy()
dfp_copy.sort_values(by=['chart_debut','num_of_hits'], ascending=False, inplace=True)
dfp_copy.drop_duplicates(subset='chart_debut', keep='first', inplace=True)
return dfp_copy
year_leaders_all(dfp)
df3[df3.how_find_us.str.contains('yandex')].drop_duplicates(keep=False)
class Solution(object):
def main(self):
print("Hello World!")
Solution.main("parameter1","parameter2")
import math
class Point:
def __init__(self, x, y):
self.x = x
self.y = y
def distance(a, b):
return math.sqrt((a.x - b.x)**2 + (a.y - b.y)**2)
def move_doc(directories, doc, shelf):
    if shelf in directories:
        # look for the document on any shelf; move it only if it was found
        found = False
        for value in directories.values():
            if doc in value:
                value.remove(doc)
                found = True
        if found:
            directories[shelf].append(doc)
        else:
            print('No such value')
    else:
        print('No such key')
    return directories
directories = {
'1': ['2207 876234', '11-2', '5455 028765'],
'2': ['10006', '5400 028765', '5455 002299'],
'3': []
}
doc = '11-2'
shelf = '3'
move_doc(directories, doc, shelf)
def how_much_water(L,X,N):
return L * (1+0.1)**(N-X)
how_much_water(5, 10, 14)
def _if(bool, func1, func2):
if bool:
func1()
else:
func2()
def truthy():
print("True")
def falsey():
print("False")
_if(True, truthy, falsey)
plt.xticks(ks)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 1 16:34:47 2020
@author: jordan
"""
def ends77(x):
return x % 100 == 77
def ends7(x):
return x % 10 == 7
def ends00(x):
return x % 100 == 0
def ends0(x):
return x % 10 == 0
def dropdollar(x):
return x % 100 == 0 or x % 100 == 7
def dropdollars(x):
return x % 100 == 0 or x % 100 == 7
import numpy as np
money = 0
successes = 0
trials = 100000
for i in range(trials):
num = np.random.randint(1, 1000)
if num == 777: money += 200; successes += 1
elif num == 999: money += 100; successes += 1
elif num == 555: money += 50; successes += 1
elif num == 333: money += 15;
def max_in_dictionary(d):
max_key = max(d, key=lambda key: d[key]['Value'])
return (max_key, d[max_key])
max_in_dictionary(rates)
def am_i_wilson(n):
    from math import factorial
    # Wilson primes p satisfy ((p - 1)! + 1) % p**2 == 0; only 5, 13 and 563 are known
    return n > 1 and (factorial(n - 1) + 1) % (n * n) == 0
# Write your code here
import pandas
users = pandas.read_csv('ml-100k/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
ratings = pandas.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
# lifetime is max - min per user (not the mean of max and min), then averaged
lifetimes = ratings.groupby('user_id')['timestamp'].agg(['max', 'min'])
avg_lifetime = (lifetimes['max'] - lifetimes['min']).mean()
print(avg_lifetime)
def to_csv_string(array):
return '\n'.join([','.join([str(j) for j in i]) for i in array])
to_csv_string([[ 0, 1, 2, 3, 4 ],
[ 10,11,12,13,14 ],
[ 20,21,22,23,24 ],
[ 30,31,32,33,34 ]])
def rich_people(df, a, b):
return df[(df.annual_inc >= a) & (df.annual_inc <= b)]
rich_people(df, 100000, 1000000)
def hot_years_create(df_ru, min_temp):
df_ru_hot = df_ru.groupby(['year'])['av_temp'].mean().reset_index()
return df_ru_hot[df_ru_hot['av_temp'] > min_temp]
hot_years_create(df_ru, 15)
def how_much_water(water, load, clothes):
if load >= clothes:
return water
else:
return water * 1.1 ** (clothes - load)
how_much_water(5, 10, 14)
def to_binary(n):
return bin(n)
def round(a, b):
return a + b
add(1, 2.5)
def remove_duplicate(df):
    # return only the rows whose name occurs more than once
    return df[df.duplicated(subset="name", keep=False)]
def interval(n, mean, sig, conf):
h = sig / (n ** 0.5) * norm.ppf(conf)
return int(h)
def sum_matrix(N):
    # np.diagonal needs a 2-D array; build the diagonal matrix first, then sum it
    my_matrix = np.diag(np.arange(N - 1, -1, -1))
    return my_matrix.sum()
print(sum_matrix(5))
print(sum_matrix(10))
print(sum_matrix(15))
def math(a):
    # type(a) == 'str' compares a type object with a string and is always False
    if isinstance(a, str):
        return "Error"
    else:
        return (a * 50) + 6
math(5)
def is_month_end(date):
if date[-2:] == '31':
return 1
else:
return 0
df['is_month_end'] = df['Date'].apply(is_month_end)
df.head()
def columns_to_rows(dataframe):
new_dataframe = pd.DataFrame(dataframe.loc[0]).T
return new_dataframe
print(', '.join(map(str, range(1, len(group_list(dct, 'BST161')) + 1))))
def max_key(dct):
max_v = 0
max_k = ' '
for key in dct:
if dct[key]['Value'] > max_v:
max_v = dct[key]['Value']
max_k = key
return max_k
dct = {'a': {'Value': 1, 'Other': 2}, 'b': {'Value': 5, 'Other': 4}, 'c': {'Value': 3, 'Other': 4}}
max_key(dct)
def filter_df(df, column):
return df[df[column].duplicated(keep=False)].sort_values(column)
df = pd.DataFrame({'name': ['Ksenia Rodionova', 'Ulyana Selezneva', 'Konstantin Prokhorov',
'Petrov Vladimir', 'Arina Selivanova', 'Ksenia Rodionova'],
'profit_per_room': [1639.000000, 930.000000, 1057.720000, 1403.000000, 1639.000000, 1639.000000]})
filter_df(df, 'name')
df = pd.DataFrame([['Ksenia Rodionova', 'Artur Petrov', 'Ivan Sidorov', 'Ksenia Rodionova']]).T
df.columns = ['name']
df.drop_duplicates(keep = 'first', inplace = True)
df.sort_values(by = 'name', ascending = True)
df_type['App'].groupby(df_type['Type']).sum().plot(kind='pie',
figsize=(5, 6),
autopct='%1.1f%%', # add in percentages
startangle=90, # start angle 90° (Africa)
shadow=True, # add shadow
)
plt.title('Pie chart of the Repartition between Free and Paid Apps')
plt.axis('equal') # Sets the pie chart to look like a circle.
plt.show()
def find_non_numbers(df, col):
    # reassigning the loop variable does not change df; replace strings with NaN in the column itself
    df[col] = df[col].apply(lambda v: np.nan if isinstance(v, str) else v)
find_non_numbers(df, 'loan_amnt')
def ratings(x):
if x <= 2.0:
return 'Low'
elif x <= 4.0:
return 'Average'
else:
return 'High'
df['rating'] = df['rating'].apply(ratings)
df.head()
def transpose(matrix):
return list(map(list, zip(*matrix)))
transpose(matrix)
sl = [0.8,0.9,0.9,1.0,1.1,1.1,1.2,1.4,1.4,1.5,1.6,1.6,1.7,1.7,1.8,1.8,1.8,1.9,1.9,2.0,2.0,2.1,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.3,2.3,2.3,2.3,2.3,2.4,2.4,2.4,2.4,2.4,2.4,2.4,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.
def chart_to_hits(df):
# transform keeps the original index, so the assignment aligns row by row
df['hits'] = df.groupby('performer')['song'].transform(lambda x: ','.join(x))
df.drop_duplicates(subset = 'performer', inplace = True)
return df
def divide_decade(df, decade):
decade_df = df[df['chart_debut'] // 10 == decade // 10]
return decade_df
arr = [2,1,10,5]
def sum(arr):
i = 0
res = 0
while i < len(arr):
j = i + 1
while j < len(arr):
res += arr[i] + arr[j]
j += 1
i += 1
return res
sum(arr)
import numpy as np
from scipy import stats
def seed(seed):
np.random.seed(seed)
n = np.random.choice(range(10,26))
mean = np.random.choice(range(120,141))
sig = np.random.choice(range(10,21))
conf = np.random.choice([0.90, 0.95, 0.98, 0.99, 0.999])
return n, mean, sig, conf
def interval(n, mean, sig, conf):
h = sig * stats.t.ppf((1+conf)/2, n-1) / np.sqrt(n)
return np.round(h)
print(interval(*seed(12)))
print(interval(*seed(45)))
print(interval(*seed(7)))
# The result of the function should be three values: 12, 28, 21
import math
def get_sample_size(error, variance, confidence):
sqrt_variance = math.sqrt(variance)
return math.ceil((error * math.sqrt(2 * (1 - confidence)) * sqrt_variance) / (error * error))
get_sample_size(0.02, 0.05, 0.95)
df2['region'] = df2['keyword'].apply(geo_class)
def replace_vowels(vowels):
vowel_codes = [97, 101, 105, 111, 117] # a, e, i, o, u
result = []
for vowel in vowels:
if isinstance(vowel, str):
result.append(vowel)
elif vowel in vowel_codes:
result.append(chr(vowel))
else:
result.append(vowel)
return result
inp = [118, "u",120,121,"u",98,122,"a",120,106,104,116,113,114,113,120,106 ]
print(replace_vowels(inp))
def object_finder(row):
if type(row['values']) == str or type(row['values']) == list:
return row['values']
else:
return None
df['object'] = df.apply(object_finder, axis=1)
sns.barplot(data=top20,
x='Score',
y='Country or region',
color='#5ed14f')
plt.xlim(6,8)
# 1. how much water does my washing machine use
# 2. how much water does my clothes need for washing
# 1. 5 litres
# 2. (1.1 ^ (14 - 10)) * 5
# 3. 5 * 1.1 ^ 4
# 4. 5 * 1.1 * 1.1 * 1.1 * 1.1 = 7.3205
def how_much_water(water, load, clothes):
return (1.1 ** (clothes - load)) * water
print(how_much_water(5, 10, 14))
def generate_unique_id(dataframe):
dataframe.index += 1
return dataframe
generate_unique_id(df_new)
def respiratory_rate_function(x):
if x < 1:
return 1
elif x > 4:
return 4
else:
return round(x)
df2['respiratory_rate'] = df2['respiratory_rate'].apply(respiratory_rate_function)
def sort_by_max(x):
return x.sort_values(by='max', ascending=True)
def most_5(data):
    # count ratings per movie, then return the most-rated one
    counts = data.groupby('movieId')['rating'].count()
    return counts.sort_values(ascending=False).head(1)
most_5(df)
def find_difference(a, b):
return abs(reduce(lambda x,y: x*y, a) - reduce(lambda x,y: x*y, b))
def avg_date_by_operator(df_new):
# YOUR CODE HERE
# raise NotImplementedError()
return df_new.groupby("operator").agg("date").mean()
df_2015 = pd.read_csv("bus_trucks_2015.csv")
avg_date_by_operator(df_2015)
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({'Day':['Tuesday','Wednesday','Thursday','Friday','Saturday','Monday','Sunday'],
'Value':[358114,345393,323337,293805,292016,278905,273823]})
df.plot.barh()
df[df.duplicated(subset=['name'], keep= False)][['name']]
import pandas as pd
# function that: calculate the average temperature in countries
def average_temp_of_country(df):
return df.groupby('country')['av_temp'].mean()
# function that: build a list of the 20 coldest countries in ascending av_temp order
def coldest_20_countries(df):
return df.groupby('country')['av_temp'].mean().sort_values()[:20]
from math import sqrt
def get_sample_size(delta, conf, sigsqr):
z = 1.96 # z-score for 95% confidence level
return int(sigsqr * z**2 / delta**2)
get_sample_size(100, 0.95, 2000) # => 477
my_regex = re.compile(r"[a-z][A-Z][0-9]{4,16}")
def dataframe(sl, sw):
data = {'sl': sl, 'sw': sw}
df = pd.DataFrame(data)
return df
sl = [[-0.90068117], [-1.14301691], [-1.38535265], [-1.50652052], [-1.02184904], [-0.53717756], [-1.50652052], [-1.02184904], [-1.74885626], [-1.14301691]]
sw = [[3.5], [2.5], [2.4], [1.5], [3.5], [2.2], [2.1], [1.5], [1.1], [1.3]]
dataframe(sl, sw)
def my_function(x1, x2):
    x1.plot(kind='hist',
            alpha=0.5,
            bins=6,
            density=True)
    x2.plot(kind='hist',
            alpha=0.5,
            bins=6,
            density=True)
    # add the legend and title after plotting so they apply to the drawn axes
    plt.legend(loc='upper left')
    plt.title('Сравнение распределений с собственным жильем и без')
    return x1, x2
my_function(x1, x2)
def replace_non_numbers(df, column):
df[column] = pd.to_numeric(df[column], errors = 'coerce')
import pandas as pd
def divide_hotels(df):
    big_hotels = []
    medium_hotels = []
    small_hotels = []
    # walk rooms and profit together so each appended value belongs to the same row
    for rooms, profit in zip(df['total_rooms'], df['profit']):
        big_hotels.append(profit if rooms > 30 else 0)
        medium_hotels.append(profit if 20 < rooms <= 30 else 0)
        small_hotels.append(profit if 10 < rooms <= 20 else 0)
    df['big_hotels'] = big_hotels
    df['medium_hotels'] = medium_hotels
    df['small_hotels'] = small_hotels
    return df
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
df2.groupby('pulse')['respiratory_rate'].transform('median'))
def my_f(col_name, df):
# get the rows for which the column is NaN
df = df[ np.isnan(df[col_name]) ]
# find the rows that have similar pulse
df_pulse = df[ abs( (df.pulse - df.pulse.shift(1)) / df.pulse ) <= 0.2 ]
# find the rows that have similar respiratory_rate
df_respiratory_rate = df[ abs( (df.respiratory_rate - df.respiratory_rate.shift(1)) / df.respiratory_rate ) <= 0.2 ]
# merge the dataframes
df_merged = pd.concat( [df_pulse, df_respiratory_rate] )
# get rid of duplicates
df_merged = df_merged.drop_duplicates()
# return the rows
return df_merged
def replace(item, list1, replacing_number):
for i in range(0,len(list1)):
for j in range(0,len(list1[i])):
if list1[i][j] == item:
list1[i][j] = replacing_number
replace('4004', students, '9090')
def filter_coldest(df, year_threshold):
return df[df['year'] > year_threshold]\
.groupby('country')['av_temp'].mean()\
.sort_values()[:20]
filter_coldest(df, 1980)
from scipy import stats
import numpy as np
sync = [85.1, 83.8, 69.9, 82.1, 84.4, 80.4, 78.1, 88.4, 77., 91.5, 76.7, 86.6, 91.8, 73.3, 83.9, 76.7, 85.8, 89.6, 91.7, 87.2, 79., 85.3]
asyncr = [89.8, 81.6, 87.4, 81., 66.9, 72.5, 78.4, 68.5, 78.3, 62.6, 73.7, 77.7, 63., 77.5]
stats.ttest_ind(sync, asyncr, equal_var = False)
np.var(sync), np.var(asyncr)
def find_non_numbers(df, column):
return df[~df[column].astype(str).str.isnumeric()]
df = pd.read_csv('ratings.csv')
df.head()
df.columns
df.groupby('userId').size()
df.groupby('userId').size().index
df.groupby('userId').size().values
#df.groupby('userId').size().values > 100
df.groupby('userId').size().values[df.groupby('userId').size().values > 100]
df.groupby('userId').size().index[df.groupby('userId').size().values > 100]
df[df['userId'] == 1]
df[df['userId'] == 1].timestamp
df[df['userId'] == 1].timestamp.diff()
df[df['userId'] == 1].timestamp.diff().min()
df[df['userId'] == 1].timestamp.diff().max()
df[df['userId'] == 1].timestamp.diff().max() - df[df['userId'] == 1].timestamp.diff().min()
def get_lif
df = pd.DataFrame({'date': ['1743-12-01', '1744-01-01', '1744-02-01', '1744-03-01', '1744-08-01'],
'av_temp': [0, 10, 20, 30, 40],
'deviations': [0, 10, 20, 30, 40],
'country': ['Åland', 'Åland', 'Åland', 'Åland', 'Åland']
})
# Find the rows of the dataframe where the values in the name column are duplicated
df[df.duplicated(subset='name', keep=False)]
# Build a new dataframe that keeps only the first occurrence of each name and drops the later duplicates, sorted by name in ascending order
df.sort_values("name").drop_duplicates(subset="name", keep='first')
from collections import Counter
def most_incident(df, column):
# Count incident_type by route
incident_id = Counter(df[column])
# Return key of the most incident_type
return incident_id.most_common(1)[0][0]
most_incident(df_new, 'route')
# Import necessary libraries
from math import sqrt, erf
def interval(n, mean, sig, conf):
z = erf(conf + 0.5)
h = z * sig / sqrt(n)
return int(h)
# The result of the function should be three values: 12, 28, 21
interval(100, 100, 10, 0.95)
interval(100, 100, 10, 0.99)
interval(100, 100, 10, 0.995)
def get_id(df):
return df.id
get_id(df)
def to_int64(df):
for i in df.columns:
df[i] = df[i].astype('int64')
return df
def get_most_incidents_route(df):
# count incident_type by the route
df_count = df.groupby('route').count()
# sort the df_count by incident_type and get the first row
df_count.sort_values(by='incident_type', inplace=True, ascending=False)
return df_count.iloc[0]
get_most_incidents_route(df_tfl)