CS301Proj

Publish Date: 2019-10-31

Author: Naixin Zhang

Word Count: 1.4k

Read Times: 8 Min

Read Count:

CS301_Proj6

Introduction

This project is a wine connoisseurs’ delight! Data Science can help us understand people’s drinking habits around the world. For example, take a look at Mona Chalabi’s analysis here: Where Do People Drink The Most Beer, Wine And Spirits?

For our part, we will be exploring a modified subset (the first 1501 rows) of the Kaggle wine reviews dataset; you will be using various string manipulation functions that come with Python as well as rolling some of your own to solve the problems posed. Happy coding, and remember the Ballmer Peak is nothing but a myth!

# project: p6
# submitter: naixinzhang
# partner: none

import csv

# copied from https://automatetheboringstuff.com/chapter14/
def process_csv(filename):
    exampleFile = open(filename, encoding="utf-8")
    exampleReader = csv.reader(exampleFile)
    exampleData = list(exampleReader)
    exampleFile.close()
    return exampleData

# use process_csv to pull out the header and data rows
csv_rows = process_csv("wine.csv")
csv_header = csv_rows[0]
csv_data = csv_rows[1:]

def cell(row_idx, col_name):
    col_idx = csv_header.index(col_name)
    val = csv_data[row_idx][col_idx]
    if val == "":
        return None
    elif col_name == 'points':
        return int(val)
    elif col_name == 'price':
        return float(val)
    else:
        return val

#q1 which country names are listed in the country column of the dataset?
def country_names():
    a = set()
    for i in range(len(csv_rows) - 1):
        val = cell(i,'country')
        if val is not None:
            a.add(val)
    return list(a)
country_names()

['South Africa',
 'Chile',
 'France',
 'Slovenia',
 'Morocco',
 'New Zealand',
 'Germany',
 'Argentina',
 'Bulgaria',
 'Canada',
 'Austria',
 'Romania',
 'Spain',
 'Israel',
 'India',
 'Croatia',
 'US',
 'Moldova',
 'Greece',
 'Australia',
 'Italy',
 'Hungary',
 'Portugal']

#q2 what is the average points (rating) of wine?
def ave_points():
    sum = 0
    for i in range(len(csv_rows)-1):
        val = cell(i, 'points')
        if val is not None:
            sum = sum+val
    return sum / (len(csv_rows)-1)
ave_points()

89.65489673550967

#q3 List all wineries which produce wine in South Africa?
def wineries_south_africa():
    wineries_south = set()
    for i in range(len(csv_rows) - 1):
        if cell(i,'country') == 'South Africa' and cell(i,'winery') is not None:
            wineries_south.add(cell(i,'winery'))
    return list(wineries_south)
wineries_south_africa()

['Fat Barrel',
 'KWV',
 'Vergelegen',
 'Bouchard Finlayson',
 'Graham Beck',
 'Noble Hill',
 'Long Neck',
 'Waterkloof',
 'Essay',
 'Robertson Winery',
 'Neil Ellis']

#q4 which wine varieties contain the phrase "caramelized" in the description?
def q4():
    variety_cara = set()
    for i in range(len(csv_rows) - 1):
        if 'caramelized' in cell(i,'description') or 'Caramelized' in cell(i,'description'):
            variety_cara.add(cell(i,'variety'))
    return list(variety_cara)
q4()

['Chardonnay', 'White Blend', 'Rhône-style White Blend', 'Syrah']

#q5 which wine varieties contain the phrase "lemon-lime soda" in the description?
def q5():
    variety_le = set()
    for i in range(len(csv_rows) - 1):
        if 'lemon-lime soda' in cell(i,'description') or 'Lemon-lime soda' in cell(i,'description'):
            variety_le.add(cell(i,'variety'))
    return list(variety_le)
q5()

['Chardonnay', 'Pinot Grigio', 'Sparkling Blend']

#q6 which wine varieties contain the phrase "cherry-berry" in the description?
def q6():
    variety_ch = set()
    for i in range(len(csv_rows) - 1):
        if 'cherry-berry' in cell(i,'description') or 'Cherry-berry' in cell(i,'description'):
            variety_ch.add(cell(i,'variety'))
    return list(variety_ch)
q6()

['Rosé', 'Nebbiolo', 'Cabernet Sauvignon']


def areAnagram(str1, str2):  
    # Remove spaces of str1 and str2
    str1 = str1.replace(" ","")
    str2 = str2.replace(" ","")
    # Converts all uppercase characters to lowercase
    str1 = str1.lower()
    str2 = str2.lower()
    # Get lengths of both strings  
    n1 = len(str1)  
    n2 = len(str2)  
    str1 =list(str1)
    str2 = list(str2)

    # If lenght of both strings is not same, then  
    # they cannot be anagram  
    if n1 != n2:  
        return 0

    # Sort both strings  
    str1 = sorted(str1) 
    str2 = sorted(str2) 

    # Compare sorted strings  
    for i in range(0, n1):  
        if str1[i] != str2[i]:  
            return 0

    return 1

def check(str3):
    variety_anagrams = set()
    for i in range(len(csv_rows) - 1):
        if areAnagram(cell(i,'variety'),str3) == 1 :
            variety_anagrams.add(cell(i,'variety'))
    return variety_anagrams

#q7 which wine varieties are anagrams of the phrase "antibus governance"?
def q7():
    return list(check('antibus governance'))
q7()

['Cabernet Sauvignon']

#q8 which wine varieties are anagrams of the phrase "Banned Petrol Mill".
def q8():
    return list(check('Banned Petrol Mill'))
q8()

['Tempranillo Blend']

def find_highest(col, target, place):
    h_curr = 0
    result = set()
    for i in range(len(csv_rows) - 1):
        if place == cell(i, 'country'):
            if cell(i, col) is not None:
                if cell(i,col) == h_curr:
                    result.add(cell(i, target))
                if cell(i, col) > h_curr:
                    h_curr = cell(i, col)
                    result = set()
                    result.add(cell(i, target))
    return result

#q9 which winery produces the highest-priced wine in "US"?
def q9():
    return list(find_highest('price', 'winery', 'US'))
q9()

['Hall']

#q10 what is the highest-rated wine variety made in "France"?
def q10():
    return list(find_highest('points', 'variety','France'))
q10()

['Tannat', 'Malbec', 'Provence red blend']

def get_ratio(winery):
    sum = 0
    j = 0
    for i in range(len(csv_rows) - 1):
        if cell(i,'winery') == winery and cell(i,'points') != "" and cell(i,'price')!="":
            sum = sum + cell(i,'points') / cell(i,'price')
            j = j+1
    return sum/j

#q11 what is the average points-per-dollar (PPD) ratio of the "Ponzi" winery?
def q11():
    return get_ratio('Ponzi')
q11()

1.288074888074888

#q12 what is the average PPD of the "Blue Farm" winery?
def q12():
    return get_ratio('Blue Farm')
q12()

1.3628968253968254

def find_lowest_average_PPD(place):
    lowest_curr = float("inf")
    result = set()
    for i in range(len(csv_rows) - 1):
        if place == cell(i, 'country'):
            if cell(i, 'points') is not None and cell(i, 'price') is not None:
                ratio_curr = get_ratio(cell(i,'winery'))
                if ratio_curr == lowest_curr:
                    result.add(cell(i, 'winery'))
                if ratio_curr < lowest_curr:
                    lowest_curr = ratio_curr
                    result = set()
                    result.add(cell(i, 'winery'))
    return result

#q13 which winery in New Zealand has the lowest average PPD?
def q13():
    return list(find_lowest_average_PPD('New Zealand'))[0]
q13()

'Kumeu River'

#q14: which winery in Australia has the lowest average PPD?
def q14():
    return list(find_lowest_average_PPD('Australia'))
q14()

['Dalrymple', "D'Arenberg"]

#q15 which winery in Canada has the lowest average PPD?
def q15():
    return list(find_lowest_average_PPD('Canada'))[0]
q15()

'Burrowing Owl'

def get_varieties(winery):
    res = set()
    for i in range(len(csv_rows) - 1):
        if cell(i,'winery') == winery:
            res.add(cell(i, 'variety'))
    return list(res)

#q16 which wine varieties are produced by the "Global Wines" winery?
def q16():
    return list(get_varieties('Global Wines'))
q16()

['Portuguese Sparkling', 'Portuguese Red', 'Touriga Nacional']

#q17 which wine varieties are produced by the "Quinta Nova de Nossa Senhora do Carmo" winery?
def q17():
    return list(get_varieties('Quinta Nova de Nossa Senhora do Carmo'))
q17()

['Portuguese White', 'Portuguese Rosé', 'Portuguese Red']

#q18 what percentage of the varieties produced by "Global Wines" are also produced by "Quinta Nova de Nossa Senhora do Carmo"?
def q18():
    temp_1 = set()
    temp_2 = set()
    for i in get_varieties('Global Wines'):
        temp_1.add(i)
    for j in get_varieties('Quinta Nova de Nossa Senhora do Carmo'):
        temp_2.add(j)
    return len(temp_1 & temp_2)/len(temp_1) * 100
q18()

33.33333333333333

def find_cheapest(place):
    cheapest_curr = float("inf")
    for i in range(len(csv_rows) - 1):
        if place == cell(i, 'country'):
            if cell(i, 'price') is not None:
                if cell(i, 'price') < cheapest_curr:
                    cheapest_curr = cell(i, 'price')
    return cheapest_curr

#q19 what is the cost of cheapest wine that you could find in US?
def q19():
    return find_cheapest('US')
q19()

10.0

def find_costliest(place):
    costliest_curr = 0
    for i in range(len(csv_rows) - 1):
        if place == cell(i, 'country'):
            if cell(i, 'price') is not None:
                if cell(i, 'price') > costliest_curr:
                    costliest_curr = cell(i, 'price')
    return costliest_curr

#q20 What is the total cost of buying the two bottles?
def q20():
    return find_cheapest('US') + find_costliest('US')
q20()

335.0