Starcraft 2 is a 1v1 competitive multiplayer game in which you select one of three factions, called races, that do battle on a variety of battlefields, called maps. Each map has a different look, feel, and set of features that might make it stronger against one race or another.
When designing my tournament, I wanted to ensure that I could have as many interesting maps see play as possible. The thing is, some maps have a faction winrate balance close to 50-50 for two factions, but are completely imbalanced for the other faction. My solution was to make separate map pools for each faction combination.
Liquipedia.net is a website devoted to recording esports results, and includes matchup winrate information for every map that has seen play on the competitive 1v1 ladder. I created a webscraping script (run in a Jupyter notebook) that found all of the map names and, after some minor reformatting, iterated through each of them — extracting from the corresponding webpages the pertinent winrate information for each matchup, as well as map size. It also drilled down into the image of the map to get the highest-resolution version, and downloaded it to my computer.
Next, check out the Winrate Analysis / Map Selection Functions.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import random
import splinter
from splinter import Browser
import urllib.request
import pandas as pd
from collections import OrderedDict
import time
from bs4 import BeautifulSoup
import numpy as np
# Obtain Full List of Map Names to Loop Over
# In[3]:
# Pull every HTML table from the three expansion-specific ladder-map pages.
# pd.read_html fetches the page and returns a list of DataFrames, one per
# <table> element it finds.
ladder_map_urls = [
    'https://liquipedia.net/starcraft2/Maps/Ladder_Maps/Legacy_of_the_Void',
    'https://liquipedia.net/starcraft2/Maps/Ladder_Maps/Heart_of_the_Swarm',
    'https://liquipedia.net/starcraft2/Maps/Ladder_Maps/Wings_of_Liberty',
]
url1, url2, url3 = ladder_map_urls
tables1, tables2, tables3 = (pd.read_html(u) for u in ladder_map_urls)
# In[7]:
# Extract the map-name column from the relevant ladder-map table on each page
# as a plain list of strings. The table indices (7, 10, 10) were determined by
# inspecting the lists returned by pd.read_html for each page.
# (The original cell first bound LotV_ladder_maps to a whole DataFrame and
# displayed it, then immediately overwrote it — that exploratory dead code is
# removed here.)
LotV_ladder_maps = tables1[7]['Name'].tolist()
HotS_ladder_maps = tables2[10]['Name'].tolist()
WoL_ladder_maps = tables3[10]['Name'].tolist()
# In[10]:
# Combine the three expansion lists into one, drop duplicates while keeping
# first-seen order, and convert the names to their URL form
# (spaces -> underscores) for use in Liquipedia page addresses.
All_ladder_maps = LotV_ladder_maps + HotS_ladder_maps + WoL_ladder_maps
len(All_ladder_maps)
# OrderedDict keys act as an insertion-ordered set for de-duplication.
_seen = OrderedDict()
for _map_name in All_ladder_maps:
    _seen[_map_name] = None
All_ladder_maps_unique = list(_seen)
len(All_ladder_maps_unique)
All_ladder_maps_clean = [_map_name.replace(" ", "_") for _map_name in All_ladder_maps_unique]
All_ladder_maps_clean[-5:]
# Loop Through Each Map Page on Liquipedia, collecting Matchup Statistics and Saving Map Images
# In[13]:
# Launch a visible (non-headless) Chrome instance driven by splinter.
# Raw string for the Windows path: the original 'C:\ChromeSafe\...' literal
# contains the invalid escape sequence '\C' (a SyntaxWarning on Python 3.12+).
executable_path = {'executable_path': r'C:\ChromeSafe\chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
# In[15]:
partial_url = "https://liquipedia.net/starcraft2/"
mapdictlist = []
mapdict = {}


def _scrape_stat(soup, css_class, cast=str):
    """Return the text of the first <td> with `css_class`, cast via `cast`.

    Liquipedia omits the stats cells for matchups with no recorded games, so
    `find` can return None (AttributeError) or the text may not parse as an
    int (ValueError); keep the original 'NOPE' sentinel for those cases so the
    downstream cleaning code continues to work.
    """
    try:
        return cast(soup.find("td", class_=css_class).get_text())
    except (AttributeError, ValueError):
        return 'NOPE'


def _scrape_map_size(soup):
    """Return the infobox 'Size:' value (e.g. '148x156'), or 'NOPE' if absent."""
    try:
        label = soup.find('div', attrs={'class': 'infobox-cell-2'}, text="Size:")
        return label.parent.findChildren()[1].get_text()
    except (AttributeError, IndexError):
        return 'NOPE'


# NOTE(review): the [10:] slice skips the first ten maps — presumably a
# restart after a partial run; confirm before re-running from scratch.
for item in All_ladder_maps_clean[10:]:
    full_url = partial_url + item
    browser.visit(full_url)
    time.sleep(1)  # let the page render and be polite to the server
    soup = BeautifulSoup(browser.html, 'html.parser')

    # Per-matchup game count (int) and winrate string, then map size and URL;
    # column order must match the DataFrame construction below.
    mapdict[item] = [
        _scrape_stat(soup, 'stats-tvz-1', int),
        _scrape_stat(soup, 'stats-tvz-4'),
        _scrape_stat(soup, 'stats-zvp-1', int),
        _scrape_stat(soup, 'stats-zvp-4'),
        _scrape_stat(soup, 'stats-pvt-1', int),
        _scrape_stat(soup, 'stats-pvt-4'),
        _scrape_map_size(soup),
        full_url,
    ]

    # __________________________Get and Save Map Images___________________
    # Click through the thumbnail to the full-resolution image page, then
    # download the image file to disk.
    try:
        browser.click_link_by_partial_href("/starcraft2/File:")
        time.sleep(1)
        try:
            browser.click_link_by_partial_href("/commons/images")
            time.sleep(1)
            soup2 = BeautifulSoup(browser.html, 'html.parser')
            img_link = soup2.img.attrs['src']
            urllib.request.urlretrieve(
                img_link,
                f"C:/Users/donis/Desktop/American Thunder/MapDemo/{item}.jpg",
            )
        except Exception:
            # Best-effort: a missing image should not abort the whole scrape.
            print(f"couldn't get map image for {item}")
    except Exception:
        print(f"Couldn't find href for {item}")
# In[11]:
# Inspect the scraped results, then persist them as a DataFrame / CSV.
mapdict
# One row per map; column order must match the append order in the scrape loop.
stat_columns = ['tvz count', 'tvz winrate', 'zvp count', 'zvp winrate',
                'pvt count', 'pvt winrate', 'map size', 'link']
test1 = pd.DataFrame.from_dict(mapdict, orient='index', columns=stat_columns)
test1
test1.to_csv("C:/Users/donis/Desktop/American Thunder League/MapCSV/full_list.csv")
# I applied a minor correction to the csv file. Just easier that way. Like one of the map names had a * next to it.
#
# So next, I import the slightly corrected csv file as a new dataframe
# In[16]:
# Re-import the hand-corrected CSV; the unnamed index column holds the map
# names, so rename it to "Map".
first_guy = pd.read_csv("MapCSV/full_list.csv")
first_guy.head()
full_maps_df = first_guy.rename(columns={"Unnamed: 0": "Map"})
full_maps_df.head(40)
# In[17]:
# Winrates arrive as strings like "52.3%", or as the placeholders 'NOPE'
# (stats cell missing) and '-' (no games). Strip the %, map placeholders to
# NaN, and convert to float.
def _pct_to_float(col):
    """Convert a Series of '##.#%' strings to float, with NaN for placeholders."""
    return (col.str.replace('%', '', regex=False)
               .replace({'NOPE': np.nan, '-': np.nan})
               .astype(float))


# Fix: the original handled '-' for zvp/pvt but not tvz, so a '-' in the tvz
# column would have raised ValueError on astype(float).
for _rate_col in ('tvz winrate', 'zvp winrate', 'pvt winrate'):
    full_maps_df[_rate_col] = _pct_to_float(full_maps_df[_rate_col])
full_maps_df
# In[39]:
# Row 117 was missing its size on Liquipedia; patch it by hand, then turn the
# remaining 'NOPE' sentinels into NaN.
full_maps_df.at[117, 'map size'] = '140x140'
full_maps_df2 = full_maps_df.replace("NOPE", np.nan)
# Parse 'LLLxWWW' into numeric columns by splitting on the 'x'. This handles
# sizes with any digit count — the original fixed slices str[0:3]/str[4:7]
# broke on anything that was not exactly 3x3 digits (e.g. '96x96').
_size_parts = full_maps_df2['map size'].str.split('x', expand=True)
full_maps_df2['length'] = _size_parts[0].astype(int)
full_maps_df2['width'] = _size_parts[1].astype(int)
full_maps_df2['total size'] = full_maps_df2['length'] * full_maps_df2['width']
full_maps_df2.tail(10)
# In[40]:
# Hand-patch two rows whose size/name were scraped incorrectly.
full_maps_df2.at[96, 'map size'] = '148x156'
full_maps_df2.at[102, 'map size'] = '172x124'
full_maps_df2.at[102, 'Map'] = "Newkirk_Precinct"
# Fix: these patches happen AFTER length/width/total size were derived, so the
# original wrote stale derived values for rows 96 and 102 to the CSV.
# Recompute them from the corrected sizes before saving.
for _row in (96, 102):
    _length, _width = (int(p) for p in full_maps_df2.at[_row, 'map size'].split('x'))
    full_maps_df2.at[_row, 'length'] = _length
    full_maps_df2.at[_row, 'width'] = _width
    full_maps_df2.at[_row, 'total size'] = _length * _width
full_maps_df2.to_csv("C:/Users/donis/Desktop/American Thunder League/MapCSV/true_full_list.csv")
# In[32]:
# Maps missing any stat become "wildcards". The rest are filtered per matchup:
# flag maps with a small game sample, and keep as "balanced" those whose
# winrate falls strictly inside the 46-54% window, dropping the other
# matchups' columns from each per-matchup pool.
wildcard_maps = full_maps_df2[full_maps_df2.isnull().any(axis=1)]
wildcard_maps.tail(10)

maps_with_some_games = full_maps_df2.dropna()


def _low_count(count_col):
    """Rows with 30 or fewer recorded games in `count_col`."""
    return maps_with_some_games.loc[maps_with_some_games[count_col].astype(int) <= 30]


def _balanced(rate_col, drop_cols):
    """Rows whose `rate_col` winrate is strictly between 46 and 54 percent."""
    rates = maps_with_some_games[rate_col]
    return maps_with_some_games.loc[(rates > 46) & (rates < 54)].drop(columns=drop_cols)


tvz_maps_low_count = _low_count('tvz count')
zvp_maps_low_count = _low_count('zvp count')
pvt_maps_low_count = _low_count('pvt count')
tvz_maps_balanced = _balanced('tvz winrate', ['zvp count', 'zvp winrate', 'pvt count', 'pvt winrate'])
zvp_maps_balanced = _balanced('zvp winrate', ['tvz count', 'tvz winrate', 'pvt count', 'pvt winrate'])
pvt_maps_balanced = _balanced('pvt winrate', ['zvp count', 'zvp winrate', 'tvz count', 'tvz winrate'])
len(tvz_maps_balanced), len(zvp_maps_balanced), len(pvt_maps_balanced)
# In[ ]: