Cyclistic - Bike Share Analysis

Preparing data

  1. Data localization and credibility - LINK License -> LINK

  2. Data organization - CSV file

  3. Data issues - Data has blank lines in the columns with informaction about start and end starions. In this analysis this data will be unnecessery and I decide to left records with blank locations.

Setting up environment

Notes: Setting up R environment by loading the PACKAGES and LIBRARIES

#install.packages("tidyverse")
#install.packages("skimr")
#install.packages("janitor")
#install.packages("lubridate")

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(skimr)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(readxl)

Processing data

  1. Converting data form CSV to EXCEL
  2. Importing data
trip03_22 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202203-divvy-tripdata.xlsx")
trip02_22 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202202-divvy-tripdata.xlsx")
trip01_22 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202201-divvy-tripdata.xlsx")

trip12_21 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202112-divvy-tripdata.xlsx")
trip11_21 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202111-divvy-tripdata.xlsx")
trip10_21 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202110-divvy-tripdata.xlsx")

trip09_21 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202109-divvy-tripdata.xlsx")
trip08_21 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202108-divvy-tripdata.xlsx")
trip07_21 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202107-divvy-tripdata.xlsx")

trip06_21 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202106-divvy-tripdata.xlsx")
trip05_21 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202105-divvy-tripdata.xlsx")
trip04_21 <- read_excel("C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/202104-divvy-tripdata.xlsx")

Wrangling data and combining into a single dataframe

  1. Compering column names
colnames(trip03_22)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip02_22)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip01_22)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip12_21)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip11_21)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip10_21)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip09_21)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip08_21)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip07_21)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip06_21)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip05_21)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(trip04_21)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
  1. Checking dataframes
str(trip03_22)
## tibble [284,042 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:284042] "47EC0A7F82E65D52" "8494861979B0F477" "EFE527AF80B66109" "9F446FD9DEE3F389" ...
##  $ rideable_type     : chr [1:284042] "classic_bike" "electric_bike" "classic_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:284042], format: "2022-03-21 13:45:01" "2022-03-16 09:37:16" ...
##  $ ended_at          : POSIXct[1:284042], format: "2022-03-21 13:51:18" "2022-03-16 09:43:34" ...
##  $ start_station_name: chr [1:284042] "Wabash Ave & Wacker Pl" "Michigan Ave & Oak St" "Broadway & Berwyn Ave" "Wabash Ave & Wacker Pl" ...
##  $ start_station_id  : chr [1:284042] "TA1307000131" "13042" "13109" "TA1307000131" ...
##  $ end_station_name  : chr [1:284042] "Kingsbury St & Kinzie St" "Orleans St & Chestnut St (NEXT Apts)" "Broadway & Ridge Ave" "Franklin St & Jackson Blvd" ...
##  $ end_station_id    : chr [1:284042] "KA1503000043" "620" "15578" "TA1305000025" ...
##  $ start_lat         : num [1:284042] 41.9 41.9 42 41.9 41.9 ...
##  $ start_lng         : num [1:284042] -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ end_lat           : num [1:284042] 41.9 41.9 42 41.9 41.9 ...
##  $ end_lng           : num [1:284042] -87.6 -87.6 -87.7 -87.6 -87.7 ...
##  $ member_casual     : chr [1:284042] "member" "member" "member" "member" ...
str(trip02_22)
## tibble [115,609 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:115609] "E1E065E7ED285C02" "1602DCDC5B30FFE3" "BE7DD2AF4B55C4AF" "A1789BDF844412BE" ...
##  $ rideable_type     : chr [1:115609] "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:115609], format: "2022-02-19 18:08:41" "2022-02-20 17:41:30" ...
##  $ ended_at          : POSIXct[1:115609], format: "2022-02-19 18:23:56" "2022-02-20 17:45:56" ...
##  $ start_station_name: chr [1:115609] "State St & Randolph St" "Halsted St & Wrightwood Ave" "State St & Randolph St" "Southport Ave & Waveland Ave" ...
##  $ start_station_id  : chr [1:115609] "TA1305000029" "TA1309000061" "TA1305000029" "13235" ...
##  $ end_station_name  : chr [1:115609] "Clark St & Lincoln Ave" "Southport Ave & Wrightwood Ave" "Canal St & Adams St" "Broadway & Sheridan Rd" ...
##  $ end_station_id    : chr [1:115609] "13179" "TA1307000113" "13011" "13323" ...
##  $ start_lat         : num [1:115609] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:115609] -87.6 -87.6 -87.6 -87.7 -87.6 ...
##  $ end_lat           : num [1:115609] 41.9 41.9 41.9 42 41.9 ...
##  $ end_lng           : num [1:115609] -87.6 -87.7 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr [1:115609] "member" "member" "member" "member" ...
str(trip01_22)
## tibble [103,770 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:103770] "C2F7DD78E82EC875" "A6CF8980A652D272" "BD0F91DFF741C66D" "CBB80ED419105406" ...
##  $ rideable_type     : chr [1:103770] "electric_bike" "electric_bike" "classic_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:103770], format: "2022-01-13 11:59:47" "2022-01-10 08:41:56" ...
##  $ ended_at          : POSIXct[1:103770], format: "2022-01-13 12:02:44" "2022-01-10 08:46:17" ...
##  $ start_station_name: chr [1:103770] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Sheffield Ave & Fullerton Ave" "Clark St & Bryn Mawr Ave" ...
##  $ start_station_id  : chr [1:103770] "525" "525" "TA1306000016" "KA1504000151" ...
##  $ end_station_name  : chr [1:103770] "Clark St & Touhy Ave" "Clark St & Touhy Ave" "Greenview Ave & Fullerton Ave" "Paulina St & Montrose Ave" ...
##  $ end_station_id    : chr [1:103770] "RP-007" "RP-007" "TA1307000001" "TA1309000021" ...
##  $ start_lat         : num [1:103770] 42 42 41.9 42 41.9 ...
##  $ start_lng         : num [1:103770] -87.7 -87.7 -87.7 -87.7 -87.6 ...
##  $ end_lat           : num [1:103770] 42 42 41.9 42 41.9 ...
##  $ end_lng           : num [1:103770] -87.7 -87.7 -87.7 -87.7 -87.6 ...
##  $ member_casual     : chr [1:103770] "casual" "casual" "member" "casual" ...
str(trip12_21)
## tibble [247,540 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:247540] "46F8167220E4431F" "73A77762838B32FD" "4CF42452054F59C5" "3278BA87BF698339" ...
##  $ rideable_type     : chr [1:247540] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:247540], format: "2021-12-07 15:06:07" "2021-12-11 03:43:29" ...
##  $ ended_at          : POSIXct[1:247540], format: "2021-12-07 15:13:42" "2021-12-11 04:10:23" ...
##  $ start_station_name: chr [1:247540] "Laflin St & Cullerton St" "LaSalle Dr & Huron St" "Halsted St & North Branch St" "Halsted St & North Branch St" ...
##  $ start_station_id  : chr [1:247540] "13307" "KP1705001026" "KA1504000117" "KA1504000117" ...
##  $ end_station_name  : chr [1:247540] "Morgan St & Polk St" "Clarendon Ave & Leland Ave" "Broadway & Barry Ave" "LaSalle Dr & Huron St" ...
##  $ end_station_id    : chr [1:247540] "TA1307000130" "TA1307000119" "13137" "KP1705001026" ...
##  $ start_lat         : num [1:247540] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:247540] -87.7 -87.6 -87.6 -87.6 -87.7 ...
##  $ end_lat           : num [1:247540] 41.9 42 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:247540] -87.7 -87.7 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr [1:247540] "member" "casual" "member" "member" ...
str(trip11_21)
## tibble [359,978 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:359978] "7C00A93E10556E47" "90854840DFD508BA" "0A7D10CDD144061C" "2F3BE33085BCFF02" ...
##  $ rideable_type     : chr [1:359978] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:359978], format: "2021-11-27 13:27:38" "2021-11-27 13:38:25" ...
##  $ ended_at          : POSIXct[1:359978], format: "2021-11-27 13:46:38" "2021-11-27 13:56:10" ...
##  $ start_station_name: chr [1:359978] NA NA NA NA ...
##  $ start_station_id  : chr [1:359978] NA NA NA NA ...
##  $ end_station_name  : chr [1:359978] NA NA NA NA ...
##  $ end_station_id    : chr [1:359978] NA NA NA NA ...
##  $ start_lat         : num [1:359978] 41.9 42 42 41.9 41.9 ...
##  $ start_lng         : num [1:359978] -87.7 -87.7 -87.7 -87.8 -87.6 ...
##  $ end_lat           : num [1:359978] 42 41.9 42 41.9 41.9 ...
##  $ end_lng           : num [1:359978] -87.7 -87.7 -87.7 -87.8 -87.6 ...
##  $ member_casual     : chr [1:359978] "casual" "casual" "casual" "casual" ...
str(trip10_21)
## tibble [631,226 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:631226] "620BC6107255BF4C" "4471C70731AB2E45" "26CA69D43D15EE14" "362947F0437E1514" ...
##  $ rideable_type     : chr [1:631226] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:631226], format: "2021-10-22 12:46:42" "2021-10-21 09:12:37" ...
##  $ ended_at          : POSIXct[1:631226], format: "2021-10-22 12:49:50" "2021-10-21 09:14:14" ...
##  $ start_station_name: chr [1:631226] "Kingsbury St & Kinzie St" NA NA NA ...
##  $ start_station_id  : chr [1:631226] "KA1503000043" NA NA NA ...
##  $ end_station_name  : chr [1:631226] NA NA NA NA ...
##  $ end_station_id    : chr [1:631226] NA NA NA NA ...
##  $ start_lat         : num [1:631226] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:631226] -87.6 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:631226] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:631226] -87.6 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr [1:631226] "member" "member" "member" "member" ...
str(trip09_21)
## tibble [756,147 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:756147] "9DC7B962304CBFD8" "F930E2C6872D6B32" "6EF72137900BB910" "78D1DE133B3DBF55" ...
##  $ rideable_type     : chr [1:756147] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:756147], format: "2021-09-28 16:07:10" "2021-09-28 14:24:51" ...
##  $ ended_at          : POSIXct[1:756147], format: "2021-09-28 16:09:54" "2021-09-28 14:40:05" ...
##  $ start_station_name: chr [1:756147] NA NA NA NA ...
##  $ start_station_id  : chr [1:756147] NA NA NA NA ...
##  $ end_station_name  : chr [1:756147] NA NA NA NA ...
##  $ end_station_id    : chr [1:756147] NA NA NA NA ...
##  $ start_lat         : num [1:756147] 41.9 41.9 41.8 41.8 41.9 ...
##  $ start_lng         : num [1:756147] -87.7 -87.6 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:756147] 41.9 42 41.8 41.8 41.9 ...
##  $ end_lng           : num [1:756147] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr [1:756147] "casual" "casual" "casual" "casual" ...
str(trip08_21)
## tibble [804,352 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:804352] "99103BB87CC6C1BB" "EAFCCCFB0A3FC5A1" "9EF4F46C57AD234D" "5834D3208BFAF1DA" ...
##  $ rideable_type     : chr [1:804352] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:804352], format: "2021-08-10 17:15:49" "2021-08-10 17:23:14" ...
##  $ ended_at          : POSIXct[1:804352], format: "2021-08-10 17:22:44" "2021-08-10 17:39:24" ...
##  $ start_station_name: chr [1:804352] NA NA NA NA ...
##  $ start_station_id  : chr [1:804352] NA NA NA NA ...
##  $ end_station_name  : chr [1:804352] NA NA NA NA ...
##  $ end_station_id    : chr [1:804352] NA NA NA NA ...
##  $ start_lat         : num [1:804352] 41.8 41.8 42 42 41.8 ...
##  $ start_lng         : num [1:804352] -87.7 -87.7 -87.7 -87.7 -87.6 ...
##  $ end_lat           : num [1:804352] 41.8 41.8 42 42 41.8 ...
##  $ end_lng           : num [1:804352] -87.7 -87.6 -87.7 -87.7 -87.6 ...
##  $ member_casual     : chr [1:804352] "member" "member" "member" "member" ...
str(trip07_21)
## tibble [822,410 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:822410] "0A1B623926EF4E16" "B2D5583A5A5E76EE" "6F264597DDBF427A" "379B58EAB20E8AA5" ...
##  $ rideable_type     : chr [1:822410] "docked_bike" "classic_bike" "classic_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:822410], format: "2021-07-02 14:44:36" "2021-07-07 16:57:42" ...
##  $ ended_at          : POSIXct[1:822410], format: "2021-07-02 15:19:58" "2021-07-07 17:16:09" ...
##  $ start_station_name: chr [1:822410] "Michigan Ave & Washington St" "California Ave & Cortez St" "Wabash Ave & 16th St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr [1:822410] "13001" "17660" "SL-012" "17660" ...
##  $ end_station_name  : chr [1:822410] "Halsted St & North Branch St" "Wood St & Hubbard St" "Rush St & Hubbard St" "Carpenter St & Huron St" ...
##  $ end_station_id    : chr [1:822410] "KA1504000117" "13432" "KA1503000044" "13196" ...
##  $ start_lat         : num [1:822410] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:822410] -87.6 -87.7 -87.6 -87.7 -87.7 ...
##  $ end_lat           : num [1:822410] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:822410] -87.6 -87.7 -87.6 -87.7 -87.7 ...
##  $ member_casual     : chr [1:822410] "casual" "casual" "member" "member" ...
str(trip06_21)
## tibble [729,595 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:729595] "99FEC93BA843FB20" "06048DCFC8520CAF" "9598066F68045DF2" "B03C0FE48C412214" ...
##  $ rideable_type     : chr [1:729595] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:729595], format: "2021-06-13 14:31:28" "2021-06-04 11:18:02" ...
##  $ ended_at          : POSIXct[1:729595], format: "2021-06-13 14:34:11" "2021-06-04 11:24:19" ...
##  $ start_station_name: chr [1:729595] NA NA NA NA ...
##  $ start_station_id  : chr [1:729595] NA NA NA NA ...
##  $ end_station_name  : chr [1:729595] NA NA NA NA ...
##  $ end_station_id    : chr [1:729595] NA NA NA NA ...
##  $ start_lat         : num [1:729595] 41.8 41.8 41.8 41.8 41.8 ...
##  $ start_lng         : num [1:729595] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num [1:729595] 41.8 41.8 41.8 41.8 41.8 ...
##  $ end_lng           : num [1:729595] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr [1:729595] "member" "member" "member" "member" ...
str(trip05_21)
## tibble [531,633 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:531633] "C809ED75D6160B2A" "DD59FDCE0ACACAF3" "0AB83CB88C43EFC2" "7881AC6D39110C60" ...
##  $ rideable_type     : chr [1:531633] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:531633], format: "2021-05-30 11:58:15" "2021-05-30 11:29:14" ...
##  $ ended_at          : POSIXct[1:531633], format: "2021-05-30 12:10:39" "2021-05-30 12:14:09" ...
##  $ start_station_name: chr [1:531633] NA NA NA NA ...
##  $ start_station_id  : chr [1:531633] NA NA NA NA ...
##  $ end_station_name  : chr [1:531633] NA NA NA NA ...
##  $ end_station_id    : chr [1:531633] NA NA NA NA ...
##  $ start_lat         : num [1:531633] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:531633] -87.6 -87.6 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:531633] 41.9 41.8 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:531633] -87.6 -87.6 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr [1:531633] "casual" "casual" "casual" "casual" ...
str(trip04_21)
## tibble [337,230 x 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:337230] "6C992BD37A98A63F" "1E0145613A209000" "E498E15508A80BAD" "1887262AD101C604" ...
##  $ rideable_type     : chr [1:337230] "classic_bike" "docked_bike" "docked_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:337230], format: "2021-04-12 18:25:36" "2021-04-27 17:27:11" ...
##  $ ended_at          : POSIXct[1:337230], format: "2021-04-12 18:56:55" "2021-04-27 18:31:29" ...
##  $ start_station_name: chr [1:337230] "State St & Pearson St" "Dorchester Ave & 49th St" "Loomis Blvd & 84th St" "Honore St & Division St" ...
##  $ start_station_id  : chr [1:337230] "TA1307000061" "KA1503000069" "20121" "TA1305000034" ...
##  $ end_station_name  : chr [1:337230] "Southport Ave & Waveland Ave" "Dorchester Ave & 49th St" "Loomis Blvd & 84th St" "Southport Ave & Waveland Ave" ...
##  $ end_station_id    : chr [1:337230] "13235" "KA1503000069" "20121" "13235" ...
##  $ start_lat         : num [1:337230] 41.9 41.8 41.7 41.9 41.7 ...
##  $ start_lng         : num [1:337230] -87.6 -87.6 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:337230] 41.9 41.8 41.7 41.9 41.7 ...
##  $ end_lng           : num [1:337230] -87.7 -87.6 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr [1:337230] "member" "casual" "casual" "member" ...
  1. Combining dataframes into single file
all_trips <- bind_rows(trip03_22, trip02_22, trip01_22, trip12_21, trip11_21, trip10_21, trip09_21, trip08_21, trip07_21, trip06_21, trip05_21, trip04_21)

Cleaning up and adding data

  1. Checking new dataframe
colnames(all_trips)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
nrow(all_trips)
## [1] 5723532
dim(all_trips)
## [1] 5723532      13
head(all_trips)
## # A tibble: 6 x 13
##   ride_id rideable_type started_at          ended_at            start_station_n~
##   <chr>   <chr>         <dttm>              <dttm>              <chr>           
## 1 47EC0A~ classic_bike  2022-03-21 13:45:01 2022-03-21 13:51:18 Wabash Ave & Wa~
## 2 849486~ electric_bike 2022-03-16 09:37:16 2022-03-16 09:43:34 Michigan Ave & ~
## 3 EFE527~ classic_bike  2022-03-23 19:52:02 2022-03-23 19:54:48 Broadway & Berw~
## 4 9F446F~ classic_bike  2022-03-01 19:12:26 2022-03-01 19:22:14 Wabash Ave & Wa~
## 5 431128~ classic_bike  2022-03-21 18:37:01 2022-03-21 19:19:11 DuSable Lake Sh~
## 6 9AA8A1~ classic_bike  2022-03-07 17:10:22 2022-03-07 17:15:04 Bissell St & Ar~
## # ... with 8 more variables: start_station_id <chr>, end_station_name <chr>,
## #   end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, member_casual <chr>
summary(all_trips)
##    ride_id          rideable_type        started_at                 
##  Length:5723532     Length:5723532     Min.   :2021-04-01 00:03:18  
##  Class :character   Class :character   1st Qu.:2021-06-22 15:20:26  
##  Mode  :character   Mode  :character   Median :2021-08-17 18:25:49  
##                                        Mean   :2021-08-26 22:25:18  
##                                        3rd Qu.:2021-10-14 19:48:10  
##                                        Max.   :2022-03-31 23:59:47  
##                                                                     
##     ended_at                   start_station_name start_station_id  
##  Min.   :2021-04-01 00:14:29   Length:5723532     Length:5723532    
##  1st Qu.:2021-06-22 15:47:37   Class :character   Class :character  
##  Median :2021-08-17 18:44:32   Mode  :character   Mode  :character  
##  Mean   :2021-08-26 22:46:50                                        
##  3rd Qu.:2021-10-14 20:03:28                                        
##  Max.   :2022-04-01 22:10:12                                        
##                                                                     
##  end_station_name   end_station_id       start_lat       start_lng     
##  Length:5723532     Length:5723532     Min.   :41.64   Min.   :-87.84  
##  Class :character   Class :character   1st Qu.:41.88   1st Qu.:-87.66  
##  Mode  :character   Mode  :character   Median :41.90   Median :-87.64  
##                                        Mean   :41.90   Mean   :-87.65  
##                                        3rd Qu.:41.93   3rd Qu.:-87.63  
##                                        Max.   :45.64   Max.   :-73.80  
##                                                                        
##     end_lat         end_lng       member_casual     
##  Min.   :41.39   Min.   :-88.97   Length:5723532    
##  1st Qu.:41.88   1st Qu.:-87.66   Class :character  
##  Median :41.90   Median :-87.64   Mode  :character  
##  Mean   :41.90   Mean   :-87.65                     
##  3rd Qu.:41.93   3rd Qu.:-87.63                     
##  Max.   :42.17   Max.   :-87.49                     
##  NA's   :4716    NA's   :4716
  1. Adding new columns
  • Date
all_trips$date <- as.Date(all_trips$started_at) 
  • Year
all_trips$year <- format(as.Date(all_trips$date), "%Y") 
  • Month
all_trips$month <- format(as.Date(all_trips$date), "%m")
  • Year_Month
all_trips$year_month <- paste(all_trips$year,"_",all_trips$month)
  • day
all_trips$day <- format(as.Date(all_trips$date), "%d")
  • Day of Week
all_trips$day_of_week <- format(as.Date(all_trips$date),"%A")
  • Part of week
all_trips <- all_trips %>% 
  mutate(part_of_week = case_when(day_of_week == "Monday" ~ "Workday",
                                  day_of_week == "Tuesday" ~ "Workday",
                                  day_of_week == "Wednesday" ~ "Workday",
                                  day_of_week == "Thursday" ~ "Workday",
                                  day_of_week == "Friday" ~ "Workday",
                                  day_of_week == "Saturday" ~ "Weekend",
                                  day_of_week == "Sunday" ~ "Weekend"))
  • Day Hour
all_trips$day_hour <- format(as.POSIXct(all_trips$started_at), format = "%H")
  • Ride Length in secounds convert into minutes
all_trips$ride_length_s <- difftime(all_trips$ended_at, all_trips$started_at)
all_trips$ride_length_s <- as.numeric(as.character(all_trips$ride_length_s))
is.numeric(all_trips$ride_length_s)
## [1] TRUE
all_trips <- all_trips %>% 
  mutate(ride_length_m = ride_length_s/60)
all_trips$ride_length_m <- round(all_trips$ride_length_m, digits = 0)
  • Ride numbers
all_trips$ride_number <- rep(c(1),5723532)
is.numeric(all_trips$ride_number)
## [1] TRUE
  1. Chacking new data frame
View(all_trips)
  1. Cleaning data
  • Removing “bad” data
all_trips2 <- all_trips[!(all_trips$ride_length_s < 0),]
nrow(all_trips2)-nrow(all_trips)
## [1] -145
  • Removing duplicaties by column ride_id
all_trips2$ride_id[duplicated(all_trips2$ride_id)]
## character(0)

Conducting descriptive analysis

summary(all_trips2$ride_length_m)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     7.00    12.00    21.54    21.00 55944.00
mean(all_trips2$ride_length_m)
## [1] 21.54192
median(all_trips2$ride_length_m)
## [1] 12
min(all_trips2$ride_length_m)
## [1] 0
max(all_trips2$ride_length_m)
## [1] 55944
aggregate(all_trips2$ride_length_m ~ all_trips2$member_casual, FUN=mean)
##   all_trips2$member_casual all_trips2$ride_length_m
## 1                   casual                 31.73913
## 2                   member                 13.36824
aggregate(all_trips2$ride_length_m ~ all_trips2$member_casual, FUN=median)
##   all_trips2$member_casual all_trips2$ride_length_m
## 1                   casual                       16
## 2                   member                        9
aggregate(all_trips2$ride_length_m ~ all_trips2$member_casual, FUN=min)
##   all_trips2$member_casual all_trips2$ride_length_m
## 1                   casual                        0
## 2                   member                        0
aggregate(all_trips2$ride_length_m ~ all_trips2$member_casual, FUN=max)
##   all_trips2$member_casual all_trips2$ride_length_m
## 1                   casual                    55944
## 2                   member                     1560
all_trips2$day_of_week <- ordered(all_trips2$day_of_week, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
aggregate(all_trips2$ride_length_m~all_trips2$member_casual+all_trips2$day_of_week, FUN = mean)
##    all_trips2$member_casual all_trips2$day_of_week all_trips2$ride_length_m
## 1                    casual                 Monday                 31.48139
## 2                    member                 Monday                 12.96697
## 3                    casual                Tuesday                 27.43349
## 4                    member                Tuesday                 12.52101
## 5                    casual              Wednesday                 27.76503
## 6                    member              Wednesday                 12.58763
## 7                    casual               Thursday                 27.87952
## 8                    member               Thursday                 12.56968
## 9                    casual                 Friday                 30.10219
## 10                   member                 Friday                 13.13726
## 11                   casual               Saturday                 34.27940
## 12                   member               Saturday                 14.99226
## 13                   casual                 Sunday                 37.40623
## 14                   member                 Sunday                 15.34271

Checking diffreneces between casual and mamber user

  1. Number of rides by User Type
all_trips2 %>% 
  group_by(member_casual) %>% 
  summarise(number_of_ridetrips = n())
## # A tibble: 2 x 2
##   member_casual number_of_ridetrips
##   <chr>                       <int>
## 1 casual                    2546482
## 2 member                    3176905
  1. Number of rides by User Type by Bike Type
all_trips2 %>% 
  group_by(member_casual, rideable_type) %>% 
  summarise(number_of_ridetrips = n())
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 5 x 3
## # Groups:   member_casual [2]
##   member_casual rideable_type number_of_ridetrips
##   <chr>         <chr>                       <int>
## 1 casual        classic_bike              1257615
## 2 casual        docked_bike                303984
## 3 casual        electric_bike              984883
## 4 member        classic_bike              1992992
## 5 member        electric_bike             1183913
  1. Numer of rides by User Type by Weekday
all_trips2 %>% 
  group_by(member_casual, day_of_week) %>% 
  summarise(number_of_ridetrips = n())
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 14 x 3
## # Groups:   member_casual [2]
##    member_casual day_of_week number_of_ridetrips
##    <chr>         <ord>                     <int>
##  1 casual        Monday                   292993
##  2 casual        Tuesday                  276371
##  3 casual        Wednesday                286400
##  4 casual        Thursday                 293632
##  5 casual        Friday                   364277
##  6 casual        Saturday                 550008
##  7 casual        Sunday                   482801
##  8 member        Monday                   439428
##  9 member        Tuesday                  490095
## 10 member        Wednesday                499901
## 11 member        Thursday                 475330
## 12 member        Friday                   453108
## 13 member        Saturday                 431326
## 14 member        Sunday                   387717

Including Plots

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## `summarise()` has grouped output by 'part_of_week', 'member_casual'. You can
## override using the `.groups` argument.

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Saving data into CSV for next steps

all_trips3 <- select(all_trips2, -c(started_at, ended_at, start_station_id, end_station_id, ride_length_s, end_lat, end_lng, date, ride_id))

write.csv(all_trips3,"C:/Users/kchud/NewOne/PORTFOLIO/Cyclistic_Bikeshare/data/excel/Cyclistic_new.csv", row.names = TRUE)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.