── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Attaching package: 'flextable'
The following object is masked from 'package:purrr':
compose
# Download the New York Times county-level COVID-19 table.
data <- read_csv(
  file = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
)
Rows: 2502832 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): county, state, fips
dbl (2): cases, deaths
date (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep the raw download under a more descriptive name for the later questions.
covid_data <- data
##Question 2:
# Reference date for the analysis. `text` holds the raw string and `my.date`
# the parsed Date, so the conversion happens exactly once.
text <- "2021-02-01"
my.date <- as.Date(text)
class(my.date)
[1] "Date"
# State under study.
my.state <- "Colorado"

# Per-county daily increments: `cases` and `deaths` are differenced against the
# previous row of the same county after sorting by date.
Colorado_data <- covid_data |>
  filter(state == my.state) |>
  group_by(county) |>
  arrange(date) |>
  mutate(
    new_cases = cases - lag(cases),
    new_deaths = deaths - lag(deaths)
  ) |>
  ungroup()

# Five counties with the largest `cases` value on my.date.
Colorado_data |>
  filter(date == my.date) |>
  slice_max(order_by = cases, n = 5) |>
  select(Date = date, County = county, Cases = cases) |>
  flextable() |>
  set_caption(caption = "Most Total Cases")
Date
County
Cases
2021-02-01
Denver
55,887
2021-02-01
El Paso
47,890
2021-02-01
Adams
46,592
2021-02-01
Arapahoe
45,993
2021-02-01
Jefferson
34,665
# Five counties with the largest `new_cases` value on my.date.
Colorado_data |>
  filter(date == my.date) |>
  select(Date = date, County = county, new_cases) |>
  slice_max(order_by = new_cases, n = 5) |>
  flextable() |>
  set_caption(caption = "Most New Cases")
Rows: 3195 Columns: 67
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): SUMLEV, STATE, COUNTY, STNAME, CTYNAME
dbl (62): REGION, DIVISION, ESTIMATESBASE2020, POPESTIMATE2020, POPESTIMATE2...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 2. Some of the common names are STNAME, CTYNAME, DEATHS2021, with 3144 rows & 18 columns
# range = 737287 - 741 = 736546
# NOTE(review): the max/min printed for POP_data do not match the range quoted
# above -- confirm which table the quoted range was computed on.
max(POP_data$POPESTIMATE2021)
[1] 9809462
# Smallest 2021 population estimate in the census table.
min(POP_data[["POPESTIMATE2021"]])
[1] 54
# Join the census table to the Colorado COVID table on `fips`, keep only the
# my.date rows, and divide the counts by the 2021 population estimate.
#
# drop_na() names the columns to check explicitly; the previous
# na.omit(new_cases, new_deaths) ignored those arguments and dropped rows with
# an NA in *any* column. transmute() replaces a summarise() call that was being
# used as a row-wise transform (one row per county after the date filter).
COVID_Data <- POP_data |>
  inner_join(Colorado_data, by = "fips") |>
  filter(date == my.date) |>
  drop_na(new_cases, new_deaths) |>
  transmute(
    county = CTYNAME,
    cases = cases / POPESTIMATE2021,
    new_cases = new_cases / POPESTIMATE2021,
    new_deaths,
    date
  )

# Five counties with the highest cumulative cases per resident on my.date.
COVID_Data |>
  select(county, cases) |>
  arrange(desc(cases)) |>
  head(n = 5) |>
  flextable() |>
  set_caption(
    caption = "5 Highest Cumulative Covid Cases Per Capita in Colorado Counties"
  )
county
cases
Crowley County
0.3166521
Bent County
0.2416183
Lincoln County
0.1746757
Logan County
0.1727602
Pitkin County
0.1057853
# Five counties with the highest new cases per resident on my.date.
COVID_Data |>
  filter(date == my.date) |>
  select(county, new_cases) |>
  arrange(desc(new_cases)) |>
  head(n = 5) |>
  flextable() |>
  set_caption(
    caption = "5 Highest Counties for New Covid Cases Per Capita in Colorado"
  )
county
new_cases
Lincoln County
0.012424630
Lake County
0.002839757
Grand County
0.002838938
Cheyenne County
0.002326934
Crowley County
0.002266783
##Question 4
# Census table joined to the Colorado COVID table; reused by later questions.
Covid_Population_Data <- POP_data |>
  inner_join(Colorado_data, by = "fips")

# Watch list: new cases per 100,000 residents summed over the 14 days ending on
# my.date, keeping counties at or above 100 and showing the top five.
#
# summarise() now returns exactly one row per county, which removes the
# deprecated multi-row summarise() and the distinct() call that compensated for
# it. The result column is no longer called `lag`, which shadowed dplyr::lag().
Covid_Population_Data |>
  filter(between(date, my.date - 13, my.date)) |>
  summarise(
    cases_per_100k = sum(new_cases / (POPESTIMATE2021 / 100000)),
    .by = CTYNAME
  ) |>
  filter(cases_per_100k >= 100) |>
  slice_max(cases_per_100k, n = 5) |>
  select(County = CTYNAME, Cases = cases_per_100k) |>
  flextable() |>
  set_caption(caption = "Watch List Covid Counties")
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
`summarise()` has grouped output by 'county'. You can override using the
`.groups` argument.
County
Cases
San Miguel County
2,636.139
Lincoln County
2,265.668
Crowley County
1,272.886
Routt County
1,141.352
Grand County
1,085.105
# There are 59 different counties that meet the watchlist criteria
##Question 5
death_date <- as.Date("2022-01-30")
year <- lubridate::year(death_date)

# Share of each county's 2021 deaths (DEATHS2021) that were COVID deaths.
#
# `deaths` is a running total (new_deaths is derived from it as
# deaths - lag(deaths)), so summing it over every day of the year overstates
# the count; the daily increments in `new_deaths` are summed instead.
# DEATHS2021 is repeated on every daily row of a county, so its first value is
# used rather than its sum. One row per county is returned, which also stops
# geom_col() from stacking one bar segment per day.
desired_death_rate <- Covid_Population_Data |>
  filter(date > as.Date("2020-12-31"), date < as.Date("2022-01-01")) |>
  summarise(
    death_rate = 100 * sum(new_deaths, na.rm = TRUE) / first(DEATHS2021),
    .by = county
  ) |>
  filter(death_rate >= 20)
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
`summarise()` has grouped output by 'county'. You can override using the
`.groups` argument.
# Bar chart of death_rate by county for the rows kept in desired_death_rate.
ggplot(desired_death_rate, aes(x = county, y = death_rate)) +
  geom_col(aes(color = county)) +
  labs(
    title = "Counties with 20% or Higher death rate from COVID-19",
    y = "Death_Rate",
    x = "County"
  )
Rows: 3221 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): fips
dbl (2): LON, LAT
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Map of the case-weighted centers over a US state outline.
# - borders() names its outline argument `colour`; passing `color` through
#   `...` is what raised "Duplicated aesthetics after name standardisation".
# - labs() and theme() are now chained onto the plot with `+`; previously they
#   formed a separate expression and were never applied.
# - The theme element is `legend.position`, not `legend_position`.
ggplot(map_data) +
  borders("state", fill = "gray90", colour = "white") +
  geom_point(
    aes(x = wmX_c, y = wmY_c, size = cases),
    color = "red",
    alpha = .25
  ) +
  labs(
    color = "Time",
    size = "Cases",
    x = "",
    y = "",
    title = "Weighted Center of COVID-19 Cases"
  ) +
  theme(legend.position = "none")
Rows: 3221 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): fips
dbl (2): LON, LAT
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Case- and death-weighted mean longitude/latitude for each state.
# The weighted means are computed before `cases` and `deaths` are overwritten
# with their sums, so the order of the summarise() arguments matters.
case_map_data <- county_cen |>
  inner_join(data, by = "fips") |>
  group_by(state) |>
  summarise(
    wmX_c = sum(LON * cases) / sum(cases),
    wmY_c = sum(LAT * cases) / sum(cases),
    wmX_d = sum(LON * deaths) / sum(deaths),
    wmY_d = sum(LAT * deaths) / sum(deaths),
    cases = sum(cases),
    deaths = sum(deaths)
  ) |>
  ungroup()

# Case-weighted centers, point size scaled by summed cases.
# borders() names its outline argument `colour`; passing `color` through `...`
# is what raised "Duplicated aesthetics after name standardisation".
p1 <- ggplot(case_map_data) +
  borders("state", fill = "grey90", colour = "white") +
  geom_point(
    aes(x = wmX_c, y = wmY_c, size = cases),
    color = "red",
    alpha = .25
  ) +
  labs(
    color = "Time",
    x = "Longitude",
    y = "Latitude",
    size = "cases",
    title = " Weighted Covid Cases in United States"
  )
Warning: Duplicated aesthetics after name standardisation: colour
# Death-weighted centers, point size scaled by summed deaths.
# borders() names its outline argument `colour`; passing `color` through `...`
# is what raised "Duplicated aesthetics after name standardisation".
p2 <- ggplot(case_map_data) +
  borders("state", fill = "grey90", colour = "white") +
  geom_point(
    aes(x = wmX_d, y = wmY_d, size = deaths),
    color = "blue",
    alpha = .25
  ) +
  labs(
    color = "Time",
    x = "Longitude",
    y = "Latitude",
    size = "deaths",
    title = " Weighted Covid Deaths in United States"
  )
Warning: Duplicated aesthetics after name standardisation: colour
# Render the two maps one after the other.
print(p1)
print(p2)
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
library(patchwork)
Warning: package 'patchwork' was built under R version 4.4.3
# Place the two maps side by side using patchwork's `|` operator.
p1 | p2
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
# Comparing the two maps, I see no difference. It could be a scale issue, or an issue with my code. It may be an issue with my code, because I did observe that the ratio by state is different in my table when dividing sum(cases) by sum(deaths). This means that states with a lower case/death ratio had more deaths per case than other states. I just don't know why this trend didn't show up on my graphs. Kind of disappointed it didn't show up; it would've been cool to see :/