-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathname.html
More file actions
481 lines (336 loc) · 28.1 KB
/
Copy pathname.html
File metadata and controls
481 lines (336 loc) · 28.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
<!DOCTYPE html>
<!-- saved from url=(0014)about:internet -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<meta http-equiv="x-ua-compatible" content="IE=9" >
<title>Getting and Cleaning Name Data from the U.S. Census Bureau and SSA for More (A)merican Random Sampling</title>
<style type="text/css">
/* Base page typography: small sans-serif text on a white background. */
body, td {
font-family: sans-serif;
background-color: white;
font-size: 12px;
margin: 8px;
}
/* Monospace font stack for inline code and code listings. */
tt, code, pre {
font-family: 'DejaVu Sans Mono', 'Droid Sans Mono', 'Lucida Console', Consolas, Monaco, monospace;
}
/* Heading scale, relative to the base font size. */
h1 {
font-size:2.2em;
}
h2 {
font-size:1.8em;
}
h3 {
font-size:1.4em;
}
h4 {
font-size:1.0em;
}
h5 {
font-size:0.9em;
}
h6 {
font-size:0.8em;
}
/* Visited links are shown in purple. */
a:visited {
color: rgb(50%, 0%, 50%);
}
/* Code listings: bordered box that wraps long lines instead of overflowing. */
pre {
margin-top: 0;
max-width: 95%;
border: 1px solid #ccc;
white-space: pre-wrap;
}
pre code {
display: block; padding: 0.5em;
}
/* Light grey background for listings tagged as R or C++ source. */
code.r, code.cpp {
background-color: #F8F8F8;
}
/* Tables are rendered without borders. */
table, td, th {
border: none;
}
/* Block quotes: grey text with a thick light bar on the left. */
blockquote {
color:#666666;
margin:0;
padding-left: 1em;
border-left: 0.5em #EEE solid;
}
/* Horizontal rules are drawn as a thin dotted top border only. */
hr {
height: 0px;
border-bottom: none;
border-top-width: thin;
border-top-style: dotted;
border-top-color: #999999;
}
/* Print-only overrides. */
@media print {
/* Force black text on a transparent background for every element. */
* {
background: transparent !important;
color: black !important;
filter:none !important;
-ms-filter: none !important;
}
body {
font-size:12pt;
max-width:100%;
}
a, a:visited {
text-decoration: underline;
}
/* In print, a horizontal rule is invisible and starts a new page. */
hr {
visibility: hidden;
page-break-before: always;
}
/* Keep listings, quotes, table rows and images from splitting across pages. */
pre, blockquote {
padding-right: 1em;
page-break-inside: avoid;
}
tr, img {
page-break-inside: avoid;
}
img {
max-width: 100% !important;
}
/* Mirrored margins for left-hand and right-hand pages. */
@page :left {
margin: 15mm 20mm 15mm 10mm;
}
@page :right {
margin: 15mm 10mm 15mm 20mm;
}
/* Require at least three lines of a paragraph/heading at a page break. */
p, h2, h3 {
orphans: 3; widows: 3;
}
/* Do not break the page directly after a section heading. */
h2, h3 {
page-break-after: avoid;
}
}
</style>
<!-- Styles for R syntax highlighter -->
<style type="text/css">
/*
 * Colours for the token classes used inside highlighted <pre> listings.
 * Each selector matches a class name assigned by the language definitions
 * of the syntax highlighter script in this page.
 */
/* Operators and brackets: grey-blue. */
pre .operator,
pre .paren {
color: rgb(104, 118, 135)
}
/* Literal constants: violet. */
pre .literal {
color: rgb(88, 72, 246)
}
/* Numbers: medium blue. */
pre .number {
color: rgb(0, 0, 205);
}
/* Comments: muted green. */
pre .comment {
color: rgb(76, 136, 107);
}
/* Keywords: pure blue. */
pre .keyword {
color: rgb(0, 0, 255);
}
/* Identifiers: black. */
pre .identifier {
color: rgb(0, 0, 0);
}
/* String literals: dark green. */
pre .string {
color: rgb(3, 106, 7);
}
</style>
<!-- R syntax highlighter -->
<script type="text/javascript">
/*
 * Minified syntax highlighter (highlight.js build) with C++ and R language
 * definitions. The code is kept in its minified form; the comments below map
 * the one-letter internal names to their roles.
 *
 * Public API (assigned on `this` near the end of the constructor):
 *   hljs.LANGUAGES               registry of language definitions (name -> {dM: default mode})
 *   hljs.highlight(lang, text)   returns {r: relevance, keyword_count, value: HTML string}
 *   hljs.highlightAuto(text)     tries every language and returns the best result
 *   hljs.fixMarkup(html, tabReplace, useBR)
 *   hljs.highlightBlock(node, tabReplace, useBR)
 *   hljs.initHighlighting()      highlights every <pre> whose first child is <code>
 *   hljs.initHighlightingOnLoad() registers initHighlighting on DOMContentLoaded/load
 *
 * Mode keys used by language definitions: cN = class name, b = begin pattern,
 * e = end pattern, i = illegal pattern, c = contained modes, k = keywords,
 * r = relevance.
 *
 * Fixes relative to the previous text of this block:
 *   1. m() now really escapes "&" and "<" as HTML entities. It previously
 *      replaced each character with itself, so highlighted source containing
 *      "<" or "&" was written to innerHTML unescaped.
 *   2. The R "..." keyword pattern was split by a backslash-newline inside the
 *      string literal (a line continuation), which turned it into ".\.\."
 *      (any character followed by two dots). It is a literal "..." again.
 */
var hljs=new function(){
// Escape "&" and "<" so that text can be embedded in HTML markup.
function m(p){return p.replace(/&/gm,"&amp;").replace(/</gm,"&lt;")}
// Build a RegExp for language r from source q: always multiline, case-insensitive when r.cI is set, global when p is truthy.
function f(r,q,p){return RegExp(q,"m"+(r.cI?"i":"")+(p?"g":""))}
// Return the first CODE child of node r; stops at the first child that is not a text node containing whitespace.
function b(r){for(var p=0;p<r.childNodes.length;p++){var q=r.childNodes[p];if(q.nodeName=="CODE"){return q}if(!(q.nodeType==3&&q.nodeValue.match(/\s+/))){break}}}
// Collect the text of node t; BR elements become "\n". When s is truthy, newlines in t's direct text nodes are dropped.
function h(t,s){var p="";for(var r=0;r<t.childNodes.length;r++){if(t.childNodes[r].nodeType==3){var q=t.childNodes[r].nodeValue;if(s){q=q.replace(/\n/g,"")}p+=q}else{if(t.childNodes[r].nodeName=="BR"){p+="\n"}else{p+=h(t.childNodes[r])}}}if(/MSIE [678]/.test(navigator.userAgent)){p=p.replace(/\r/g,"\n")}return p}
// Pick the language of node s from its own and its parent's class names (an optional "language-" prefix is stripped).
function a(s){var r=s.className.split(/\s+/);r=r.concat(s.parentNode.className.split(/\s+/));for(var q=0;q<r.length;q++){var p=r[q].replace(/^language-/,"");if(e[p]){return p}}}
// Flatten the element children of node q into a list of {event:"start"|"stop", offset, node} records, offsets counted in text characters.
function c(q){var p=[];(function(s,t){for(var r=0;r<s.childNodes.length;r++){if(s.childNodes[r].nodeType==3){t+=s.childNodes[r].nodeValue.length}else{if(s.childNodes[r].nodeName=="BR"){t+=1}else{if(s.childNodes[r].nodeType==1){p.push({event:"start",offset:t,node:s.childNodes[r]});t=arguments.callee(s.childNodes[r],t);p.push({event:"stop",offset:t,node:s.childNodes[r]})}}}}return t})(q,0);return p}
// Merge two event streams (y and w, as produced by c) over text x into one HTML string, re-opening tags so nesting stays valid.
function k(y,w,x){var q=0;var z="";var s=[];function u(){if(y.length&&w.length){if(y[0].offset!=w[0].offset){return(y[0].offset<w[0].offset)?y:w}else{return w[0].event=="start"?y:w}}else{return y.length?y:w}}function t(D){var A="<"+D.nodeName.toLowerCase();for(var B=0;B<D.attributes.length;B++){var C=D.attributes[B];A+=" "+C.nodeName.toLowerCase();if(C.value!==undefined&&C.value!==false&&C.value!==null){A+='="'+m(C.value)+'"'}}return A+">"}while(y.length||w.length){var v=u().splice(0,1)[0];z+=m(x.substr(q,v.offset-q));q=v.offset;if(v.event=="start"){z+=t(v.node);s.push(v.node)}else{if(v.event=="stop"){var p,r=s.length;do{r--;p=s[r];z+=("</"+p.nodeName.toLowerCase()+">")}while(p!=v.node);s.splice(r,1);while(r<s.length){z+=t(s[r]);r++}}}}return z+m(x.substr(q))}
// Compile every registered language: build begin/end/illegal/lexem regexes and keyword lookup tables on each mode.
function j(){function q(x,y,v){if(x.compiled){return}var u;var s=[];if(x.k){x.lR=f(y,x.l||hljs.IR,true);for(var w in x.k){if(!x.k.hasOwnProperty(w)){continue}if(x.k[w] instanceof Object){u=x.k[w]}else{u=x.k;w="keyword"}for(var r in u){if(!u.hasOwnProperty(r)){continue}x.k[r]=[w,u[r]];s.push(r)}}}if(!v){if(x.bWK){x.b="\\b("+s.join("|")+")\\s"}x.bR=f(y,x.b?x.b:"\\B|\\b");if(!x.e&&!x.eW){x.e="\\B|\\b"}if(x.e){x.eR=f(y,x.e)}}if(x.i){x.iR=f(y,x.i)}if(x.r===undefined){x.r=1}if(!x.c){x.c=[]}x.compiled=true;for(var t=0;t<x.c.length;t++){if(x.c[t]=="self"){x.c[t]=x}q(x.c[t],y,false)}if(x.starts){q(x.starts,y,false)}}for(var p in e){if(!e.hasOwnProperty(p)){continue}q(e[p].dM,e[p],true)}}
// Highlight text C as language B. Compiles the languages on first use. If an illegal pattern is hit (or modes stay open), returns the escaped plain text with zero relevance.
function d(B,C){if(!j.called){j();j.called=true}function q(r,M){for(var L=0;L<M.c.length;L++){if((M.c[L].bR.exec(r)||[null])[0]==r){return M.c[L]}}}function v(L,r){if(D[L].e&&D[L].eR.test(r)){return 1}if(D[L].eW){var M=v(L-1,r);return M?M+1:0}return 0}function w(r,L){return L.i&&L.iR.test(r)}function K(N,O){var M=[];for(var L=0;L<N.c.length;L++){M.push(N.c[L].b)}var r=D.length-1;do{if(D[r].e){M.push(D[r].e)}r--}while(D[r+1].eW);if(N.i){M.push(N.i)}return f(O,M.join("|"),true)}function p(M,L){var N=D[D.length-1];if(!N.t){N.t=K(N,E)}N.t.lastIndex=L;var r=N.t.exec(M);return r?[M.substr(L,r.index-L),r[0],false]:[M.substr(L),"",true]}function z(N,r){var L=E.cI?r[0].toLowerCase():r[0];var M=N.k[L];if(M&&M instanceof Array){return M}return false}function F(L,P){L=m(L);if(!P.k){return L}var r="";var O=0;P.lR.lastIndex=0;var M=P.lR.exec(L);while(M){r+=L.substr(O,M.index-O);var N=z(P,M);if(N){x+=N[1];r+='<span class="'+N[0]+'">'+M[0]+"</span>"}else{r+=M[0]}O=P.lR.lastIndex;M=P.lR.exec(L)}return r+L.substr(O,L.length-O)}function J(L,M){if(M.sL&&e[M.sL]){var r=d(M.sL,L);x+=r.keyword_count;return r.value}else{return F(L,M)}}function I(M,r){var L=M.cN?'<span class="'+M.cN+'">':"";if(M.rB){y+=L;M.buffer=""}else{if(M.eB){y+=m(r)+L;M.buffer=""}else{y+=L;M.buffer=r}}D.push(M);A+=M.r}function G(N,M,Q){var R=D[D.length-1];if(Q){y+=J(R.buffer+N,R);return false}var P=q(M,R);if(P){y+=J(R.buffer+N,R);I(P,M);return P.rB}var L=v(D.length-1,M);if(L){var O=R.cN?"</span>":"";if(R.rE){y+=J(R.buffer+N,R)+O}else{if(R.eE){y+=J(R.buffer+N,R)+O+m(M)}else{y+=J(R.buffer+N+M,R)+O}}while(L>1){O=D[D.length-2].cN?"</span>":"";y+=O;L--;D.length--}var r=D[D.length-1];D.length--;D[D.length-1].buffer="";if(r.starts){I(r.starts,"")}return R.rE}if(w(M,R)){throw"Illegal"}}var E=e[B];var D=[E.dM];var A=0;var x=0;var y="";try{var s,u=0;E.dM.buffer="";do{s=p(C,u);var t=G(s[0],s[1],s[2]);u+=s[0].length;if(!t){u+=s[1].length}}while(!s[2]);if(D.length>1){throw"Illegal"}return{r:A,keyword_count:x,value:y}}catch(H){if(H=="Illegal"){return{r:0,keyword_count:0,value:m(C)}}else{throw H}}}
// Language auto-detection: highlight t with every language and keep the best and second-best by keyword_count + relevance.
function g(t){var p={keyword_count:0,r:0,value:m(t)};var r=p;for(var q in e){if(!e.hasOwnProperty(q)){continue}var s=d(q,t);s.language=q;if(s.keyword_count+s.r>r.keyword_count+r.r){r=s}if(s.keyword_count+s.r>p.keyword_count+p.r){r=p;p=s}}if(r.language){p.second_best=r}return p}
// Post-process markup r: replace leading tabs with q (when given) and newlines with <br> (when p is truthy).
function i(r,q,p){if(q){r=r.replace(/^((<[^>]+>|\t)+)/gm,function(t,w,v,u){return w.replace(/\t/g,q)})}if(p){r=r.replace(/\n/g,"<br>")}return r}
// Highlight DOM node t in place (no-op when no known language class is found), keeping any markup already inside it, and record the result on the node.
function n(t,w,r){var x=h(t,r);var v=a(t);var y,s;if(v){y=d(v,x)}else{return}var q=c(t);if(q.length){s=document.createElement("pre");s.innerHTML=y.value;y.value=k(q,c(s),x)}y.value=i(y.value,w,r);var u=t.className;if(!u.match("(\\s|^)(language-)?"+v+"(\\s|$)")){u=u?(u+" "+v):v}if(/MSIE [678]/.test(navigator.userAgent)&&t.tagName=="CODE"&&t.parentNode.tagName=="PRE"){s=t.parentNode;var p=document.createElement("div");p.innerHTML="<pre><code>"+y.value+"</code></pre>";t=p.firstChild.firstChild;p.firstChild.cN=s.cN;s.parentNode.replaceChild(p.firstChild,s)}else{t.innerHTML=y.value}t.className=u;t.result={language:v,kw:y.keyword_count,re:y.r};if(y.second_best){t.second_best={language:y.second_best.language,kw:y.second_best.keyword_count,re:y.second_best.r}}}
// Highlight every <pre> in the document that starts with a <code> child; runs at most once.
function o(){if(o.called){return}o.called=true;var r=document.getElementsByTagName("pre");for(var p=0;p<r.length;p++){var q=b(r[p]);if(q){n(q,hljs.tabReplace)}}}
// Register o() to run when the document has loaded, using whichever event API the browser offers.
function l(){if(window.addEventListener){window.addEventListener("DOMContentLoaded",o,false);window.addEventListener("load",o,false)}else{if(window.attachEvent){window.attachEvent("onload",o)}else{window.onload=o}}}
// Language registry and public API.
var e={};this.LANGUAGES=e;this.highlight=d;this.highlightAuto=g;this.fixMarkup=i;this.highlightBlock=n;this.initHighlighting=o;this.initHighlightingOnLoad=l;
// Shared regex sources and ready-made modes that language definitions can reuse.
this.IR="[a-zA-Z][a-zA-Z0-9_]*";this.UIR="[a-zA-Z_][a-zA-Z0-9_]*";this.NR="\\b\\d+(\\.\\d+)?";this.CNR="\\b(0[xX][a-fA-F0-9]+|(\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)";this.BNR="\\b(0b[01]+)";this.RSR="!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|\\.|-|-=|/|/=|:|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~";this.ER="(?![\\s\\S])";this.BE={b:"\\\\.",r:0};this.ASM={cN:"string",b:"'",e:"'",i:"\\n",c:[this.BE],r:0};this.QSM={cN:"string",b:'"',e:'"',i:"\\n",c:[this.BE],r:0};this.CLCM={cN:"comment",b:"//",e:"$"};this.CBLCLM={cN:"comment",b:"/\\*",e:"\\*/"};this.HCM={cN:"comment",b:"#",e:"$"};this.NM={cN:"number",b:this.NR,r:0};this.CNM={cN:"number",b:this.CNR,r:0};this.BNM={cN:"number",b:this.BNR,r:0};
// Shallow-merge helper: copy of r with the properties of s (if given) layered on top.
this.inherit=function(r,s){var p={};for(var q in r){p[q]=r[q]}if(s){for(var q in s){p[q]=s[q]}}return p}}();
// C++ language definition: keyword and built-in tables plus comment, string, number, preprocessor and STL-container modes.
hljs.LANGUAGES.cpp=function(){var a={keyword:{"false":1,"int":1,"float":1,"while":1,"private":1,"char":1,"catch":1,"export":1,virtual:1,operator:2,sizeof:2,dynamic_cast:2,typedef:2,const_cast:2,"const":1,struct:1,"for":1,static_cast:2,union:1,namespace:1,unsigned:1,"long":1,"throw":1,"volatile":2,"static":1,"protected":1,bool:1,template:1,mutable:1,"if":1,"public":1,friend:2,"do":1,"return":1,"goto":1,auto:1,"void":2,"enum":1,"else":1,"break":1,"new":1,extern:1,using:1,"true":1,"class":1,asm:1,"case":1,typeid:1,"short":1,reinterpret_cast:2,"default":1,"double":1,register:1,explicit:1,signed:1,typename:1,"try":1,"this":1,"switch":1,"continue":1,wchar_t:1,inline:1,"delete":1,alignof:1,char16_t:1,char32_t:1,constexpr:1,decltype:1,noexcept:1,nullptr:1,static_assert:1,thread_local:1,restrict:1,_Bool:1,complex:1},built_in:{std:1,string:1,cin:1,cout:1,cerr:1,clog:1,stringstream:1,istringstream:1,ostringstream:1,auto_ptr:1,deque:1,list:1,queue:1,stack:1,vector:1,map:1,set:1,bitset:1,multiset:1,multimap:1,unordered_set:1,unordered_map:1,unordered_multiset:1,unordered_multimap:1,array:1,shared_ptr:1}};return{dM:{k:a,i:"</",c:[hljs.CLCM,hljs.CBLCLM,hljs.QSM,{cN:"string",b:"'\\\\?.",e:"'",i:"."},{cN:"number",b:"\\b(\\d+(\\.\\d*)?|\\.\\d+)(u|U|l|L|ul|UL|f|F)"},hljs.CNM,{cN:"preprocessor",b:"#",e:"$"},{cN:"stl_container",b:"\\b(deque|list|queue|stack|vector|map|set|bitset|multiset|multimap|unordered_map|unordered_set|unordered_multiset|unordered_multimap|array)\\s*<",e:">",k:a,r:10,c:["self"]}]}}}();
// R language definition: an ordered list of token modes (earlier entries win when several match at the same position).
// NOTE(review): hljs.IMMEDIATE_RE is not defined in this build, so each `e` below is undefined and the compiler substitutes its default end pattern.
hljs.LANGUAGES.r={dM:{c:[hljs.HCM,
{cN:"number",b:"\\b0[xX][0-9a-fA-F]+[Li]?\\b",e:hljs.IMMEDIATE_RE,r:0},
{cN:"number",b:"\\b\\d+(?:[eE][+\\-]?\\d*)?L\\b",e:hljs.IMMEDIATE_RE,r:0},
{cN:"number",b:"\\b\\d+\\.(?!\\d)(?:i\\b)?",e:hljs.IMMEDIATE_RE,r:1},
{cN:"number",b:"\\b\\d+(?:\\.\\d*)?(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:0},
{cN:"number",b:"\\.\\d+(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:1},
{cN:"keyword",b:"(?:tryCatch|library|setGeneric|setGroupGeneric)\\b",e:hljs.IMMEDIATE_RE,r:10},
{cN:"keyword",b:"\\.\\.\\.",e:hljs.IMMEDIATE_RE,r:10},
{cN:"keyword",b:"\\.\\.\\d+(?![\\w.])",e:hljs.IMMEDIATE_RE,r:10},
{cN:"keyword",b:"\\b(?:function)",e:hljs.IMMEDIATE_RE,r:2},
{cN:"keyword",b:"(?:if|in|break|next|repeat|else|for|return|switch|while|try|stop|warning|require|attach|detach|source|setMethod|setClass)\\b",e:hljs.IMMEDIATE_RE,r:1},
{cN:"literal",b:"(?:NA|NA_integer_|NA_real_|NA_character_|NA_complex_)\\b",e:hljs.IMMEDIATE_RE,r:10},
{cN:"literal",b:"(?:NULL|TRUE|FALSE|T|F|Inf|NaN)\\b",e:hljs.IMMEDIATE_RE,r:1},
{cN:"identifier",b:"[a-zA-Z.][a-zA-Z0-9._]*\\b",e:hljs.IMMEDIATE_RE,r:0},
{cN:"operator",b:"<\\-(?!\\s*\\d)",e:hljs.IMMEDIATE_RE,r:2},
{cN:"operator",b:"\\->|<\\-",e:hljs.IMMEDIATE_RE,r:1},
{cN:"operator",b:"%%|~",e:hljs.IMMEDIATE_RE},
{cN:"operator",b:">=|<=|==|!=|\\|\\||&&|=|\\+|\\-|\\*|/|\\^|>|<|!|&|\\||\\$|:",e:hljs.IMMEDIATE_RE,r:0},
{cN:"operator",b:"%",e:"%",i:"\\n",r:1},
{cN:"identifier",b:"`",e:"`",r:0},
{cN:"string",b:'"',e:'"',c:[hljs.BE],r:0},
{cN:"string",b:"'",e:"'",c:[hljs.BE],r:0},
{cN:"paren",b:"[[({\\])}]",e:hljs.IMMEDIATE_RE,r:0}]}};
// Bootstrap: register the page-wide highlighter so it runs once the document
// has loaded (DOMContentLoaded/load, or the legacy onload fallbacks).
hljs.initHighlightingOnLoad();
</script>
</head>
<body>
<h1>Getting and Cleaning Name Data from the U.S. Census Bureau and SSA for More (A)merican Random Sampling</h1>
<h4>by Austin Routt</h4>
<p>Have you ever wanted to be someone else? God knows I have, my friend, but even taking the first step, by creating a new name at random, always seems to be such a hassle for me; whether it's an issue of time or the fact that the first and last name combinations I come up with just aren't believable, I cannot seem to create new personas at a rate that meets my needs. Towards the end of fast and believable random American name generation, I have taken and processed data from the U.S. Census Bureau and Social Security Administration.</p>
<p>What now follows is a brief description of the <strong>malefirstnames.csv</strong>, <strong>femalefirstnames.csv</strong>, and <strong>surnames.csv</strong> data files: what they are, how they were derived, as well as how one might use them. Again, the primary purpose for this data set is to be used to implement a random name generator; unlike most random name generators, this data set allows one to take into account a weighted distribution of American first and last names when performing a random sampling by gender. Since the first name data contains the probabilities of names based on the occurrences of U.S. baby names from 1880 to 2013, and the surname data has frequencies based on the 2000 census, one need only reference the probabilities of these names when taking a sample. <strong>e.g. sample(male$name, size = 1, prob = male$probability)</strong> Although merely easing the implementation of fast and believable random millennium American name generation is its primary aim, feel free to reuse this data in any way you see fit.</p>
<h2>First Name Data</h2>
<p>Both <strong>malefirstnames.csv</strong> and <strong>femalefirstnames.csv</strong> contain lists of male and female first names, respectively. Also, these hold information regarding the frequency of each name's occurrence, within the national U.S. population for people born from 1880 to 2013. The frequency data is based off of the <a href="http://www.ssa.gov/oact/babynames/names.zip">National Data</a> on baby names, provided by the Social Security Administration.</p>
<p>Here is a list of the top 10 male and female names:</p>
<pre><code>## Male Freq Prob Female Freq Prob
## 1 James 5091189 0.03147 Mary 4046787 0.025708
## 2 John 5073958 0.03136 Elizabeth 1591439 0.010110
## 3 Robert 4789776 0.02961 Patricia 1570123 0.009975
## 4 Michael 4292994 0.02653 Jennifer 1461136 0.009282
## 5 William 4038447 0.02496 Linda 1449996 0.009211
## 6 David 3562957 0.02202 Barbara 1432413 0.009100
## 7 Joseph 2552666 0.01578 Margaret 1234912 0.007845
## 8 Richard 2551558 0.01577 Susan 1116573 0.007093
## 9 Charles 2345723 0.01450 Dorothy 1105005 0.007020
## 10 Thomas 2275889 0.01407 Sarah 1054061 0.006696
</code></pre>
<h2>Last Name Data</h2>
<p>The file <strong>surnames.csv</strong> contains a list of last names, as well as their probability of occurrence, within the United States, circa 2000; these were taken from the <a href="https://www.census.gov/genealogy/www/data/2000surnames/index.html">Census 2000 Data</a> on <a href="https://www.census.gov/genealogy/www/data/2000surnames/names.zip">Surnames occurring 100 or more times</a></p>
<p>Here is a list of the top 10 last names:</p>
<pre><code>## Last Names Freq Prob
## 1 Smith 2376206 0.009814
## 2 Johnson 1857160 0.007670
## 3 Williams 1534042 0.006336
## 4 Brown 1380145 0.005700
## 5 Jones 1362755 0.005628
## 6 Miller 1127803 0.004658
## 7 Davis 1072335 0.004429
## 8 Garcia 858289 0.003545
## 9 Rodriguez 804240 0.003322
## 10 Wilson 783051 0.003234
</code></pre>
<h2>Data Processing</h2>
<p>First, the original data sets were obtained and unzipped into a working directory via the following code blocks:</p>
<pre><code class="r">##Step 1a Fetch Social Security National Baby Name data, if not present, and unzip in the user's working directory
myzip1 = "names.zip"
#if data file has not yet been downloaded, fetch it
if (!file.exists(myzip1)) {
download.file("http://www.ssa.gov/oact/babynames/names.zip", destfile=myzip1,method="curl")
unzip(myzip1)
}
##Step 1b Fetch Census 2000 data for surnames occurring 100 or more times, if not present, and unzip in the user's working directory
myzip2 = "surnames.zip"
#if data file has not yet been downloaded, fetch it
if (!file.exists(myzip2)) {
download.file("https://www.census.gov/genealogy/www/data/2000surnames/names.zip", destfile=myzip2,method="curl")
unzip(myzip2)
}
</code></pre>
<p>Next, the R environment was checked to see if the required data sets were already present in memory. Since they were not, they were promptly read into memory; note that the first name data, obtained from the Social Security Administration, is split into multiple text files based on year.</p>
<pre><code class="r">##Step 2a, if baby name data isn't already available in memory read in all files
if(!(exists("datalist"))){
filenames <- list.files(".", pattern="*.txt", full.names=TRUE)
datalist<- lapply(filenames, read.table, sep = ",", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
}
##Step 2b, if surname data isn't already available in memory read in the file
if(!(exists("surname"))){
surname <- read.csv("app_c.csv", stringsAsFactors = F)
}
</code></pre>
<p>Since the Baby name data is split into a list of data frames based on year, <strong>datalist</strong>, these are merged into one and then aggregated to give the total occurrence of each unique name. Following the merge, the data was divided by gender into two data frames: <strong>male</strong> and <strong>female</strong>. Both were then reordered based on each name's total frequency of occurrence.</p>
<pre><code class="r">##Step 3a Merge all name data frames into 1
merge.all <- function(x, y){
merge(x, y, all=TRUE) }
out <- Reduce(merge.all, datalist)
##Step4a aggregate all names by adding the sum occurrence of each
h <- aggregate(formula = V3 ~ V1 + V2, FUN = sum, data = out)
##Step5a, separate by gender and reorder from most frequent to least
female <- h[h$V2 == "F", ]
male <- h[h$V2 == "M", ]
male <- male[order(male$V3, decreasing = T), c(1,3) ]
female <- female[order(female$V3, decreasing = T), c(1,3) ]
</code></pre>
<p>For my current purposes, all 11 original variables of the surname data set, although interesting, are unnecessary. In the future, I may find a use for the racial probability data, but lacking the same information for first names as well makes it extraneous at this point. Thus, using R's subsetting capabilities, only the names and their corresponding frequencies were retained.</p>
<pre><code class="r">##Step5b, retain only the surname and frequency in dataframe
surname <- surname[, c(1,3)]
</code></pre>
<p>Next, using the count/frequency data for each data frame, probabilities were calculated for all names; the probability of a name is the name's count divided by the total count of all names within a data frame. For instance, of the over 242,121,073 people who took the 2000 Census, 2,376,206 shared the last name “<strong>Smith</strong>.” Thus, given a random sample of people who took the 2000 census, the probability of one of them having the last name Smith, the most popular surname in America, would be about 0.98%.</p>
<pre><code class="r">##Step6a find each name's probability, for both males and females, and store this information in a third column
for(i in 1:length(male[,1])){
male[i,3] <- male[i,2]/sum(male$V3)
}
for(i in 1:length(female[,1])){
female[i,3] <- female[i,2]/sum(female$V3)
}
##Step6b find each last name's probability, store this information in a third column
for(i in 1:length(surname[,1])){
surname[i,3] <- surname[i,2]/sum(surname[,2])
}
</code></pre>
<p>For clarity, column names are changed into the more descriptive <strong>name</strong> <strong>frequency</strong> <strong>probability</strong> format. Also, surnames are altered to only begin with a capital letter.</p>
<pre><code class="r">##Step7a rename each variable to an appropriate description
names(male) <- c("name", "frequency", "probability")
names(female) <- c("name", "frequency", "probability")
##Step7b change surname variable names and the case of names
names(surname) <- c("name", "frequency", "probability")
##Create a function that reads in a string and converts it to have words only begin with a capital letter
r_ucfirst <- function (str) {
paste(toupper(substring(str, 1, 1)), tolower(substring(str, 2)), sep = "")
}
surname$name <- r_ucfirst(surname$name)
</code></pre>
<p>Finally, all data frames are exported to the working directory in the form of comma separated files.</p>
<pre><code class="r">##Step8a write first name data to two separate .csv files
write.csv(male, file = "malefirstnames.csv",row.names=FALSE)
write.csv(female, file = "femalefirstnames.csv",row.names=FALSE)
##Step8b write surname data to a new .csv file
write.csv(surname, file = "surnames.csv",row.names=FALSE)
</code></pre>
<p>Using an R programming environment, with the appropriate files in your working directory, it is recommended that you copy the following commands to read in the name data:</p>
<ul>
<li> <strong>male <- read.csv("malefirstnames.csv", stringsAsFactors=F)</strong></li>
<li> <strong>female <- read.csv("femalefirstnames.csv", stringsAsFactors=F)</strong></li>
<li> <strong>last <- read.csv("surnames.csv", stringsAsFactors=F)</strong></li>
</ul>
<p>You can then take weighted random samples via:</p>
<ul>
<li><strong>sample(male$name, size = 1, prob = male$probability)</strong></li>
<li><strong>sample(female$name, size = 1, prob = female$probability)</strong></li>
<li><strong>sample(last$name, size = 1, prob = last$probability)</strong></li>
</ul>
<h2>rName: An Example</h2>
<p>Here I have illustrated basic use of the data via a random name generating function, called <strong>rName()</strong>, which takes in a gender word, either <strong>“Male”</strong> or <strong>“Female”</strong>, and outputs a name.</p>
<pre><code class="r">rName <- function(gender = "Male"){
if(!(exists("male"))){
male <- read.csv("malefirstnames.csv", stringsAsFactors=F)
}
if(!(exists("female"))){
female <- read.csv("femalefirstnames.csv", stringsAsFactors=F)
}
if(!(exists("last"))){
last <- read.csv("surnames.csv", stringsAsFactors=F)
}
if(gender == "Male"){
name <- paste(sample(male$name, size = 1, prob = male$probability), sample(last$name, size = 1, prob = last$probability))
} else if(gender == "Female"){
name <- paste(sample(female$name, size = 1, prob = female$probability), sample(last$name, size = 1, prob = last$probability))
}
name
}
</code></pre>
<p>Now I will demonstrate the wonder that is weighted random sampling by using rName to generate 100 American names, behold:</p>
<pre><code class="r">set.seed(5559898)
count = 0
onehundredcoins <- sample(0:1, size = 100, replace = TRUE)
onehundrednames <- rName("Male")
for(i in onehundredcoins){
count = count + 1
if(i == 0){
onehundrednames[count] <- rName("Male")
}
if(i==1){
onehundrednames[count] <- rName("Female")
}
}
onehundrednames
</code></pre>
<pre><code>## [1] "Tanner Guidry" "Edward Hays" "Maggie Kopke"
## [4] "Ella Marth" "Walter Boldt" "Joshua Bridges"
## [7] "Becky Nelson" "Iona Sota" "Henry Slavin"
## [10] "Bryan Powell" "Michelle Munn" "Ann Chavez"
## [13] "Mary Delotto" "Efren Mann" "Nicholas Harter"
## [16] "Terrence Jeudy" "Jody Kramer" "Heather Cano"
## [19] "Christopher Curry" "Xavier White" "Dale Aranda"
## [22] "James Mcpherson" "Sandra Johnson" "Shae Wiant"
## [25] "Linda Works" "Angel Weaver" "Toby Nelson"
## [28] "Tatum Mendoca" "Ashley Elwell" "Ethan Thomas"
## [31] "Richard Dasenbrock" "Gerald Flauding" "Joan Pollock"
## [34] "Christine Womack" "Geoffrey Laun" "Dale Botello"
## [37] "Sarah Garcia" "Angel Reese" "Ryan Mcintire"
## [40] "Pauline Bauer" "Aviyah Perry" "Richard Fajardo"
## [43] "Donna Michel" "Sherrita Bishop" "Norman Mccullar"
## [46] "Erik Reeves" "Jaxxon Gray" "Beth Ottaviani"
## [49] "Yolanda Jackson" "Beverly Jenkins" "Everett Bowman"
## [52] "Jean Valencia" "Brian Massa" "Raymond Graham"
## [55] "Emily Kilpatrick" "Otto Mazzarella" "Mark Spencer"
## [58] "George Sturdavant" "Shamus Barksdale" "Mary Moore"
## [61] "Doris Lassiter" "Patricia Letourneau" "Robert Dopson"
## [64] "Randall Arellano" "Daniel Bast" "John Forbes"
## [67] "Marion Traylor" "Ted Reinke" "Joan Haskell"
## [70] "Amanda Saucier" "Jack Jones" "Maria Hommen"
## [73] "Eugene Tesch" "Katherine French" "Curtis Hoard"
## [76] "Isaac Wood" "Jack Hanson" "Kenneth Richard"
## [79] "Henry Aguirre" "Cody Buitron" "Alison Ngo"
## [82] "Kathleen Fluty" "George Youngblood" "Paula Williams"
## [85] "Douglas Crosby" "Evelyn Jordan" "Daniel Nix"
## [88] "Peggy Duncan" "Gerardo Murphey" "Richard Weasel"
## [91] "Anastasia Hudley" "Ernest Conn" "Madelyn Dimon"
## [94] "Jeremy Smith" "Rudolph Weber" "Joseph Longoria"
## [97] "Nicholas Nelson" "Lois Harris" "Ruth Doria"
## [100] "Evelyn Mcgee"
</code></pre>
</body>
</html>