Contributors: 250
Author |
Tokens |
Token Proportion |
Commits |
Commit Proportion |
Andrew Morton |
1018 |
7.70% |
39 |
4.29% |
Linus Torvalds (pre-git) |
1018 |
7.70% |
105 |
11.55% |
Oleg Nesterov |
919 |
6.95% |
50 |
5.50% |
Christian Brauner |
875 |
6.62% |
19 |
2.09% |
Eric W. Biedermann |
649 |
4.91% |
45 |
4.95% |
Janak Desai |
508 |
3.84% |
4 |
0.44% |
Sebastian Andrzej Siewior |
426 |
3.22% |
9 |
0.99% |
Ingo Molnar |
417 |
3.16% |
43 |
4.73% |
Al Viro |
416 |
3.15% |
14 |
1.54% |
Linus Torvalds |
395 |
2.99% |
31 |
3.41% |
Andrew Lutomirski |
331 |
2.50% |
8 |
0.88% |
Suren Baghdasaryan |
282 |
2.13% |
8 |
0.88% |
Thomas Gleixner |
226 |
1.71% |
16 |
1.76% |
David Hildenbrand |
220 |
1.66% |
3 |
0.33% |
Andrea Arcangeli |
180 |
1.36% |
9 |
0.99% |
Jens Axboe |
172 |
1.30% |
10 |
1.10% |
Nicholas Piggin |
151 |
1.14% |
3 |
0.33% |
Konstantin Khlebnikov |
142 |
1.07% |
5 |
0.55% |
Adrian Reber |
128 |
0.97% |
1 |
0.11% |
Heinrich Schuchardt |
125 |
0.95% |
3 |
0.33% |
Peter Zijlstra |
122 |
0.92% |
20 |
2.20% |
Kees Cook |
117 |
0.89% |
6 |
0.66% |
Michael Christie |
115 |
0.87% |
5 |
0.55% |
Michal Hocko |
113 |
0.85% |
10 |
1.10% |
Vladimir Davydov |
108 |
0.82% |
8 |
0.88% |
Roland McGrath |
91 |
0.69% |
11 |
1.21% |
Eugene Syromiatnikov |
91 |
0.69% |
4 |
0.44% |
Mathieu Desnoyers |
90 |
0.68% |
6 |
0.66% |
Pavel Emelyanov |
90 |
0.68% |
6 |
0.66% |
Hugh Dickins |
84 |
0.64% |
6 |
0.66% |
Liam R. Howlett |
83 |
0.63% |
6 |
0.66% |
Michel Lespinasse |
78 |
0.59% |
4 |
0.44% |
David Howells |
76 |
0.58% |
8 |
0.88% |
Paul E. McKenney |
76 |
0.58% |
9 |
0.99% |
Hoeun Ryu |
74 |
0.56% |
1 |
0.11% |
Peng Zhang |
74 |
0.56% |
1 |
0.11% |
Rik Van Riel |
72 |
0.54% |
9 |
0.99% |
Mateusz Guzik |
71 |
0.54% |
3 |
0.33% |
Jiri Slaby |
66 |
0.50% |
3 |
0.33% |
Tejun Heo |
65 |
0.49% |
4 |
0.44% |
Alexey Gladkov |
63 |
0.48% |
7 |
0.77% |
Daniel Rebelo de Oliveira |
60 |
0.45% |
1 |
0.11% |
Sai Praneeth |
54 |
0.41% |
1 |
0.11% |
Nadia Yvette Chambers |
54 |
0.41% |
4 |
0.44% |
Roman Gushchin |
53 |
0.40% |
4 |
0.44% |
Shakeel Butt |
51 |
0.39% |
4 |
0.44% |
Miaohe Lin |
49 |
0.37% |
2 |
0.22% |
Serge E. Hallyn |
48 |
0.36% |
5 |
0.55% |
Hidehiro Kawai |
45 |
0.34% |
1 |
0.11% |
Daniel Bristot de Oliveira |
45 |
0.34% |
2 |
0.22% |
Alex Thorlton |
45 |
0.34% |
1 |
0.11% |
Eric Dumazet |
44 |
0.33% |
4 |
0.44% |
Yu Zhao |
43 |
0.33% |
1 |
0.11% |
Sukadev Bhattiprolu |
42 |
0.32% |
4 |
0.44% |
Alexey Dobriyan |
42 |
0.32% |
4 |
0.44% |
Hiroshi Shimamoto |
40 |
0.30% |
1 |
0.11% |
Motohiro Kosaki |
40 |
0.30% |
5 |
0.55% |
Christoph Hellwig |
40 |
0.30% |
4 |
0.44% |
Frédéric Weisbecker |
39 |
0.30% |
8 |
0.88% |
Christoph Lameter |
38 |
0.29% |
6 |
0.66% |
Eric Biggers |
38 |
0.29% |
2 |
0.22% |
Aleksa Sarai |
36 |
0.27% |
2 |
0.22% |
Vegard Nossum |
35 |
0.26% |
3 |
0.33% |
Eddy Wu |
32 |
0.24% |
1 |
0.11% |
FUJITA Tomonori |
32 |
0.24% |
1 |
0.11% |
Tetsuo Handa |
32 |
0.24% |
2 |
0.22% |
Matthew Wilcox |
30 |
0.23% |
4 |
0.44% |
John Levon |
29 |
0.22% |
4 |
0.44% |
Daniel Jacobowitz |
28 |
0.21% |
3 |
0.33% |
Wander Lairson Costa |
28 |
0.21% |
1 |
0.11% |
Mandeep Singh Baines |
27 |
0.20% |
2 |
0.22% |
Dmitriy Vyukov |
26 |
0.20% |
3 |
0.33% |
Kamezawa Hiroyuki |
25 |
0.19% |
3 |
0.33% |
Manfred Spraul |
25 |
0.19% |
2 |
0.22% |
Jack Miller |
24 |
0.18% |
1 |
0.11% |
Andrey Konovalov |
24 |
0.18% |
3 |
0.33% |
Pavel Tikhomirov |
23 |
0.17% |
1 |
0.11% |
Kirill V Tkhai |
23 |
0.17% |
1 |
0.11% |
Mel Gorman |
23 |
0.17% |
3 |
0.33% |
Albert D. Cahalan |
23 |
0.17% |
1 |
0.11% |
Mark Rutland |
22 |
0.17% |
2 |
0.22% |
David Windsor |
22 |
0.17% |
2 |
0.22% |
Suresh B. Siddha |
22 |
0.17% |
2 |
0.22% |
Kirill A. Shutemov |
22 |
0.17% |
5 |
0.55% |
Fengguang Wu |
22 |
0.17% |
2 |
0.22% |
Kent Overstreet |
21 |
0.16% |
3 |
0.33% |
Shailabh Nagar |
21 |
0.16% |
3 |
0.33% |
Gabriel Krisman Bertazi |
21 |
0.16% |
4 |
0.44% |
Byungchul Park |
21 |
0.16% |
1 |
0.11% |
David Rientjes |
20 |
0.15% |
3 |
0.33% |
Matthew Dempsky |
20 |
0.15% |
1 |
0.11% |
Sami Tolvanen |
20 |
0.15% |
1 |
0.11% |
Kuniyuki Iwashima |
20 |
0.15% |
1 |
0.11% |
Marco Elver |
19 |
0.14% |
2 |
0.22% |
Christopher Yeoh |
18 |
0.14% |
1 |
0.11% |
Rusty Russell |
18 |
0.14% |
3 |
0.33% |
Aaron Tomlin |
18 |
0.14% |
1 |
0.11% |
Goto Masanori |
17 |
0.13% |
1 |
0.11% |
Dominik Brodowski |
17 |
0.13% |
2 |
0.22% |
Arjan van de Ven |
17 |
0.13% |
3 |
0.33% |
Josef Whiter |
17 |
0.13% |
1 |
0.11% |
Marcos Paulo de Souza |
17 |
0.13% |
2 |
0.22% |
David Mosberger-Tang |
17 |
0.13% |
1 |
0.11% |
Qian Cai |
17 |
0.13% |
1 |
0.11% |
Heiko Carstens |
16 |
0.12% |
2 |
0.22% |
Benjamin Herrenschmidt |
15 |
0.11% |
2 |
0.22% |
Kirill Korotaev |
15 |
0.11% |
1 |
0.11% |
Matt Helsley |
15 |
0.11% |
2 |
0.22% |
Daniel Axtens |
15 |
0.11% |
1 |
0.11% |
Frank Mayhar |
15 |
0.11% |
1 |
0.11% |
Sherry Yang |
15 |
0.11% |
1 |
0.11% |
Will Deacon |
14 |
0.11% |
1 |
0.11% |
Mike Galbraith |
14 |
0.11% |
5 |
0.55% |
Andrey Vagin |
14 |
0.11% |
1 |
0.11% |
Song Liu |
14 |
0.11% |
1 |
0.11% |
Srikar Dronamraju |
13 |
0.10% |
3 |
0.33% |
David Herrmann |
13 |
0.10% |
1 |
0.11% |
Laurent Vivier |
13 |
0.10% |
2 |
0.22% |
Balbir Singh |
13 |
0.10% |
3 |
0.33% |
Davide Libenzi |
13 |
0.10% |
2 |
0.22% |
Elena Reshetova |
13 |
0.10% |
4 |
0.44% |
Gideon Israel Dsouza |
13 |
0.10% |
1 |
0.11% |
Nico Pitre |
13 |
0.10% |
2 |
0.22% |
Davidlohr Bueso A |
12 |
0.09% |
3 |
0.33% |
Josh Triplett |
12 |
0.09% |
1 |
0.11% |
Johannes Weiner |
12 |
0.09% |
2 |
0.22% |
Dave Olien |
12 |
0.09% |
1 |
0.11% |
Paul Menage |
11 |
0.08% |
3 |
0.33% |
Nadav Amit |
11 |
0.08% |
4 |
0.44% |
Waiman Long |
11 |
0.08% |
1 |
0.11% |
Huang Shijie |
11 |
0.08% |
2 |
0.22% |
Jason Gunthorpe |
11 |
0.08% |
3 |
0.33% |
Fenghua Yu |
11 |
0.08% |
4 |
0.44% |
Tony Luck |
11 |
0.08% |
1 |
0.11% |
Jann Horn |
10 |
0.08% |
2 |
0.22% |
Colin Cross |
10 |
0.08% |
1 |
0.11% |
Ravikiran G. Thirumalai |
10 |
0.08% |
1 |
0.11% |
Beau Belgrave |
10 |
0.08% |
1 |
0.11% |
Steven Rostedt |
10 |
0.08% |
3 |
0.33% |
Song Muchun |
10 |
0.08% |
2 |
0.22% |
Eric Paris |
10 |
0.08% |
1 |
0.11% |
Arun K S |
9 |
0.07% |
2 |
0.22% |
Masami Hiramatsu |
9 |
0.07% |
1 |
0.11% |
Michal Simek |
9 |
0.07% |
1 |
0.11% |
Michael Neuling |
9 |
0.07% |
1 |
0.11% |
Akinobu Mita |
8 |
0.06% |
2 |
0.22% |
Alexander Popov |
8 |
0.06% |
1 |
0.11% |
haifeng.xu |
8 |
0.06% |
1 |
0.11% |
Andi Kleen |
8 |
0.06% |
1 |
0.11% |
Alexander Potapenko |
8 |
0.06% |
2 |
0.22% |
Yang Shi |
8 |
0.06% |
1 |
0.11% |
Jay Lan |
8 |
0.06% |
2 |
0.22% |
Jeff Dike |
7 |
0.05% |
1 |
0.11% |
Thomas Hellstrom |
7 |
0.05% |
1 |
0.11% |
Rolf Eike Beer |
7 |
0.05% |
1 |
0.11% |
Srivatsa Vaddagiri |
7 |
0.05% |
2 |
0.22% |
James Hogan |
7 |
0.05% |
1 |
0.11% |
Tobias Klauser |
7 |
0.05% |
1 |
0.11% |
Matthew Bobrowski |
7 |
0.05% |
1 |
0.11% |
Vasiliy Kulikov |
7 |
0.05% |
1 |
0.11% |
Sargun Dhillon |
6 |
0.05% |
1 |
0.11% |
Greg Kroah-Hartman |
6 |
0.05% |
1 |
0.11% |
Xunlei Pang |
6 |
0.05% |
1 |
0.11% |
Andrii Nakryiko |
6 |
0.05% |
1 |
0.11% |
Prasanna Meda |
6 |
0.05% |
1 |
0.11% |
Paul Mundt |
6 |
0.05% |
1 |
0.11% |
Louis Rilling |
6 |
0.05% |
1 |
0.11% |
Alan Cox |
6 |
0.05% |
3 |
0.33% |
Oren Laadan |
6 |
0.05% |
1 |
0.11% |
Avi Kivity |
6 |
0.05% |
1 |
0.11% |
John L. Byrne |
6 |
0.05% |
1 |
0.11% |
Richard Henderson |
6 |
0.05% |
1 |
0.11% |
Eric Sandeen |
6 |
0.05% |
1 |
0.11% |
Ahmed S. Darwish |
6 |
0.05% |
1 |
0.11% |
Hari Bathini |
5 |
0.04% |
1 |
0.11% |
Liu Zixian |
5 |
0.04% |
1 |
0.11% |
Josh Poimboeuf |
5 |
0.04% |
1 |
0.11% |
Peter Xu |
5 |
0.04% |
2 |
0.22% |
Qais Yousef |
5 |
0.04% |
1 |
0.11% |
KP Singh |
5 |
0.04% |
1 |
0.11% |
Miloslav Trmač |
5 |
0.04% |
1 |
0.11% |
Yishai Hadas |
5 |
0.04% |
1 |
0.11% |
Hideaki Yoshifuji / 吉藤英明 |
5 |
0.04% |
1 |
0.11% |
Li Zefan |
5 |
0.04% |
1 |
0.11% |
Yong Zhang |
5 |
0.04% |
1 |
0.11% |
Stefan Metzmacher |
5 |
0.04% |
1 |
0.11% |
Michael Pratt |
5 |
0.04% |
1 |
0.11% |
Stanislaw Gruszka |
5 |
0.04% |
1 |
0.11% |
Emese Revfy |
4 |
0.03% |
2 |
0.22% |
George Anzinger |
4 |
0.03% |
2 |
0.22% |
Ran Xiaokai |
4 |
0.03% |
1 |
0.11% |
Stephen Wilson |
4 |
0.03% |
2 |
0.22% |
Badari Pulavarty |
4 |
0.03% |
1 |
0.11% |
Aaron Lu |
4 |
0.03% |
1 |
0.11% |
Luiz Fernando N. Capitulino |
4 |
0.03% |
1 |
0.11% |
Adrian Bunk |
4 |
0.03% |
1 |
0.11% |
Richard Guy Briggs |
4 |
0.03% |
1 |
0.11% |
Jason A. Donenfeld |
4 |
0.03% |
2 |
0.22% |
Joerg Roedel |
3 |
0.02% |
1 |
0.11% |
Benjamin LaHaise |
3 |
0.02% |
1 |
0.11% |
Arnd Bergmann |
3 |
0.02% |
1 |
0.11% |
Dipankar Sarma |
3 |
0.02% |
1 |
0.11% |
Stas Sergeev |
3 |
0.02% |
1 |
0.11% |
Andrea Righi |
3 |
0.02% |
1 |
0.11% |
Zilvinas Valinskas |
3 |
0.02% |
1 |
0.11% |
Stephen Rothwell |
3 |
0.02% |
1 |
0.11% |
Paul Moore |
3 |
0.02% |
1 |
0.11% |
Alexei Starovoitov |
3 |
0.02% |
1 |
0.11% |
Seth Jennings |
3 |
0.02% |
1 |
0.11% |
Lorenzo Stoakes |
3 |
0.02% |
1 |
0.11% |
Florent Revest |
3 |
0.02% |
1 |
0.11% |
Weilong Chen |
3 |
0.02% |
1 |
0.11% |
Nigel Cunningham |
3 |
0.02% |
1 |
0.11% |
Wang Jinchao |
2 |
0.02% |
1 |
0.11% |
Ben Blum |
2 |
0.02% |
1 |
0.11% |
Lee Schermerhorn |
2 |
0.02% |
2 |
0.22% |
Dario Faggioli |
2 |
0.02% |
1 |
0.11% |
Andries E. Brouwer |
2 |
0.02% |
1 |
0.11% |
Luke Yang |
2 |
0.02% |
1 |
0.11% |
Jim Houston |
2 |
0.02% |
1 |
0.11% |
Martin Schwidefsky |
2 |
0.02% |
1 |
0.11% |
Paul Mackerras |
2 |
0.02% |
1 |
0.11% |
Russell King |
2 |
0.02% |
1 |
0.11% |
Ollie Wild |
2 |
0.02% |
1 |
0.11% |
Gregory Haskins |
2 |
0.02% |
1 |
0.11% |
Aditya Kali |
2 |
0.02% |
1 |
0.11% |
Luca Barbieri |
2 |
0.02% |
1 |
0.11% |
Stephen D. Smalley |
1 |
0.01% |
1 |
0.11% |
Randy Dunlap |
1 |
0.01% |
1 |
0.11% |
Ralf Baechle |
1 |
0.01% |
1 |
0.11% |
David S. Miller |
1 |
0.01% |
1 |
0.11% |
Dave McCracken |
1 |
0.01% |
1 |
0.11% |
Guenter Roeck |
1 |
0.01% |
1 |
0.11% |
Jan Blunck |
1 |
0.01% |
1 |
0.11% |
Siddhesh Poyarekar |
1 |
0.01% |
1 |
0.11% |
Ian Campbell |
1 |
0.01% |
1 |
0.11% |
Valentin Schneider |
1 |
0.01% |
1 |
0.11% |
Jeremy Fitzhardinge |
1 |
0.01% |
1 |
0.11% |
Li Xinhai |
1 |
0.01% |
1 |
0.11% |
Arnaldo Carvalho de Melo |
1 |
0.01% |
1 |
0.11% |
Will Drewry |
1 |
0.01% |
1 |
0.11% |
Kinsey Ho |
1 |
0.01% |
1 |
0.11% |
Lin Feng |
1 |
0.01% |
1 |
0.11% |
Veaceslav Falico |
1 |
0.01% |
1 |
0.11% |
Xiaofeng Cao |
1 |
0.01% |
1 |
0.11% |
Michael Ellerman |
1 |
0.01% |
1 |
0.11% |
DaeSeok Youn |
1 |
0.01% |
1 |
0.11% |
Madhuparna Bhowmik |
1 |
0.01% |
1 |
0.11% |
Jason Low |
1 |
0.01% |
1 |
0.11% |
Kefeng Wang |
1 |
0.01% |
1 |
0.11% |
Total |
13217 |
|
909 |
|
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/fork.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* 'fork.c' contains the help-routines for the 'fork' system call
* (see also entry.S and others).
* Fork is rather simple, once you get the hang of it, but the memory
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
*/
#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <trace/events/sched.h>
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>
/*
* Minimum number of threads to boot the kernel
*/
#define MIN_THREADS 20
/*
* Maximum number of threads
*/
#define MAX_THREADS FUTEX_TID_MASK
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
unsigned long total_forks; /* Handle normal Linux uptimes. */
int nr_threads; /* The idle threads do not count.. */
static int max_threads; /* tunable limit on nr_threads */
#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
static const char * const resident_page_types[] = {
NAMED_ARRAY_INDEX(MM_FILEPAGES),
NAMED_ARRAY_INDEX(MM_ANONPAGES),
NAMED_ARRAY_INDEX(MM_SWAPENTS),
NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
{
return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */
int nr_processes(void)
{
int cpu;
int total = 0;
for_each_possible_cpu(cpu)
total += per_cpu(process_counts, cpu);
return total;
}
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
{
return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}
static inline void free_task_struct(struct task_struct *tsk)
{
kmem_cache_free(task_struct_cachep, tsk);
}
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
* kmemcache based allocator.
*/
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
# ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
*/
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
struct vm_stack {
struct rcu_head rcu;
struct vm_struct *stack_vm_area;
};
static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
{
unsigned int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
continue;
return true;
}
return false;
}
static void thread_stack_free_rcu(struct rcu_head *rh)
{
struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
return;
vfree(vm_stack);
}
static void thread_stack_delayed_free(struct task_struct *tsk)
{
struct vm_stack *vm_stack = tsk->stack;
vm_stack->stack_vm_area = tsk->stack_vm_area;
call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
}
static int free_vm_stack_cache(unsigned int cpu)
{
struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
struct vm_struct *vm_stack = cached_vm_stacks[i];
if (!vm_stack)
continue;
vfree(vm_stack->addr);
cached_vm_stacks[i] = NULL;
}
return 0;
}
static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
int i;
int ret;
int nr_charged = 0;
BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
if (ret)
goto err;
nr_charged++;
}
return 0;
err:
for (i = 0; i < nr_charged; i++)
memcg_kmem_uncharge_page(vm->pages[i], 0);
return ret;
}
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
struct vm_struct *vm;
void *stack;
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
struct vm_struct *s;
s = this_cpu_xchg(cached_stacks[i], NULL);
if (!s)
continue;
/* Reset stack metadata. */
kasan_unpoison_range(s->addr, THREAD_SIZE);
stack = kasan_reset_tag(s->addr);
/* Clear stale pointers from reused stack. */
memset(stack, 0, THREAD_SIZE);
if (memcg_charge_kernel_stack(s)) {
vfree(s->addr);
return -ENOMEM;
}
tsk->stack_vm_area = s;
tsk->stack = stack;
return 0;
}
/*
* Allocated stacks are cached and later reused by new threads,
* so memcg accounting is performed manually on assigning/releasing
* stacks to tasks. Drop __GFP_ACCOUNT.
*/
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
if (!stack)
return -ENOMEM;
vm = find_vm_area(stack);
if (memcg_charge_kernel_stack(vm)) {
vfree(stack);
return -ENOMEM;
}
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
tsk->stack_vm_area = vm;
stack = kasan_reset_tag(stack);
tsk->stack = stack;
return 0;
}
static void free_thread_stack(struct task_struct *tsk)
{
if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
thread_stack_delayed_free(tsk);
tsk->stack = NULL;
tsk->stack_vm_area = NULL;
}
# else /* !CONFIG_VMAP_STACK */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
__free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
}
static void thread_stack_delayed_free(struct task_struct *tsk)
{
struct rcu_head *rh = tsk->stack;
call_rcu(rh, thread_stack_free_rcu);
}
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
if (likely(page)) {
tsk->stack = kasan_reset_tag(page_address(page));
return 0;
}
return -ENOMEM;
}
static void free_thread_stack(struct task_struct *tsk)
{
thread_stack_delayed_free(tsk);
tsk->stack = NULL;
}
# endif /* CONFIG_VMAP_STACK */
# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
static struct kmem_cache *thread_stack_cache;
static void thread_stack_free_rcu(struct rcu_head *rh)
{
kmem_cache_free(thread_stack_cache, rh);
}
static void thread_stack_delayed_free(struct task_struct *tsk)
{
struct rcu_head *rh = tsk->stack;
call_rcu(rh, thread_stack_free_rcu);
}
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
stack = kasan_reset_tag(stack);
tsk->stack = stack;
return stack ? 0 : -ENOMEM;
}
static void free_thread_stack(struct task_struct *tsk)
{
thread_stack_delayed_free(tsk);
tsk->stack = NULL;
}
void thread_stack_cache_init(void)
{
thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
THREAD_SIZE, THREAD_SIZE, 0, 0,
THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;
/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;
/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
#ifdef CONFIG_PER_VMA_LOCK
/* SLAB cache for vm_area_struct.lock */
static struct kmem_cache *vma_lock_cachep;
static bool vma_lock_alloc(struct vm_area_struct *vma)
{
vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
if (!vma->vm_lock)
return false;
init_rwsem(&vma->vm_lock->lock);
vma->vm_lock_seq = -1;
return true;
}
static inline void vma_lock_free(struct vm_area_struct *vma)
{
kmem_cache_free(vma_lock_cachep, vma->vm_lock);
}
#else /* CONFIG_PER_VMA_LOCK */
static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
static inline void vma_lock_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_PER_VMA_LOCK */
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!vma)
return NULL;
vma_init(vma, mm);
if (!vma_lock_alloc(vma)) {
kmem_cache_free(vm_area_cachep, vma);
return NULL;
}
return vma;
}
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
return NULL;
ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
/*
* orig->shared.rb may be modified concurrently, but the clone
* will be reinitialized.
*/
data_race(memcpy(new, orig, sizeof(*new)));
if (!vma_lock_alloc(new)) {
kmem_cache_free(vm_area_cachep, new);
return NULL;
}
INIT_LIST_HEAD(&new->anon_vma_chain);
vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
return new;
}
void __vm_area_free(struct vm_area_struct *vma)
{
vma_numab_state_free(vma);
free_anon_vma_name(vma);
vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma);
}
#ifdef CONFIG_PER_VMA_LOCK
static void vm_area_free_rcu_cb(struct rcu_head *head)
{
struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
vm_rcu);
/* The vma should not be locked while being destroyed. */
VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
__vm_area_free(vma);
}
#endif
void vm_area_free(struct vm_area_struct *vma)
{
#ifdef CONFIG_PER_VMA_LOCK
call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
#else
__vm_area_free(vma);
#endif
}
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
struct vm_struct *vm = task_stack_vm_area(tsk);
int i;
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
void *stack = task_stack_page(tsk);
/* All stack pages are in the same node. */
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
}
}
void exit_task_stack_account(struct task_struct *tsk)
{
account_kernel_stack(tsk, -1);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
struct vm_struct *vm;
int i;
vm = task_stack_vm_area(tsk);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
memcg_kmem_uncharge_page(vm->pages[i], 0);
}
}
static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
free_thread_stack(tsk);
}
#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
if (refcount_dec_and_test(&tsk->stack_refcount))
release_task_stack(tsk);
}
#endif
void free_task(struct task_struct *tsk)
{
#ifdef CONFIG_SECCOMP
WARN_ON_ONCE(tsk->seccomp.filter);
#endif
release_user_cpus_ptr(tsk);
scs_release(tsk);
#ifndef CONFIG_THREAD_INFO_IN_TASK
/*
* The task is finally done with both the stack and thread_info,
* so free both.
*/
release_task_stack(tsk);
#else
/*
* If the task had a separate stack allocation, it should be gone
* by now.
*/
WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
bpf_task_storage_free(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
struct file *exe_file;
exe_file = get_mm_exe_file(oldmm);
RCU_INIT_POINTER(mm->exe_file, exe_file);
/*
* We depend on the oldmm having properly denied write access to the
* exe_file already.
*/
if (exe_file && deny_write_access(exe_file))
pr_warn_once("deny_write_access() failed in %s\n", __func__);
}
#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp;
int retval;
unsigned long charge = 0;
LIST_HEAD(uf);
VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
retval = -EINTR;
goto fail_uprobe_end;
}
flush_cache_dup_mm(oldmm);
uprobe_dup_mmap(oldmm, mm);
/*
* Not linked in yet - no deadlock potential:
*/
mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
/* No ordering required: file already has been exposed. */
dup_mm_exe_file(mm, oldmm);
mm->total_vm = oldmm->total_vm;
mm->data_vm = oldmm->data_vm;
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
khugepaged_fork(mm, oldmm);
/* Use __mt_dup() to efficiently build an identical maple tree. */
retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
if (unlikely(retval))
goto out;
mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(vmi, mpnt) {
struct file *file;
vma_start_write(mpnt);
if (mpnt->vm_flags & VM_DONTCOPY) {
retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
mpnt->vm_end, GFP_KERNEL);
if (retval)
goto loop_out;
vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
charge = 0;
/*
* Don't duplicate many vmas if we've been oom-killed (for
* example)
*/
if (fatal_signal_pending(current)) {
retval = -EINTR;
goto loop_out;
}
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
goto fail_nomem;
charge = len;
}
tmp = vm_area_dup(mpnt);
if (!tmp)
goto fail_nomem;
retval = vma_dup_policy(mpnt, tmp);
if (retval)
goto fail_nomem_policy;
tmp->vm_mm = mm;
retval = dup_userfaultfd(tmp, &uf);
if (retval)
goto fail_nomem_anon_vma_fork;
if (tmp->vm_flags & VM_WIPEONFORK) {
/*
* VM_WIPEONFORK gets a clean slate in the child.
* Don't prepare anon_vma until fault since we don't
* copy page for current vma.
*/
tmp->anon_vma = NULL;
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
vm_flags_clear(tmp, VM_LOCKED_MASK);
/*
* Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
hugetlb_dup_vma_private(tmp);
/*
* Link the vma into the MT. After using __mt_dup(), memory
* allocation is not necessary here, so it cannot fail.
*/
vma_iter_bulk_store(&vmi, tmp);
mm->map_count++;
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
file = tmp->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
get_file(file);
i_mmap_lock_write(mapping);
if (vma_is_shared_maywrite(tmp))
mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
vma_interval_tree_insert_after(tmp, mpnt,
&mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
if (!(tmp->vm_flags & VM_WIPEONFORK))
retval = copy_page_range(tmp, mpnt);
if (retval) {
mpnt = vma_next(&vmi);
goto loop_out;
}
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
} else if (mpnt) {
/*
* The entire maple tree has already been duplicated. If the
* mmap duplication fails, mark the failure point with
* XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
* stop releasing VMAs that have not been duplicated after this
* point.
*/
mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
mas_store(&vmi.mas, XA_ZERO_ENTRY);
}
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
mmap_write_unlock(oldmm);
dup_userfaultfd_complete(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
vm_area_free(tmp);
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
goto loop_out;
}
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
mm->pgd = pgd_alloc(mm);
if (unlikely(!mm->pgd))
return -ENOMEM;
return 0;
}
static inline void mm_free_pgd(struct mm_struct *mm)
{
pgd_free(mm, mm->pgd);
}
#else
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
mmap_write_lock(oldmm);
dup_mm_exe_file(mm, oldmm);
mmap_write_unlock(oldmm);
return 0;
}
#define mm_alloc_pgd(mm) (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
static void check_mm(struct mm_struct *mm)
{
int i;
BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
"Please make sure 'struct resident_page_types[]' is updated as well");
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
}
if (mm_pgtables_bytes(mm))
pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
mm_pgtables_bytes(mm));
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
static void do_check_lazy_tlb(void *arg)
{
struct mm_struct *mm = arg;
WARN_ON_ONCE(current->active_mm == mm);
}
static void do_shoot_lazy_tlb(void *arg)
{
struct mm_struct *mm = arg;
if (current->active_mm == mm) {
WARN_ON_ONCE(current->mm);
current->active_mm = &init_mm;
switch_mm(mm, &init_mm, current);
}
}
static void cleanup_lazy_tlbs(struct mm_struct *mm)
{
if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
/*
* In this case, lazy tlb mms are refounted and would not reach
* __mmdrop until all CPUs have switched away and mmdrop()ed.
*/
return;
}
/*
* Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
* requires lazy mm users to switch to another mm when the refcount
* drops to zero, before the mm is freed. This requires IPIs here to
* switch kernel threads to init_mm.
*
* archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
* switch with the final userspace teardown TLB flush which leaves the
* mm lazy on this CPU but no others, reducing the need for additional
* IPIs here. There are cases where a final IPI is still required here,
* such as the final mmdrop being performed on a different CPU than the
* one exiting, or kernel threads using the mm when userspace exits.
*
* IPI overheads have not found to be expensive, but they could be
* reduced in a number of possible ways, for example (roughly
* increasing order of complexity):
* - The last lazy reference created by exit_mm() could instead switch
* to init_mm, however it's probable this will run on the same CPU
* immediately afterwards, so this may not reduce IPIs much.
* - A batch of mms requiring IPIs could be gathered and freed at once.
* - CPUs store active_mm where it can be remotely checked without a
* lock, to filter out false-positives in the cpumask.
* - After mm_users or mm_count reaches zero, switching away from the
* mm could clear mm_cpumask to reduce some IPIs, perhaps together
* with some batching or delaying of the final IPIs.
* - A delayed freeing and RCU-like quiescing sequence based on mm
* switching to avoid IPIs completely.
*/
on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
}
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
* mmput. Free the page directory and the mm.
*/
void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
/* Ensure no CPUs are using this as their lazy tlb mm */
cleanup_lazy_tlbs(mm);
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
mm_destroy_cid(mm);
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
static void mmdrop_async_fn(struct work_struct *work)
{
struct mm_struct *mm;
mm = container_of(work, struct mm_struct, async_put_work);
__mmdrop(mm);
}
static void mmdrop_async(struct mm_struct *mm)
{
if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
schedule_work(&mm->async_put_work);
}
}
static inline void free_signal_struct(struct signal_struct *sig)
{
taskstats_tgid_free(sig);
sched_autogroup_exit(sig);
/*
* __mmdrop is not safe to call from softirq context on x86 due to
* pgd_dtor so postpone it to the async context
*/
if (sig->oom_mm)
mmdrop_async(sig->oom_mm);
kmem_cache_free(signal_cachep, sig);
}
static inline void put_signal_struct(struct signal_struct *sig)
{
if (refcount_dec_and_test(&sig->sigcnt))
free_signal_struct(sig);
}
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
sched_core_free(tsk);
free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);
void __put_task_struct_rcu_cb(struct rcu_head *rhp)
{
struct task_struct *task = container_of(rhp, struct task_struct, rcu);
__put_task_struct(task);
}
EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
void __init __weak arch_task_cache_init(void) { }
/*
* set_max_threads
*/
static void set_max_threads(unsigned int max_threads_suggested)
{
u64 threads;
unsigned long nr_pages = totalram_pages();
/*
* The number of threads shall be limited such that the thread
* structures may only consume a small part of the available memory.
*/
if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
threads = MAX_THREADS;
else
threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
(u64) THREAD_SIZE * 8UL);
if (threads > max_threads_suggested)
threads = max_threads_suggested;
max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
arch_thread_struct_whitelist(offset, size);
/*
* Handle zero-sized whitelist or empty thread_struct, otherwise
* adjust offset to position of thread_struct in task_struct.
*/
if (unlikely(*size == 0))
*offset = 0;
else
*offset += offsetof(struct task_struct, thread);
}
void __init fork_init(void)
{
int i;
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN 0
#endif
int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
unsigned long useroffset, usersize;
/* create a slab on which task_structs can be allocated */
task_struct_whitelist(&useroffset, &usersize);
task_struct_cachep = kmem_cache_create_usercopy("task_struct",
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
/* do the arch specific task caches init */
arch_task_cache_init();
set_max_threads(MAX_THREADS);
init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
for (i = 0; i < UCOUNT_COUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2;
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
NULL, free_vm_stack_cache);
#endif
scs_init();
lockdep_init_task(&init_task);
uprobes_init();
}
int __weak arch_dup_task_struct(struct task_struct *dst,
struct task_struct *src)
{
*dst = *src;
return 0;
}
void set_task_stack_end_magic(struct task_struct *tsk)
{
unsigned long *stackend;
stackend = end_of_stack(tsk);
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
int err;
if (node == NUMA_NO_NODE)
node = tsk_fork_get_node(orig);
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
err = arch_dup_task_struct(tsk, orig);
if (err)
goto free_tsk;
err = alloc_thread_stack_node(tsk, node);
if (err)
goto free_tsk;
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
account_kernel_stack(tsk, 1);
err = scs_prepare(tsk, node);
if (err)
goto free_stack;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
* the sighand lock in case orig has changed between now and
* then. Until then, filter must be NULL to avoid messing up
* the usage counts on the error path calling free_task.
*/
tsk->seccomp.filter = NULL;
#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
clear_syscall_work_syscall_user_dispatch(tsk);
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
dup_user_cpus_ptr(tsk, orig, node);
/*
* One for the user space visible state that goes away when reaped.
* One for the scheduler.
*/
refcount_set(&tsk->rcu_users, 2);
/* One for the rcu users */
refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
tsk->worker_private = NULL;
kcov_task_init(tsk);
kmsan_task_create(tsk);
kmap_local_fork(tsk);
#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
#endif
#ifdef CONFIG_BLK_CGROUP
tsk->throttle_disk = NULL;
tsk->use_memdelay = 0;
#endif
#ifdef CONFIG_ARCH_HAS_CPU_PASID
tsk->pasid_activated = 0;
#endif
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
#ifdef CONFIG_CPU_SUP_INTEL
tsk->reported_split_lock = 0;
#endif
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
tsk->migrate_from_cpu = -1;
#endif
return tsk;
free_stack:
exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
return NULL;
}
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
static int __init coredump_filter_setup(char *s)
{
default_dump_filter =
(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
MMF_DUMP_FILTER_MASK;
return 1;
}
__setup("coredump_filter=", coredump_filter_setup);
#include <linux/init_task.h>
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
spin_lock_init(&mm->ioctx_lock);
mm->ioctx_table = NULL;
#endif
}
static __always_inline void mm_clear_owner(struct mm_struct *mm,
struct task_struct *p)
{
#ifdef CONFIG_MEMCG
if (mm->owner == p)
WRITE_ONCE(mm->owner, NULL);
#endif
}
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
mm->owner = p;
#endif
}
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm->uprobes_state.xol_area = NULL;
#endif
}
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
#ifdef CONFIG_PER_VMA_LOCK
mm->mm_lock_seq = 0;
#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
atomic64_set(&mm->pinned_vm, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
spin_lock_init(&mm->arg_lock);
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
mm_pasid_init(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
hugetlb_count_init(mm);
if (current->mm) {
mm->flags = mmf_init_flags(current->mm->flags);
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
mm->flags = default_dump_filter;
mm->def_flags = 0;
}
if (mm_alloc_pgd(mm))
goto fail_nopgd;
if (init_new_context(p, mm))
goto fail_nocontext;
if (mm_alloc_cid(mm))
goto fail_cid;
if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
NR_MM_COUNTERS))
goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
fail_pcpu:
mm_destroy_cid(mm);
fail_cid:
destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
free_mm(mm);
return NULL;
}
/*
* Allocate and initialize an mm_struct.
*/
struct mm_struct *mm_alloc(void)
{
struct mm_struct *mm;
mm = allocate_mm();
if (!mm)
return NULL;
memset(mm, 0, sizeof(*mm));
return mm_init(mm, current, current_user_ns());
}
static inline void __mmput(struct mm_struct *mm)
{
VM_BUG_ON(atomic_read(&mm->mm_users));
uprobe_clear_state(mm);
exit_aio(mm);
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
mm_put_huge_zero_folio(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
list_del(&mm->mmlist);
spin_unlock(&mmlist_lock);
}
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
mmdrop(mm);
}
/*
* Decrement the use count and release all resources for an mm.
*/
void mmput(struct mm_struct *mm)
{
might_sleep();
if (atomic_dec_and_test(&mm->mm_users))
__mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);
#ifdef CONFIG_MMU
static void mmput_async_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct,
async_put_work);
__mmput(mm);
}
void mmput_async(struct mm_struct *mm)
{
if (atomic_dec_and_test(&mm->mm_users)) {
INIT_WORK(&mm->async_put_work, mmput_async_fn);
schedule_work(&mm->async_put_work);
}
}
EXPORT_SYMBOL_GPL(mmput_async);
#endif
/**
* set_mm_exe_file - change a reference to the mm's executable file
* @mm: The mm to change.
* @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
* invocations: in mmput() nobody alive left, in execve it happens before
* the new mm is made visible to anyone.
*
* Can only fail if new_exe_file != NULL.
*/
int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
struct file *old_exe_file;
/*
* It is safe to dereference the exe_file without RCU as
* this function is only called if nobody else can access
* this mm -- see comment above for justification.
*/
old_exe_file = rcu_dereference_raw(mm->exe_file);
if (new_exe_file) {
/*
* We expect the caller (i.e., sys_execve) to already denied
* write access, so this is unlikely to fail.
*/
if (unlikely(deny_write_access(new_exe_file)))
return -EACCES;
get_file(new_exe_file);
}
rcu_assign_pointer(mm->exe_file, new_exe_file);
if (old_exe_file) {
allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;
}
/**
* replace_mm_exe_file - replace a reference to the mm's executable file
* @mm: The mm to change.
* @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
*/
int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
struct vm_area_struct *vma;
struct file *old_exe_file;
int ret = 0;
/* Forbid mm->exe_file change if old file still mapped. */
old_exe_file = get_mm_exe_file(mm);
if (old_exe_file) {
VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (path_equal(&vma->vm_file->f_path,
&old_exe_file->f_path)) {
ret = -EBUSY;
break;
}
}
mmap_read_unlock(mm);
fput(old_exe_file);
if (ret)
return ret;
}
ret = deny_write_access(new_exe_file);
if (ret)
return -EACCES;
get_file(new_exe_file);
/* set the new file */
mmap_write_lock(mm);
old_exe_file = rcu_dereference_raw(mm->exe_file);
rcu_assign_pointer(mm->exe_file, new_exe_file);
mmap_write_unlock(mm);
if (old_exe_file) {
allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;
}
/**
* get_mm_exe_file - acquire a reference to the mm's executable file
* @mm: The mm of interest.
*
* Returns %NULL if mm has no associated executable file.
* User must release file via fput().
*/
struct file *get_mm_exe_file(struct mm_struct *mm)
{
struct file *exe_file;
rcu_read_lock();
exe_file = get_file_rcu(&mm->exe_file);
rcu_read_unlock();
return exe_file;
}
/**
* get_task_exe_file - acquire a reference to the task's executable file
* @task: The task.
*
* Returns %NULL if task's mm (if any) has no associated executable file or
* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
* User must release file via fput().
*/
struct file *get_task_exe_file(struct task_struct *task)
{
struct file *exe_file = NULL;
struct mm_struct *mm;
task_lock(task);
mm = task->mm;
if (mm) {
if (!(task->flags & PF_KTHREAD))
exe_file = get_mm_exe_file(mm);
}
task_unlock(task);
return exe_file;
}
/**
* get_task_mm - acquire a reference to the task's mm
* @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
* to do its AIO) is not set and if so returns a reference to it, after
* bumping up the use count. User must release the mm via mmput()
* after use. Typically used by /proc and ptrace.
*/
struct mm_struct *get_task_mm(struct task_struct *task)
{
struct mm_struct *mm;
task_lock(task);
mm = task->mm;
if (mm) {
if (task->flags & PF_KTHREAD)
mm = NULL;
else
mmget(mm);
}
task_unlock(task);
return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
struct mm_struct *mm;
int err;
err = down_read_killable(&task->signal->exec_update_lock);
if (err)
return ERR_PTR(err);
mm = get_task_mm(task);
if (mm && mm != current->mm &&
!ptrace_may_access(task, mode)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
up_read(&task->signal->exec_update_lock);
return mm;
}
static void complete_vfork_done(struct task_struct *tsk)
{
struct completion *vfork;
task_lock(tsk);
vfork = tsk->vfork_done;
if (likely(vfork)) {
tsk->vfork_done = NULL;
complete(vfork);
}
task_unlock(tsk);
}
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
cgroup_enter_frozen();
killed = wait_for_completion_state(vfork, state);
cgroup_leave_frozen(false);
if (killed) {
task_lock(child);
child->vfork_done = NULL;
task_unlock(child);
}
put_task_struct(child);
return killed;
}
/* Please note the differences between mmput and mm_release.
* mmput is called whenever we stop holding onto a mm_struct,
* error success whatever.
*
* mm_release is called after a mm_struct has been removed
* from the current process.
*
* This difference is important for error handling, when we
* only half set up a mm_struct for a new process and need to restore
* the old one. Because we mmput the new mm_struct before
* restoring the old one. . .
* Eric Biederman 10 January 1998
*/
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
uprobe_free_utask(tsk);
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
/*
* Signal userspace if we're not exiting with a core dump
* because we want to leave the value intact for debugging
* purposes.
*/
if (tsk->clear_child_tid) {
if (atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
*/
put_user(0, tsk->clear_child_tid);
do_futex(tsk->clear_child_tid, FUTEX_WAKE,
1, NULL, NULL, 0, 0);
}
tsk->clear_child_tid = NULL;
}
/*
* All done, finally we can wake up parent and return this mm to him.
* Also kthread_stop() uses this completion for synchronization.
*/
if (tsk->vfork_done)
complete_vfork_done(tsk);
}
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
futex_exit_release(tsk);
mm_release(tsk, mm);
}
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
futex_exec_release(tsk);
mm_release(tsk, mm);
}
/**
* dup_mm() - duplicates an existing mm structure
* @tsk: the task_struct with which the new mm will be associated.
* @oldmm: the mm to duplicate.
*
* Allocates a new mm structure and duplicates the provided @oldmm structure
* content into it.
*
* Return: the duplicated mm or NULL on failure.
*/
static struct mm_struct *dup_mm(struct task_struct *tsk,
struct mm_struct *oldmm)
{
struct mm_struct *mm;
int err;
mm = allocate_mm();
if (!mm)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
if (mm->binfmt && !try_module_get(mm->binfmt->module))
goto free_pt;
return mm;
free_pt:
/* don't put binfmt in mmput, we haven't got module yet */
mm->binfmt = NULL;
mm_init_owner(mm, NULL);
mmput(mm);
fail_nomem:
return NULL;
}
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
tsk->min_flt = tsk->maj_flt = 0;
tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
tsk->last_switch_time = 0;
#endif
tsk->mm = NULL;
tsk->active_mm = NULL;
/*
* Are we cloning a kernel thread?
*
* We need to steal a active VM for that..
*/
oldmm = current->mm;
if (!oldmm)
return 0;
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
} else {
mm = dup_mm(tsk, current->mm);
if (!mm)
return -ENOMEM;
}
tsk->mm = mm;
tsk->active_mm = mm;
sched_mm_cid_fork(tsk);
return 0;
}
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
spin_lock(&fs->lock);
/* "users" and "in_exec" locked for check_unsafe_exec() */
if (fs->in_exec) {
spin_unlock(&fs->lock);
return -EAGAIN;
}
fs->users++;
spin_unlock(&fs->lock);
return 0;
}
tsk->fs = copy_fs_struct(fs);
if (!tsk->fs)
return -ENOMEM;
return 0;
}
static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
int no_files)
{
struct files_struct *oldf, *newf;
int error = 0;
/*
* A background process may not have any files ...
*/
oldf = current->files;
if (!oldf)
goto out;
if (no_files) {
tsk->files = NULL;
goto out;
}
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
}
newf = dup_fd(oldf, NR_OPEN_MAX, &error);
if (!newf)
goto out;
tsk->files = newf;
error = 0;
out:
return error;
}
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
if (clone_flags & CLONE_SIGHAND) {
refcount_inc(¤t->sighand->count);
return 0;
}
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
RCU_INIT_POINTER(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
refcount_set(&sig->count, 1);
spin_lock_irq(¤t->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
spin_unlock_irq(¤t->sighand->siglock);
/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
if (clone_flags & CLONE_CLEAR_SIGHAND)
flush_signal_handlers(tsk, 0);
return 0;
}
void __cleanup_sighand(struct sighand_struct *sighand)
{
if (refcount_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
/*
* sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
* without an RCU grace period, see __lock_task_sighand().
*/
kmem_cache_free(sighand_cachep, sighand);
}
}
/*
* Initialize POSIX timer handling for a thread group.
*/
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
struct posix_cputimers *pct = &sig->posix_cputimers;
unsigned long cpu_limit;
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
posix_cputimers_group_init(pct, cpu_limit);
}
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
if (clone_flags & CLONE_THREAD)
return 0;
sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
tsk->signal = sig;
if (!sig)
return -ENOMEM;
sig->nr_threads = 1;
sig->quick_threads = 1;
atomic_set(&sig->live, 1);
refcount_set(&sig->sigcnt, 1);
/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
init_waitqueue_head(&sig->wait_chldexit);
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_HLIST_HEAD(&sig->multiprocess);
seqlock_init(&sig->stats_lock);
prev_cputime_init(&sig->prev_cputime);
#ifdef CONFIG_POSIX_TIMERS
INIT_LIST_HEAD(&sig->posix_timers);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
#endif
task_lock(current->group_leader);
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
task_unlock(current->group_leader);
posix_cpu_timers_init_group(sig);
tty_audit_fork(sig);
sched_autogroup_fork(sig);
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_init(&sig->cred_guard_mutex);
init_rwsem(&sig->exec_update_lock);
return 0;
}
static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
/*
* Must be called with sighand->lock held, which is common to
* all threads in the group. Holding cred_guard_mutex is not
* needed because this new task is not yet running and cannot
* be racing exec.
*/
assert_spin_locked(¤t->sighand->siglock);
/* Ref-count the new filter user, and assign it. */
get_seccomp_filter(current);
p->seccomp = current->seccomp;
/*
* Explicitly enable no_new_privs here in case it got set
* between the task_struct being duplicated and holding the
* sighand lock. The seccomp state and nnp must be in sync.
*/
if (task_no_new_privs(current))
task_set_no_new_privs(p);
/*
* If the parent gained a seccomp mode after copying thread
* flags and between before we held the sighand lock, we have
* to manually enable the seccomp thread flag here.
*/
if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
set_task_syscall_work(p, SECCOMP);
#endif
}
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
current->clear_child_tid = tidptr;
return task_pid_vnr(current);
}
static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
p->pi_waiters = RB_ROOT_CACHED;
p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
}
static inline void init_task_pid_links(struct task_struct *task)
{
enum pid_type type;
for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
INIT_HLIST_NODE(&task->pid_links[type]);
}
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
if (type == PIDTYPE_PID)
task->thread_pid = pid;
else
task->signal->pids[type] = pid;
}
static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
p->rcu_read_lock_nesting = 0;
p->rcu_read_unlock_special.s = 0;
p->rcu_blocked_node = NULL;
INIT_LIST_HEAD(&p->rcu_node_entry);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
p->rcu_tasks_holdout = false;
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
p->rcu_tasks_idle_cpu = -1;
INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
p->trc_reader_nesting = 0;
p->trc_reader_special.s = 0;
INIT_LIST_HEAD(&p->trc_holdout_list);
INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
/**
* __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
* @ret: Where to return the file for the pidfd.
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
*
* The helper doesn't perform checks on @pid which makes it useful for pidfds
* created via CLONE_PIDFD where @pid has no task attached when the pidfd and
* pidfd file are prepared.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
* order to install the pidfd into its file descriptor table or they must use
* put_unused_fd() and fput() on the returned pidfd and pidfd file
* respectively.
*
* This function is useful when a pidfd must already be reserved but there
* might still be points of failure afterwards and the caller wants to ensure
* that no pidfd is leaked into its file descriptor table.
*
* Return: On success, a reserved pidfd is returned from the function and a new
* pidfd file is returned in the last argument to the function. On
* error, a negative error code is returned from the function and the
* last argument remains unchanged.
*/
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
int pidfd;
struct file *pidfd_file;
pidfd = get_unused_fd_flags(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
if (IS_ERR(pidfd_file)) {
put_unused_fd(pidfd);
return PTR_ERR(pidfd_file);
}
/*
* anon_inode_getfile() ignores everything outside of the
* O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
*/
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
*ret = pidfd_file;
return pidfd;
}
/**
* pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
* @ret: Where to return the pidfd.
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
*
* The helper verifies that @pid is still in use, without PIDFD_THREAD the
* task identified by @pid must be a thread-group leader.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
* order to install the pidfd into its file descriptor table or they must use
* put_unused_fd() and fput() on the returned pidfd and pidfd file
* respectively.
*
* This function is useful when a pidfd must already be reserved but there
* might still be points of failure afterwards and the caller wants to ensure
* that no pidfd is leaked into its file descriptor table.
*
* Return: On success, a reserved pidfd is returned from the function and a new
* pidfd file is returned in the last argument to the function. On
* error, a negative error code is returned from the function and the
* last argument remains unchanged.
*/
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
bool thread = flags & PIDFD_THREAD;
if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
return -EINVAL;
return __pidfd_prepare(pid, flags, ret);
}
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
free_task(tsk);
}
static __always_inline void delayed_free_task(struct task_struct *tsk)
{
if (IS_ENABLED(CONFIG_MEMCG))
call_rcu(&tsk->rcu, __delayed_free_task);
else
free_task(tsk);
}
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
/* Skip if kernel thread */
if (!tsk->mm)
return;
/* Skip if spawning a thread or using vfork */
if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
return;
/* We need to synchronize with __set_oom_adj */
mutex_lock(&oom_adj_mutex);
set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
/* Update the values in case they were changed after copy_signal */
tsk->signal->oom_score_adj = current->signal->oom_score_adj;
tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_unlock(&oom_adj_mutex);
}
#ifdef CONFIG_RV
static void rv_task_fork(struct task_struct *p)
{
int i;
for (i = 0; i < RV_PER_TASK_MONITORS; i++)
p->rv[i].da_mon.monitoring = false;
}
#else
#define rv_task_fork(p) do {} while (0)
#endif
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
*
* It copies the registers, and all the appropriate
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
struct kernel_clone_args *args)
{
int pidfd = -1, retval;
struct task_struct *p;
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
const u64 clone_flags = args->flags;
struct nsproxy *nsp = current->nsproxy;
/*
* Don't allow sharing the root directory with processes in a different
* namespace
*/
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
/*
* Siblings of global init remain as zombies on exit since they are
* not reaped by their parent (swapper). To solve this and to avoid
* multi-rooted process trees, prevent global and container-inits
* from creating siblings.
*/
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
/*
* If the new process will be in a different pid or user namespace
* do not allow it to share a thread group with the forking task.
*/
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) != nsp->pid_ns_for_children))
return ERR_PTR(-EINVAL);
}
if (clone_flags & CLONE_PIDFD) {
/*
* - CLONE_DETACHED is blocked so that we can potentially
* reuse it later for CLONE_PIDFD.
*/
if (clone_flags & CLONE_DETACHED)
return ERR_PTR(-EINVAL);
}
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
* processes that happen during the fork and delay them so that
* they appear to happen after the fork.
*/
sigemptyset(&delayed.signal);
INIT_HLIST_NODE(&delayed.node);
spin_lock_irq(¤t->sighand->siglock);
if (!(clone_flags & CLONE_THREAD))
hlist_add_head(&delayed.node, ¤t->signal->multiprocess);
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
retval = -ERESTARTNOINTR;
if (task_sigpending(current))
goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
if (args->user_worker) {
/*
* Mark us a user worker, and block any signal that isn't
* fatal or STOP
*/
p->flags |= PF_USER_WORKER;
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
if (args->io_thread)
p->flags |= PF_IO_WORKER;
if (args->name)
strscpy_pad(p->comm, args->name, sizeof(p->comm));
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
ftrace_graph_init_task(p);
rt_mutex_init_task(p);
lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = copy_creds(p, clone_flags);
if (retval < 0)
goto bad_fork_free;
retval = -EAGAIN;
if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_cleanup_count;
}
current->flags &= ~PF_NPROC_EXCEEDED;
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
retval = -EAGAIN;
if (data_race(nr_threads >= max_threads))
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
p->utimescaled = p->stimescaled = 0;
#endif
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqcount_init(&p->vtime.seqcount);
p->vtime.starttime = 0;
p->vtime.state = VTIME_INACTIVE;
#endif
#ifdef CONFIG_IO_URING
p->io_uring = NULL;
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
#ifdef CONFIG_PSI
p->psi_flags = 0;
#endif
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
posix_cputimers_init(&p->posix_cputimers);
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
if (args->kthread) {
if (!set_kthread_struct(p))
goto bad_fork_cleanup_delayacct;
}
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
goto bad_fork_cleanup_delayacct;
}
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
memset(&p->irqtrace, 0, sizeof(p->irqtrace));
p->irqtrace.hardirq_disable_ip = _THIS_IP_;
p->irqtrace.softirq_enable_ip = _THIS_IP_;
p->softirqs_enabled = 1;
p->softirq_context = 0;
#endif
p->pagefault_disabled = 0;
#ifdef CONFIG_LOCKDEP
lockdep_init_task(p);
#endif
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
#ifdef CONFIG_BPF_SYSCALL
RCU_INIT_POINTER(p->bpf_storage, NULL);
p->bpf_ctx = NULL;
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p, clone_flags);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
retval = security_task_alloc(p, clone_flags);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p);
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p);
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p);
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread(p, args);
if (retval)
goto bad_fork_cleanup_io;
stackleak_task_init(p);
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
args->set_tid_size);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
}
}
/*
* This has to happen after we've potentially unshared the file
* descriptor table (so that the pidfd doesn't leak into the child
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
/* Note that no task has been attached to @pid yet. */
retval = __pidfd_prepare(pid, flags, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
}
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
futex_init_task(p);
/*
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
sas_ss_reset(p);
/*
* Syscall tracing and stepping should be turned off in the
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
clear_task_syscall_work(p, SYSCALL_EMU);
#endif
clear_tsk_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
p->group_leader = p;
p->tgid = p->pid;
}
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
p->task_works = NULL;
clear_posix_cputimers_work(p);
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
#endif
#ifdef CONFIG_RETHOOK
p->rethooks.first = NULL;
#endif
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted that the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
retval = cgroup_can_fork(p, args);
if (retval)
goto bad_fork_put_pidfd;
/*
* Now that the cgroups are pinned, re-clone the parent cgroup and put
* the new task on the correct runqueue. All this *before* the task
* becomes visible.
*
* This isn't part of ->can_fork() because while the re-cloning is
* cgroup specific, it unconditionally needs to place the task on a
* runqueue.
*/
sched_cgroup_fork(p, args);
/*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
* stalling fork(2) after we recorded the start_time but before it is
* visible to the system.
*/
p->start_time = ktime_get_ns();
p->start_boottime = ktime_get_boottime_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
write_lock_irq(&tasklist_lock);
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
if (clone_flags & CLONE_THREAD)
p->exit_signal = -1;
else
p->exit_signal = current->group_leader->exit_signal;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
p->exit_signal = args->exit_signal;
}
klp_copy_process(p);
sched_core_fork(p);
spin_lock(¤t->sighand->siglock);
rv_task_fork(p);
rseq_fork(p, clone_flags);
/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
/* Let kill terminate clone/fork in the middle */
if (fatal_signal_pending(current)) {
retval = -EINTR;
goto bad_fork_cancel_cgroup;
}
/* No more failure paths after this point. */
/*
* Copy seccomp details explicitly here, in case they were changed
* before holding sighand lock.
*/
copy_seccomp(p);
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
init_task_pid(p, PIDTYPE_TGID, pid);
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
init_task_pid(p, PIDTYPE_SID, task_session(current));
if (is_child_reaper(pid)) {
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
p->signal->shared_pending.signal = delayed.signal;
p->signal->tty = tty_kref_get(current->signal->tty);
/*
* Inherit has_child_subreaper flag under the same
* tasklist_lock with adding child to the process tree
* for propagate_has_child_subreaper optimization.
*/
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_TGID);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
current->signal->quick_threads++;
atomic_inc(¤t->signal->live);
refcount_inc(¤t->signal->sigcnt);
task_join_group_stop(p);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
total_forks++;
hlist_del_init(&delayed.node);
spin_unlock(¤t->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
if (pidfile)
fd_install(pidfd, pidfile);
proc_fork_connector(p);
sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
user_events_fork(p, clone_flags);
copy_oom_score_adj(clone_flags, p);
return p;
bad_fork_cancel_cgroup:
sched_core_free(p);
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
fput(pidfile);
put_unused_fd(pidfd);
}
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_thread:
exit_thread(p);
bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_security:
security_task_free(p);
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
#endif
bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
exit_task_stack_account(p);
put_task_stack(p);
delayed_free_task(p);
fork_out:
spin_lock_irq(¤t->sighand->siglock);
hlist_del_init(&delayed.node);
spin_unlock_irq(¤t->sighand->siglock);
return ERR_PTR(retval);
}
static inline void init_idle_pids(struct task_struct *idle)
{
enum pid_type type;
for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
init_task_pid(idle, type, &init_struct_pid);
}
}
static int idle_dummy(void *dummy)
{
/* This function is never called */
return 0;
}
struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
.flags = CLONE_VM,
.fn = &idle_dummy,
.fn_arg = NULL,
.kthread = 1,
.idle = 1,
};
task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
if (!IS_ERR(task)) {
init_idle_pids(task);
init_idle(task, cpu);
}
return task;
}
/*
* This is like kernel_clone(), but shaved down and tailored to just
* creating io_uring workers. It returns a created task, or an error pointer.
* The returned task is inactive, and the caller must fire it up through
* wake_up_new_task(p). All signals are blocked in the created task.
*/
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
CLONE_IO;
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
.user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
}
/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*
* args->exit_signal is expected to be checked for sanity by the caller.
*/
pid_t kernel_clone(struct kernel_clone_args *args)
{
u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
int trace = 0;
pid_t nr;
/*
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
* field in struct clone_args and it still doesn't make sense to have
* them both point at the same memory location. Performing this check
* here has the advantage that we don't need to have a separate helper
* to check for legacy clone().
*/
if ((clone_flags & CLONE_PIDFD) &&
(clone_flags & CLONE_PARENT_SETTID) &&
(args->pidfd == args->parent_tid))
return -EINVAL;
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if (args->exit_signal != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();
if (IS_ERR(p))
return PTR_ERR(p);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
trace_sched_process_fork(current, p);
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, args->parent_tid);
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}
if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
task_unlock(p);
}
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
put_pid(pid);
return nr;
}
/*
* Create a kernel thread.
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
.name = name,
.kthread = 1,
};
return kernel_clone(&args);
}
/*
* Create a user mode thread.
*/
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
};
return kernel_clone(&args);
}
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
struct kernel_clone_args args = {
.exit_signal = SIGCHLD,
};
return kernel_clone(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
#endif
}
#endif
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
struct kernel_clone_args args = {
.flags = CLONE_VFORK | CLONE_VM,
.exit_signal = SIGCHLD,
};
return kernel_clone(&args);
}
#endif
#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
unsigned long, tls,
int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
int, stack_size,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#endif
{
struct kernel_clone_args args = {
.flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
.pidfd = parent_tidptr,
.child_tid = child_tidptr,
.parent_tid = parent_tidptr,
.exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
.stack = newsp,
.tls = tls,
};
return kernel_clone(&args);
}
#endif
#ifdef __ARCH_WANT_SYS_CLONE3
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
{
int err;
struct clone_args args;
pid_t *kset_tid = kargs->set_tid;
BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
CLONE_ARGS_SIZE_VER0);
BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
CLONE_ARGS_SIZE_VER1);
BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
CLONE_ARGS_SIZE_VER2);
BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
return -EINVAL;
err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
if (err)
return err;
if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
return -EINVAL;
if (unlikely(!args.set_tid && args.set_tid_size > 0))
return -EINVAL;
if (unlikely(args.set_tid && args.set_tid_size == 0))
return -EINVAL;
/*
* Verify that higher 32bits of exit_signal are unset and that
* it is a valid signal
*/
if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
!valid_signal(args.exit_signal)))
return -EINVAL;
if ((args.flags & CLONE_INTO_CGROUP) &&
(args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
return -EINVAL;
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
.child_tid = u64_to_user_ptr(args.child_tid),
.parent_tid = u64_to_user_ptr(args.parent_tid),
.exit_signal = args.exit_signal,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
.cgroup = args.cgroup,
};
if (args.set_tid &&
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
(kargs->set_tid_size * sizeof(pid_t))))
return -EFAULT;
kargs->set_tid = kset_tid;
return 0;
}
/**
* clone3_stack_valid - check and prepare stack
* @kargs: kernel clone args
*
* Verify that the stack arguments userspace gave us are sane.
* In addition, set the stack direction for userspace since it's easy for us to
* determine.
*/
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
if (kargs->stack == 0) {
if (kargs->stack_size > 0)
return false;
} else {
if (kargs->stack_size == 0)
return false;
if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
return false;
#if !defined(CONFIG_STACK_GROWSUP)
kargs->stack += kargs->stack_size;
#endif
}
return true;
}
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
return false;
/*
* - make the CLONE_DETACHED bit reusable for clone3
* - make the CSIGNAL bits reusable for clone3
*/
if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
return false;
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
(CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
return false;
if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
kargs->exit_signal)
return false;
if (!clone3_stack_valid(kargs))
return false;
return true;
}
/**
* sys_clone3 - create a new process with specific properties
* @uargs: argument structure
* @size: size of @uargs
*
* clone3() is the extensible successor to clone()/clone2().
* It takes a struct as argument that is versioned by its size.
*
* Return: On success, a positive PID for the child process.
* On error, a negative errno number.
*/
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
int err;
struct kernel_clone_args kargs;
pid_t set_tid[MAX_PID_NS_LEVEL];
kargs.set_tid = set_tid;
err = copy_clone_args_from_user(&kargs, uargs, size);
if (err)
return err;
if (!clone3_args_valid(&kargs))
return -EINVAL;
return kernel_clone(&kargs);
}
#endif
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
struct task_struct *leader, *parent, *child;
int res;
read_lock(&tasklist_lock);
leader = top = top->group_leader;
down:
for_each_thread(leader, parent) {
list_for_each_entry(child, &parent->children, sibling) {
res = visitor(child, data);
if (res) {
if (res < 0)
goto out;
leader = child;
goto down;
}
up:
;
}
}
if (leader != top) {
child = leader;
parent = child->real_parent;
leader = parent->group_leader;
goto up;
}
out:
read_unlock(&tasklist_lock);
}
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
static void sighand_ctor(void *data)
{
struct sighand_struct *sighand = data;
spin_lock_init(&sighand->siglock);
init_waitqueue_head(&sighand->signalfd_wqh);
}
void __init mm_cache_init(void)
{
unsigned int mm_size;
/*
* The mm_cpumask is located at the end of mm_struct, and is
* dynamically sized based on the maximum CPU number this system
* can have, taking hotplug into account (nr_cpu_ids).
*/
mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
mm_cachep = kmem_cache_create_usercopy("mm_struct",
mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
offsetof(struct mm_struct, saved_auxv),
sizeof_field(struct mm_struct, saved_auxv),
NULL);
}
void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
#ifdef CONFIG_PER_VMA_LOCK
vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
#endif
mmap_init();
nsproxy_cache_init();
}
/*
* Check constraints on flags passed to the unshare system call.
*/
static int check_unshare_flags(unsigned long unshare_flags)
{
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
CLONE_NEWTIME))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
* to unshare. Note that unsharing the address space or the
* signal handlers also need to unshare the signal queues (aka
* CLONE_THREAD).
*/
if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
if (!thread_group_empty(current))
return -EINVAL;
}
if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
if (refcount_read(¤t->sighand->count) > 1)
return -EINVAL;
}
if (unshare_flags & CLONE_VM) {
if (!current_is_single_threaded())
return -EINVAL;
}
return 0;
}
/*
* Unshare the filesystem structure if it is being shared
*/
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
struct fs_struct *fs = current->fs;
if (!(unshare_flags & CLONE_FS) || !fs)
return 0;
/* don't need lock here; in the worst case we'll do useless copy */
if (fs->users == 1)
return 0;
*new_fsp = copy_fs_struct(fs);
if (!*new_fsp)
return -ENOMEM;
return 0;
}
/*
* Unshare file descriptor table if it is being shared
*/
int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
struct files_struct **new_fdp)
{
struct files_struct *fd = current->files;
int error = 0;
if ((unshare_flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) {
*new_fdp = dup_fd(fd, max_fds, &error);
if (!*new_fdp)
return error;
}
return 0;
}
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
* functions used by kernel_clone() cannot be used here directly
* because they modify an inactive task_struct that is being
* constructed. Here we are modifying the current, active,
* task_struct.
*/
int ksys_unshare(unsigned long unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
struct files_struct *new_fd = NULL;
struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
int err;
/*
* If unsharing a user namespace must also unshare the thread group
* and unshare the filesystem root and working directories.
*/
if (unshare_flags & CLONE_NEWUSER)
unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
* If unsharing vm, must also unshare signal handlers.
*/
if (unshare_flags & CLONE_VM)
unshare_flags |= CLONE_SIGHAND;
/*
* If unsharing a signal handlers, must also unshare the signal queues.
*/
if (unshare_flags & CLONE_SIGHAND)
unshare_flags |= CLONE_THREAD;
/*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
err = check_unshare_flags(unshare_flags);
if (err)
goto bad_unshare_out;
/*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
* namespace are unreachable.
*/
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
do_sysvsem = 1;
err = unshare_fs(unshare_flags, &new_fs);
if (err)
goto bad_unshare_out;
err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
err = unshare_userns(unshare_flags, &new_cred);
if (err)
goto bad_unshare_cleanup_fd;
err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
new_cred, new_fs);
if (err)
goto bad_unshare_cleanup_cred;
if (new_cred) {
err = set_cred_ucounts(new_cred);
if (err)
goto bad_unshare_cleanup_cred;
}
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
*/
exit_sem(current);
}
if (unshare_flags & CLONE_NEWIPC) {
/* Orphan segments in old ns (see sem above). */
exit_shm(current);
shm_init_task(current);
}
if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
task_lock(current);
if (new_fs) {
fs = current->fs;
spin_lock(&fs->lock);
current->fs = new_fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
spin_unlock(&fs->lock);
}
if (new_fd)
swap(current->files, new_fd);
task_unlock(current);
if (new_cred) {
/* Install the new user namespace */
commit_creds(new_cred);
new_cred = NULL;
}
}
perf_event_namespaces(current);
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
bad_unshare_cleanup_fs:
if (new_fs)
free_fs_struct(new_fs);
bad_unshare_out:
return err;
}
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
return ksys_unshare(unshare_flags);
}
/*
* Helper to unshare the files of the current task.
* We don't want to expose copy_files internals to
* the exec layer of the kernel.
*/
int unshare_files(void)
{
struct task_struct *task = current;
struct files_struct *old, *copy = NULL;
int error;
error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©);
if (error || !copy)
return error;
old = task->files;
task_lock(task);
task->files = copy;
task_unlock(task);
put_files_struct(old);
return 0;
}
int sysctl_max_threads(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int ret;
int threads = max_threads;
int min = 1;
int max = MAX_THREADS;
t = *table;
t.data = &threads;
t.extra1 = &min;
t.extra2 = &max;
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
max_threads = threads;
return 0;
}