Initial check-in Based on Chromium revision f3b63e7356ad0846045fe69dd640781e95728486
diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..13d16fb --- /dev/null +++ b/AUTHORS
@@ -0,0 +1,1090 @@ +# Names should be added to this file with this pattern: +# +# For individuals: +# Name <email address> +# +# For organizations: +# Organization <fnmatch pattern> +# +# See python fnmatch module documentation for more information. + +Aaron Boushley <boushley@gmail.com> +Aaron Jacobs <samusaaron3@gmail.com> +Aaron Leventhal <aaronlevbugs@gmail.com> +Aaron Randolph <aaron.randolph@gmail.com> +Aaryaman Vasishta <jem456.vasishta@gmail.com> +Abdu Ameen <abdu.ameen000@gmail.com> +Abhijeet Kandalkar <abhijeet.k@samsung.com> +Abhishek Agarwal <abhishek.a21@samsung.com> +Abhishek Kanike <abhishek.ka@samsung.com> +Abhishek Singh <abhi.rathore@samsung.com> +Adam Bonner <abonner-chromium@solscope.com> +Adam Bujalski <abujalski@gmail.com> +Adam Kallai <kadam@inf.u-szeged.hu> +Adam Roben <adam@github.com> +Adam Treat <adam.treat@samsung.com> +Adam Yi <i@adamyi.com> +Addanki Gandhi Kishor <kishor.ag@samsung.com> +Adenilson Cavalcanti <a.cavalcanti@samsung.com> +Aditya Bhargava <heuristicist@gmail.com> +Adrian Belgun <adrian.belgun@intel.com> +Ahmet Emir Ercin <ahmetemiremir@gmail.com> +Ajay Berwal <a.berwal@samsung.com> +Ajay Berwal <ajay.berwal@samsung.com> +Ajith Kumar V <ajith.v@samsung.com> +Aku Kotkavuo <a.kotkavuo@partner.samsung.com> +Aldo Culquicondor <alculquicondor@gmail.com> +Aleksandar Stojiljkovic <aleksandar.stojiljkovic@intel.com> +Alex Gabriel <minilogo@gmail.com> +Alex Gartrell <agartrell@cmu.edu> +Alex Henrie <alexhenrie24@gmail.com> +Alex Scheele <alexscheele@gmail.com> +Alexander Douglas <agdoug@amazon.com> +Alexander Guettler <alexander@guettler.io> +Alexander Shalamov <alexander.shalamov@intel.com> +Alexander Sulfrian <alexander@sulfrian.net> +Alexandre Abreu <wiss1976@gmail.com> +Alexandru Chiculita <achicu@adobe.com> +Alexey Korepanov <alexkorep@gmail.com> +Alexey Kuts <kruntuid@gmail.com> +Alexey Kuzmin <alex.s.kuzmin@gmail.com> +Alexey Kuznetsov <saturas2000@gmail.com> +Alexis Brenon <brenon.alexis@gmail.com> +Alexis La Goutte 
<alexis.lagoutte@gmail.com> +Alexis Menard <alexis.menard@intel.com> +Alfredo Hernandez <ahernandez.miralles@gmail.com> +Ali Vathi <ali.akbar@gmail.com> +Allan Sandfeld Jensen <allan.jensen@qt.io> +Alper Çakan <alpercakan98@gmail.com> +Ambarish Rapte <ambarish.r@samsung.com> +Amey Jahagirdar <jahagird@amazon.com> +Amit Sarkar <amit.srkr@samsung.com> +Amogh Bihani <amogh.bihani@samsung.com> +Amos Lim <amoseui@gmail.com> +Amos Lim <eui-sang.lim@samsung.com> +Amruth Raj <amruthraj@motorola.com> +Amruth Raj <ckqr36@motorola.com> +Anand Ratn <anand.ratn@samsung.com> +Anastasios Cassiotis <tom.cassiotis@gmail.com> +anatoly techtonik <techtonik@gmail.com> +Ancil George <ancilgeorge@samsung.com> +Andra Paraschiv <andra.paraschiv@intel.com> +Andrei Borza <andrei.borza@gmail.com> +Andrei Parvu <andrei.prv@gmail.com> +Andrei Parvu <parvu@adobe.com> +Andrew Boyarshin <andrew.boyarshin@gmail.com> +Andrew Brampton <me@bramp.net> +Andrew Hung <andrhung@amazon.com> +Andrew Jorgensen <ajorgens@amazon.com> +Andrew MacPherson <andrew.macpherson@soundtrap.com> +Andrew Tulloch <andrew@tullo.ch> +Anish Patankar <anish.p@samsung.com> +Ankit Kumar <ankit2.kumar@samsung.com> +Ankur Verma <ankur1.verma@samsung.com> +Anna Henningsen <anna@addaleax.net> +Anne Kao <annekao94@gmail.com> +Anssi Hannula <anssi.hannula@iki.fi> +Anthony Halliday <anth.halliday12@gmail.com> +Anton Obzhirov <a.obzhirov@samsung.com> +Antonin Hildebrand <antonin.hildebrand@gmail.com> +Antonio Gomes <a1.gomes@sisa.samsung.com> +Anuj Kumar Sharma <anujk.sharma@samsung.com> +Arjun Karthik <arjunkar@amazon.com> +Arman Ghotb <armanghotb@gmail.com> +Armin Burgmeier <aburgmeier@bloomberg.net> +Arnaud Mandy <arnaud.mandy@intel.com> +Arnaud Renevier <a.renevier@samsung.com> +Arpita Bahuguna <a.bah@samsung.com> +Arthur Lussos <developer0420@gmail.com> +Arun Kulkarni <kulkarni.a@samsung.com> +Arun Kumar <arun87.kumar@samsung.com> +Arun Mankuzhi <arun.m@samsung.com> +Arunoday Sarkar <a.sarkar.arun@gmail.com> +Arunprasad Rajkumar 
<ararunprasad@gmail.com> +Arunprasad Rajkumar <arurajku@cisco.com> +Asami Doi <d0iasm.pub@gmail.com> +Ashish Kumar Gupta <guptaag@amazon.com> +Ashlin Joseph <ashlin.j@samsung.com> +Asish Singh <asish.singh@samsung.com> +Attila Dusnoki <dati91@gmail.com> +Avinaash Doreswamy <avi.nitk@samsung.com> +Ayush Khandelwal <k.ayush@samsung.com> +Azhar Shaikh <azhar.shaikh@intel.com> +Balazs Kelemen <b.kelemen@samsung.com> +Baul Eun <baul.eun@samsung.com> +Behara Mani Shyam Patro <behara.ms@samsung.com> +Bem Jones-Bey <bemajaniman@gmail.com> +Bem Jones-Bey <bjonesbe@adobe.com> +Ben Coe <bencoe@gmail.com> +Ben Fiola <benfiola@gmail.com> +Ben Karel <eschew@gmail.com> +Ben Noordhuis <ben@strongloop.com> +Benedek Heilig <benecene@gmail.com> +Benjamin Dupont <bedupont@cisco.com> +Benjamin Jemlich <pcgod99@gmail.com> +Bernard Cafarelli <voyageur@gentoo.org> +Bernhard M. Wiedemann <bwiedemann@suse.de> +Bert Belder <bertbelder@gmail.com> +Bhagirathi Satpathy <bhagirathi.s@samsung.com> +Bhanukrushana Rout <b.rout@samsung.com> +Biljith Jayan <billy.jayan@samsung.com> +Boaz Sender <boaz@bocoup.com> +Bobby Powers <bobbypowers@gmail.com> +Branden Archer <bma4@zips.uakron.edu> +Brendan Kirby <brendan.kirby@imgtec.com> +Brendan Long <self@brendanlong.com> +Brian G. Merrell <bgmerrell@gmail.com> +Brian Konzman, SJ <b.g.konzman@gmail.com> +Brian Luft <brian@electroly.com> +Brian Merrell, Novell Inc. 
<bgmerrell@gmail.com> +Brian Yip <itsbriany@gmail.com> +Bruno Calvignac <bruno@flock.com> +Bruno de Oliveira Abinader <brunoabinader@gmail.com> +Bruno Roy <brusi_roy@hotmail.com> +Bryan Donlan <bdonlan@gmail.com> +Bryce Thomas <bryct@amazon.com> +Burton <burton@typewritten.net> +Byounghoon Yoon <bill.2714@kakaocorp.com> +Byoungkwon Ko <codeimpl@gmail.com> +Byungwoo Lee <bw80.lee@samsung.com> +Caesar Wang <wxt@rock-chips.com> +Caio Marcelo de Oliveira Filho <caio.de.oliveira.filho@intel.com> +Caitlin Potter <caitpotter88@gmail.com> +Calvin Mei <calvimei@amazon.com> +Cameron Gutman <aicommander@gmail.com> +Catalin Badea <badea@adobe.com> +Cathie Chen <cathiechen@tencent.com> +Cem Kocagil <cem.kocagil@gmail.com> +Chakshu Ahuja <chakshu.a@samsung.com> +Chamal De Silva <chamalsl@yahoo.com> +Chandan Padhi <c.padhi@samsung.com> +Chandra Shekar Vallala <brk376@motorola.com> +Chandramouli Sanchi <cm.sanchi@samsung.com> +Chang Shu <c.shu@samsung.com> +Changbin Shao <changbin.shao@intel.com> +Changjun Yang <changjun.yang@intel.com> +ChangSeok Oh <shivamidow@gmail.com> +Changwan Hong <changwan.hong@navercorp.com> +Changyeon Kim <cyzero.kim@samsung.com> +Chanho Park <parkch98@gmail.com> +Chansik Yun <chansik.yun@gmail.com> +Chaobin Zhang <zhchbin@gmail.com> +Charles Vaughn <cvaughn@gmail.com> +Choongwoo Han <cwhan.tunz@gmail.com> +Chris Greene <cwgreene@amazon.com> +Chris Harrelson <chrishtr@gmail.com> +Chris Nardi <hichris123@gmail.com> +Chris Szurgot <szurgotc@amazon.com> +Chris Tserng <tserng@amazon.com> +Chris Vasselli <clindsay@gmail.com> +Christophe Dumez <ch.dumez@samsung.com> +Christopher Dale <chrelad@gmail.com> +Claudio DeSouza <claudiomdsjr@gmail.com> +Clemens Fruhwirth <clemens@endorphin.org> +Clement Scheelfeldt Skau <clementskau@gmail.com> +Clinton Staley <clintstaley@gmail.com> +Connor Pearson <cjp822@gmail.com> +Craig Schlenter <craig.schlenter@gmail.com> +Csaba Osztrogonác <ossy.szeged@gmail.com> +Daegyu Lee <na7jun8gi@gmail.com> +Dai Chunyang 
<chunyang.dai@intel.com> +Daiwei Li <daiweili@suitabletech.com> +Damien Marié <damien@dam.io> +Dan McCombs <overridex@gmail.com> +Daniel Bevenius <daniel.bevenius@gmail.com> +Daniel Bomar <dbdaniel42@gmail.com> +Daniel Carvalho Liedke <dliedke@gmail.com> +Daniel Charles <daniel.charles@intel.com> +Daniel Imms <daniimms@amazon.com> +Daniel Johnson <danielj41@gmail.com> +Daniel Lockyer <thisisdaniellockyer@gmail.com> +Daniel Nishi <dhnishi@gmail.com> +Daniel Platz <daplatz@googlemail.com> +Daniel Shaulov <dshaulov@ptc.com> +Daniel Trebbien <dtrebbien@gmail.com> +Daniel Waxweiler <daniel.waxweiler@gmail.com> +Dániel Bátyai <dbatyai@inf.u-szeged.hu> +Dániel Vince <vinced@inf.u-szeged.hu> +Darshini KN <kn.darshini@samsung.com> +Dave Barker <kzar@kzar.co.uk> +David Benjamin <davidben@mit.edu> +David Davidovic <david@davidovic.io> +David Erceg <erceg.david@gmail.com> +David Fox <david@davidjfox.com> +David Futcher <david.mike.futcher@gmail.com> +David Leen <davileen@amazon.com> +David McAllister <mcdavid@amazon.com> +David Michael Barr <david.barr@samsung.com> +David Spellman <dspell@amazon.com> +David Valachovic <adenflorian@gmail.com> +Dax Kelson <dkelson@gurulabs.com> +Debashish Samantaray <d.samantaray@samsung.com> +Debug Wang <debugwang@tencent.com> +Deepak Dilip Borade <deepak.db@samsung.com> +Deepak Mittal <deepak.m1@samsung.com> +Deepak Sharma <deepak.sharma@amd.com> +Deepak Singla <deepak.s@samsung.com> +Deokjin Kim <deokjin81.kim@samsung.com> +Derek Halman <d.halman@gmail.com> +Devlin Cronin <rdevlin.cronin@gmail.com> +Diana Suvorova <diana.suvorova@gmail.com> +Diego Ferreiro Val <elfogris@gmail.com> +Dillon Sellars <dill.sellars@gmail.com> +Divya Bansal <divya.bansal@samsung.com> +Dominic Farolino <domfarolino@gmail.com> +Dominic Jodoin <dominic.jodoin@gmail.com> +Dominik Röttsches <dominik.rottsches@intel.com> +Don Woodward <woodward@adobe.com> +Donghee Na <corona10@gmail.com> +Dong-hee Na <donghee.na92@gmail.com> +Dongie Agnir <dongie.agnir@gmail.com> 
+Dongjun Kim <djmix.kim@samsung.com> +Dongseong Hwang <dongseong.hwang@intel.com> +Dongwoo Joshua Im <dw.im@samsung.com> +Dongyu Lin <l2d4y3@gmail.com> +Donna Wu <donna.wu@intel.com> +Douglas F. Turner <doug.turner@gmail.com> +Dustin Doloff <doloffd@amazon.com> +Ebrahim Byagowi <ebrahim@gnu.org> +Ebrahim Byagowi <ebraminio@gmail.com> +Eden Wang <nedenwang@tencent.com> +Eduardo Lima (Etrunko) <eblima@gmail.com> +Eduardo Lima (Etrunko) <eduardo.lima@intel.com> +Edward Baker <edward.baker@intel.com> +Edward Crossman <tedoc2000@gmail.com> +Eero Häkkinen <e.hakkinen@samsung.com> +Eero Häkkinen <eero.hakkinen@intel.com> +Egor Starkov <egor.starkov@samsung.com> +Ehsan Akhgari <ehsan.akhgari@gmail.com> +Elan Ruusamäe <elan.ruusamae@gmail.com> +Ergun Erdogmus <erdogmusergun@gmail.com> +Eric Ahn <byungwook.ahn@gmail.com> +Eric Rescorla <ekr@rtfm.com> +Erik Hill <erikghill@gmail.com> +Erik Sjölund <erik.sjolund@gmail.com> +Eriq Augustine <eriq.augustine@gmail.com> +Ernesto Mudu <ernesto.mudu@gmail.com> +Etienne Laurin <etienne@atnnn.com> +Eugene Kim <eugene70kim@gmail.com> +Eugene Sudin <eugene@sudin.pro> +Eunseok Oh <fivesilverstone@gmail.com> +Evan Peterson <evan.peterson.ep@gmail.com> +Evan Wallace <evan.exe@gmail.com> +Evangelos Foutras <evangelos@foutrelis.com> +Evgeniy Dushistov <dushistov@gmail.com> +Evgeny Agafonchikov <evgeny.agafonchikov@akvelon.com> +Fabien Tassin <fta@sofaraway.org> +Felix H. 
Dahlke <fhd@ubercode.de> +Fengrong Fang <fr.fang@samsung.com> +Fernando Jiménez Moreno <ferjmoreno@gmail.com> +Finbar Crago <finbar.crago@gmail.com> +François Beaufort <beaufort.francois@gmail.com> +Francois Kritzinger <francoisk777@gmail.com> +Francois Rauch <leopardb@gmail.com> +Frankie Dintino <fdintino@theatlantic.com> +Franklin Ta <fta2012@gmail.com> +Frédéric Jacob <frederic.jacob.78@gmail.com> +Frédéric Wang <fred.wang@free.fr> +Fu Junwei <junwei.fu@intel.com> +Gabor Rapcsanyi <g.rapcsanyi@samsung.com> +Gaetano Mendola <mendola@gmail.com> +Gajendra N <gajendra.n@samsung.com> +Gajendra Singh <wxjg68@motorola.com> +Ganesh Borle <ganesh.borle@samsung.com> +Gao Chun <chun.gao@intel.com> +Gao Chun <gaochun.dev@gmail.com> +Gaurav Dhol <gaurav.dhol@einfochips.com> +Gautham Banasandra <gautham.bangalore@gmail.com> +George Adams <geoada@amazon.com> +George Joseph <kottackal.george@gmail.com> +George Liaskos <geo.liaskos@gmail.com> +Georgy Buranov <gburanov@gmail.com> +Gergely Nagy <ngg@ngg.hu> +Getulio Sánchez <valentin2507@gmail.com> +Gideon Pyzer <gjpyzer@gmail.com> +Giovanni Panaro <tsrwebgl@gmail.com> +Girish Kumar M <mck.giri@samsung.com> +Gitanshu Mehndiratta <g.mehndiratt@samsung.com> +Giuseppe Iuculano <giuseppe@iuculano.it> +Gnanasekar Somanathan <gnanasekar.s@samsung.com> +Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com> +Goutham Jagannatha <wrm364@motorola.com> +Graham Yoakum <gyoakum@skobalt.com> +Greg Visser <gregvis@gmail.com> +Gregory Davis <gpdavis.chromium@gmail.com> +Grzegorz Czajkowski <g.czajkowski@samsung.com> +Guangzhen Li <guangzhen.li@intel.com> +Gurpreet Kaur <k.gurpreet@samsung.com> +Gustav Tiger <gustav.tiger@sonymobile.com> +Gyuyoung Kim <gyuyoung.kim@navercorp.com> +Gzob Qq <gzobqq@gmail.com> +Habib Virji <habib.virji@samsung.com> +Haeun Kim <ggrace.kim93@gmail.com> +Haeun Kim <haeungun@gmail.com> +Haitao Feng <haitao.feng@intel.com> +Halley Zhao <halley.zhao@intel.com> +Halton Huo <halton.huo@gmail.com> +Halton Huo 
<halton.huo@intel.com> +Hans Hillen <hans.hillen@gmail.com> +Hao Li <hao.x.li@intel.com> +Haojian Wu <hokein.wu@gmail.com> +Hari Singh <hari.singh1@samsung.com> +Harpreet Singh Khurana <harpreet.sk@samsung.com> +Harshikesh Kumar <harshikeshnobug@gmail.com> +Hassan Salehe Matar <hassansalehe@gmail.com> +Hautio Kari <khautio@gmail.com> +Heejin R. Chung <heejin.r.chung@samsung.com> +Heeyoun Lee <heeyoun.lee@samsung.com> +Henrique Limas <henrique.ramos.limas@gmail.com> +Himanshu Joshi <h.joshi@samsung.com> +Holger Kraus <kraush@amazon.com> +Hong Zheng <hong.zheng@intel.com> +Hongbo Min <hongbo.min@intel.com> +Horia Olaru <horia.olaru@gmail.com> +Horia Olaru <olaru@adobe.com> +Hosung You <hosung.you@samsung.com> +Huapeng Li <huapengl@amazon.com> +Huayong Xu <huayong.xu@samsung.com> +Hugo Holgersson <hugo.holgersson@sonymobile.com> +Hui Wang <wanghui07050707@gmail.com> +Huiwon Jo <jhwon0415@gmail.com> +Huy Duong <huy.duongdinh@gmail.com> +Hwanseung Lee <hs1217.lee@gmail.com> +Hwanseung Lee <hs1217.lee@samsung.com> +Hyemi Shin <hyemi.sin@samsung.com> +HyeockJin Kim <kherootz@gmail.com> +Hyungchan Kim <inlinechan@gmail.com> +Hyungwook Lee <hyungwook.lee@navercorp.com> +Hyungwook Lee <withlhw@gmail.com> +Hyunjun Shin <hyunjun.shin2@navercorp.com> +Hyunjune Kim <hyunjune.kim@samsung.com> +Hyunki Baik <hyunki.baik@samsung.com> +Ian Cullinan <cullinan@amazon.com> +Ian Scott <ian.scott@arteris.com> +Ibrar Ahmed <ibrar.ahmad@gmail.com> +Ilia Demianenko <ilia.demianenko@gmail.com> +Ilia K <ki.stfu@gmail.com> +Ilya Konstantinov <ilya.konstantinov@gmail.com> +Imranur Rahman <ir.shimul@gmail.com> +Ion Rosca <rosca@adobe.com> +Irmak Kavasoglu <irmakkavasoglu@gmail.com> +Isaac Murchie <murchieisaac@gmail.com> +Isaac Reilly <reillyi@amazon.com> +Ivan Naydonov <samogot@gmail.com> +Ivan Sham <ivansham@amazon.com> +Jack Bates <jack@nottheoilrig.com> +Jacob Clark <jacob.jh.clark@googlemail.com> +Jacob Mandelson <jacob@mandelson.org> +Jaehun Lim <ljaehun.lim@samsung.com> +Jaehyun Lee 
<j-hyun.lee@samsung.com> +Jaekyeom Kim <btapiz@gmail.com> +Jaemin Seo <jaemin86.seo@samsung.com> +Jaeseok Yoon <yjaeseok@gmail.com> +Jaewon Choi <jaewon.james.choi@gmail.com> +Jaeyong Bae <jdragon.bae@gmail.com> +Jaime Soriano Pastor <jsorianopastor@gmail.com> +Jake Helfert <jake@helfert.us> +Jake Hendy <me@jakehendy.com> +Jakob Weigert <jakob.j.w@googlemail.com> +Jakub Machacek <xtreit@gmail.com> +James Burton <jb@0.me.uk> +James Choi <jchoi42@pha.jhu.edu> +James Stanley <james@apphaus.co.uk> +James Vega <vega.james@gmail.com> +James Wei <james.wei@intel.com> +James Willcox <jwillcox@litl.com> +Jan Rucka <ruckajan10@gmail.com> +Jan Sauer <jan@jansauer.de> +Janwar Dinata <j.dinata@gmail.com> +Jared Shumway <jaredshumway94@gmail.com> +Jared Sohn <jared.sohn@gmail.com> +Jared Wein <weinjared@gmail.com> +Jari Karppanen <jkarp@amazon.com> +Jay Oster <jay@kodewerx.org> +Jay Soffian <jaysoffian@gmail.com> +Jeado Ko <haibane84@gmail.com> +Jeffrey C <jeffreyca16@gmail.com> +Jeongeun Kim <je_julie.kim@samsung.com> +Jeongmin Kim <kimwjdalsl@gmail.com> +Jeongwoo Park <skeksk91@gmail.com> +Jeremy Noring <jnoring@hirevue.com> +Jeremy Spiegel <jeremysspiegel@gmail.com> +Jeroen Van den Berghe <vandenberghe.jeroen@gmail.com> +Jerry Lin <wahahab11@gmail.com> +Jesper Storm Bache <jsbache@gmail.com> +Jesse Miller <jesse@jmiller.biz> +Jesus Sanchez-Palencia <jesus.sanchez-palencia.fernandez.fil@intel.com> +Jiadong Zhu <jiadong.zhu@linaro.org> +Jiajia Qin <jiajia.qin@intel.com> +Jiajie Hu <jiajie.hu@intel.com> +Jianjun Zhu <jianjun.zhu@intel.com> +Jianneng Zhong <muzuiget@gmail.com> +Jiawei Shao <jiawei.shao@intel.com> +Jie Chen <jie.a.chen@intel.com> +Jihoon Chung <j.c@navercorp.com> +Jihoon Chung <jihoon@gmail.com> +Jihun Brent Kim <devgrapher@gmail.com> +Jihwan Marc Kim <bluewhale.marc@gmail.com> +Jin Yang <jin.a.yang@intel.com> +Jincheol Jo <jincheol.jo@navercorp.com> +Jinfeng Ma <majinfeng1@xiaomi.com> +Jing Zhao <zhaojing7@xiaomi.com> +Jinglong Zuo <zuojinglong@xiaomi.com> 
+Jingwei Liu <kingweiliu@gmail.com> +Jingyi Wei <wjywbs@gmail.com> +Jinho Bang <jinho.bang@samsung.com> +Jinsong Fan <fanjinsong@sogou-inc.com> +Jinwoo Song <jinwoo7.song@samsung.com> +Jinyoung Hur <hurims@gmail.com> +Jitendra Kumar Sahoo <jitendra.ks@samsung.com> +Joachim Bauch <jbauch@webrtc.org> +Joachim Bauch <mail@joachim-bauch.de> +Joanmarie Diggs <joanmarie.diggs@gmail.com> +Joe Knoll <joe.knoll@workday.com> +Joe Thomas <mhx348@motorola.com> +Joel Stanley <joel@jms.id.au> +Johannes Rudolph <johannes.rudolph@googlemail.com> +John Kleinschmidt <kleinschmidtorama@gmail.com> +John Yani <vanuan@gmail.com> +John Yoo <nearbyh13@gmail.com> +Johnson Lin <johnson.lin@intel.com> +Jonathan Frazer <listedegarde@gmail.com> +Jonathan Garbee <jonathan@garbee.me> +Jonathan Hacker <jhacker@arcanefour.com> +Jongdeok Kim <jongdeok.kim@navercorp.com> +Jongheon Kim <sapzape@gmail.com> +JongKwon Lee <jongkwon.lee@navercorp.com> +Jongsoo Lee <leejongsoo@gmail.com> +Joone Hur <joone.hur@intel.com> +Joonghun Park <pjh0718@gmail.com> +Jorge Villatoro <jorge@tomatocannon.com> +Joseph Gentle <josephg@gmail.com> +Joseph Lolak <joseph.lolak@samsung.com> +Josh Triplett <josh.triplett@intel.com> +Josh Triplett <josh@joshtriplett.org> +Joshua Lock <joshua.lock@intel.com> +Joshua Roesslein <jroesslein@gmail.com> +Josué Ratelle <jorat1346@gmail.com> +Josyula Venkat Narasimham <venkat.nj@samsung.com> +Juan Jose Lopez Jaimez <jj.lopezjaimez@gmail.com> +Juhui Lee <juhui24.lee@samsung.com> +Julien Brianceau <jbriance@cisco.com> +Julien Isorce <j.isorce@samsung.com> +Julien Racle <jracle@logitech.com> +Jun Fang <jun_fang@foxitsoftware.com> +Jun Jiang <jun.a.jiang@intel.com> +Junchao Han <junchao.han@intel.com> +Junghoon Lee <sjh836@gmail.com> +Junghyuk Yoo <wjdgurdl272@gmail.com> +JungJik Lee <jungjik.lee@samsung.com> +Jungkee Song <jungkee.song@samsung.com> +Junmin Zhu <junmin.zhu@intel.com> +Justin Okamoto <justmoto@amazon.com> +Justin Ribeiro <justin@justinribeiro.com> +Jüri Valdmann 
<juri.valdmann@qt.io> +Kai Jiang <jiangkai@gmail.com> +Kai Köhne <kai.koehne@qt.io> +Kai Uwe Broulik <kde@privat.broulik.de> +Kal Conley <kcconley@gmail.com> +Kalyan Kondapally <kalyan.kondapally@intel.com> +Kamil Jiwa <kamil.jiwa@gmail.com> +Kamil Rytarowski <krytarowski@gmail.com> +Kangil Han <kangil.han@samsung.com> +Kangyuan Shu <kangyuan.shu@intel.com> +Karan Thakkar <karanjthakkar@gmail.com> +Kartikey Bhatt <kartikey@amazon.com> +Kaspar Brand <googlecontrib@velox.ch> +Kaustubh Atrawalkar <kaustubh.a@samsung.com> +Kaustubh Atrawalkar <kaustubh.ra@gmail.com> +Ke He <ke.he@intel.com> +Keene Pan <keenepan@linpus.com> +Keita Yoshimoto <y073k3@gmail.com> +Keith Chen <keitchen@amazon.com> +Kenneth Rohde Christiansen <kenneth.r.christiansen@intel.com> +Kenneth Strickland <ken.strickland@gmail.com> +Kenneth Zhou <knthzh@gmail.com> +Keonho Kim <keonho07.kim@samsung.com> +Ketan Goyal <ketan.goyal@samsung.com> +Kevin Gibbons <bakkot@gmail.com> +Kevin Lee Helpingstine <sig11@reprehensible.net> +Kevin M. 
McCormick <mckev@amazon.com> +Khasim Syed Mohammed <khasim.mohammed@linaro.org> +Kihong Kwon <kihong.kwon@samsung.com> +Kihoon Ko <rhrlgns777@gmail.com> +Kihwang Kim <pwangkk@gmail.com> +Kim Christensen <kimworking@gmail.com> +Kimberly Hunter <kimberhu@amazon.com> +Kingshuk Jana <kingshuk.j@samsung.com> +Kirill Bobyrev <kirillbobyrev@gmail.com> +Kirill Ovchinnikov <kirill.ovchinn@gmail.com> +Klemen Forstnerič <klemen.forstneric@gmail.com> +Kodam Nagaraju <k2.nagaraju@samsung.com> +Konrad Dzwinel <kdzwinel@gmail.com> +Krishna Chaitanya <krish.botta@samsung.com> +Kristof Kosztyo <kkosztyo.u-szeged@partner.samsung.com> +Krzysztof Czech <k.czech@samsung.com> +Krzysztof Wolanski <k.wolanski@samsung.com> +Kui Tan <tk1061178@gmail.com> +Kunal Thakar <kunalt@gmail.com> +Kushal Pisavadia <kushi.p@gmail.com> +Kwangho Shin <k_h.shin@samsung.com> +Kyle Nahrgang <kpn24@drexel.edu> +Kyle Plumadore <kyle.plumadore@amd.com> +Kyounga Ra <kyounga.ra@gmail.com> +Kyoungdeok Kwon <kkd927@gmail.com> +Kyung Yeol Kim <chitacan@gmail.com> +Kyungtae Kim <ktf.kim@samsung.com> +Kyungyoung Heo <bbvch13531@gmail.com> +Lalit Chandivade <lalit.chandivade@einfochips.com> +Laszlo Gombos <l.gombos@samsung.com> +Laszlo Radanyi <bekkra@gmail.com> +Lauren Yeun Kim <lauren.yeun.kim@gmail.com> +Lauri Oherd <lauri.oherd@gmail.com> +Lavar Askew <open.hyperion@gmail.com> +Legend Lee <guanxian.li@intel.com> +Leith Bade <leith@leithalweapon.geek.nz> +Lenny Khazan <lenny.khazan@gmail.com> +Leo Wolf <jclw@ymail.com> +Leon Han <leon.han@intel.com> +Leung Wing Chung <lwchkg@gmail.com> +Li Yin <li.yin@intel.com> +Lidwine Genevet <lgenevet@cisco.com> +Lin Sun <lin.sun@intel.com> +Lingyun Cai <lingyun.cai@intel.com> +Lionel Landwerlin <lionel.g.landwerlin@intel.com> +Lizhi Fan <lizhi.fan@samsung.com> +Loo Rong Jie <loorongjie@gmail.com> +Lorenzo Stoakes <lstoakes@gmail.com> +Lu Guanqun <guanqun.lu@gmail.com> +Luca Di Domenico <luca94dd@gmail.com> +Lucie Brozkova <lucinka.brozkova@gmail.com> +Luiz Von Dentz 
<luiz.von.dentz@intel.com> +Luka Dojcilovic <l.dojcilovic@gmail.com> +Luke Inman-Semerau <luke.semerau@gmail.com> +Luke Zarko <lukezarko@gmail.com> +Luoxi Pan <l.panpax@gmail.com> +Maarten Lankhorst <m.b.lankhorst@gmail.com> +Magnus Danielsson <fuzzac@gmail.com> +Mahesh Kulkarni <mahesh.kk@samsung.com> +Mahesh Machavolu <mahesh.ma@samsung.com> +Maksim Kolesin <mkolesin@gmail.com> +Maksim Sisov <maksim.sisov@intel.com> +Malcolm Wang <malcolm.2.wang@gmail.com> +Mallikarjuna Rao V <vm.arjun@samsung.com> +Manish Chhajer <chhajer.m@samsung.com> +Manish Jethani <m.jethani@eyeo.com> +Manojkumar Bhosale <manojkumar.bhosale@imgtec.com> +Manuel Braun <thembrown@gmail.com> +Mao Yujie <maojie0924@gmail.com> +Mao Yujie <yujie.mao@intel.com> +Marc des Garets <marc.desgarets@googlemail.com> +Marcin Wiacek <marcin@mwiacek.com> +Marco Rodrigues <gothicx@gmail.com> +Mario Pistrich <m.pistrich@gmail.com> +Mario Sanchez Prada <mario.prada@samsung.com> +Mariusz Mlynski <marius.mlynski@gmail.com> +Mark Hahnenberg <mhahnenb@andrew.cmu.edu> +Mark Seaborn <mrs@mythic-beasts.com> +Martijn Croonen <martijn@martijnc.be> +Martin Bednorz <m.s.bednorz@gmail.com> +Martin Rogalla <martin@martinrogalla.com> +Martina Kollarova <martina.kollarova@intel.com> +Masahiro Yado <yado.masa@gmail.com> +Masaru Nishida <msr.i386@gmail.com> +Matej Knopp <matej.knopp@gmail.com> +Matheus Bratfisch <matheusbrat@gmail.com> +Mathias Bynens <mathias@qiwi.be> +Mathieu Meisser <mmeisser@logitech.com> +Matt Arpidone <mma.public@gmail.com> +Matt Strum <mstrum@amazon.com> +Matt Zeunert <matt@mostlystatic.com> +Matthew Bauer <mjbauer95@gmail.com> +Matthew Demarest <demarem@amazon.com> +Matthew Robertson <matthewrobertson03@gmail.com> +Matthew Turk <matthewturk@gmail.com> +Matthew Willis <appamatto@gmail.com> +Matthias Reitinger <reimarvin@gmail.com> +Matthieu Rigolot <matthieu.rigolot@gmail.com> +Max Perepelitsyn <pph34r@gmail.com> +Max Vujovic <mvujovic@adobe.com> +Mayank Gupta <mayank.g1@samsung.com> +Mayur Kankanwadi 
<mayurk.vk@samsung.com> +Md Abdullah Al Alamin <a.alamin.cse@gmail.com> +Md. Hasanur Rashid <hasanur.r@samsung.com> +Md Jobed Hossain <jrony15@gmail.com> +Md Sami Uddin <md.sami@samsung.com> +Michael Cirone <mikecirone@gmail.com> +Michael Gilbert <floppymaster@gmail.com> +Michael Lopez <lopes92290@gmail.com> +Michael Morrison <codebythepound@gmail.com> +Michael Müller <michael@fds-team.de> +Michael Schechter <mike.schechter@gmail.com> +Michaël Zasso <mic.besace@gmail.com> +Michael Zugelder <michael@zugelder.org> +Michel Promonet <michel.promonet.1@gmail.com> +Mihai Maerean <mmaerean@adobe.com> +Mihai Tica <mihai.o.tica@gmail.com> +Mihai Tica <mitica@adobe.com> +Mike Pennisi <mike@mikepennisi.com> +Mike Tilburg <mtilburg@adobe.com> +Mikhail Pozdnyakov <mikhail.pozdnyakov@intel.com> +Milko Leporis <milko.leporis@imgtec.com> +Milton Chiang <milton.chiang@mediatek.com> +Minggang Wang <minggang.wang@intel.com> +Mingmin Xie <melvinxie@gmail.com> +Minjeong Lee <apenr1234@gmail.com> +Minseok Koo <kei98301@gmail.com> +Minsoo Max Koo <msu.koo@samsung.com> +Miran Karic <miran.karic@imgtec.com> +Mirela Budaes <mbudaes@adobe.com> +Mirela Budaes <mbudaes@gmail.com> +Miyoung Shin <myid.shin@navercorp.com> +Mohamed I. Hammad <ibraaaa@gmail.com> +Mohamed Mansour <m0.interactive@gmail.com> +Mohammad Azam <m.azam@samsung.com> +Mohammed Wajahat Ali Siddiqui <wajahat.s@samsung.com> +Mohan Reddy <mohan.reddy@samsung.com> +Mohit Bhalla <bhallam@amazon.com> +Momoko Hattori <momohatt10@gmail.com> +Mostafa Sedaghat joo <mostafa.sedaghat@gmail.com> +Mrunal Kapade <mrunal.kapade@intel.com> +Myeongjin Cho <myeongjin.cho@navercorp.com> +Myles C. 
Maxfield <mymax@amazon.com> +Myung-jong Kim <mjkim610@gmail.com> +Nagarajan Narayanan <nagarajan.n@samsung.com> +Nagarjuna Atluri <nagarjuna.a@samsung.com> +Naiem Shaik <naiem.shaik@gmail.com> +Naoki Takano <takano.naoki@gmail.com> +Naveen Bobbili <naveenbobbili@motorola.com> +Naveen Bobbili <qghc36@motorola.com> +Naveen Kumar Devaraj <devarajn@amazon.com> +Naveen Kumar S G <naveensg@samsung.com> +Nayan Kumar K <qtc746@motorola.com> +Neal Gompa <ngompa13@gmail.com> +Ned Williamson <nedwilliamson@gmail.com> +Nedeljko Babic <nedeljko.babic@imgtec.com> +Nikhil Bansal <n.bansal@samsung.com> +Nikhil Sahni <nikhil.sahni@samsung.com> +Nikita Ofitserov <himikof@gmail.com> +Niklas Hambüchen <mail@nh2.me> +Niklas Schulze <me@jns.io> +Nikola Kovacs <nikola.kovacs@gmail.com> +Nils Schneider <nils.schneider@gmail.com> +Nils Schneider <nils@nilsschneider.net> +Ningxin Hu <ningxin.hu@intel.com> +Nitish Mehrotra <nitish.m@samsung.com> +Noj Vek <nojvek@gmail.com> +Nolan Cao <nolan.robin.cao@gmail.com> +Oleksii Kadurin <ovkadurin@gmail.com> +Oliver Dunk <oliver@oliverdunk.com> +Olli Raula (Old name Olli Syrjälä) <olli.raula@intel.com> +Omar Sandoval <osandov@osandov.com> +Pan Deng <pan.deng@intel.com> +Parag Radke <nrqv63@motorola.com> +Paritosh Kumar <paritosh.in@samsung.com> +Patrasciuc Sorin Cristian <cristian.patrasciuc@gmail.com> +Patrick Chan <chanpatorikku@gmail.com> +Patrick Kettner <patrickkettner@gmail.com> +Patrick Riordan <patrickriordan177@gmail.com> +Patrick Stein <patrickwonders@gmail.com> +Patrik Ackland <patrikackland@gmail.com> +Paul Adolph <padolph@netflix.com> +Paul Kehrer <paul.l.kehrer@gmail.com> +Paul Lind <paul.lind@imgtec.com> +Paul Nettleship <pnettleship@gmail.com> +Paul Robinson <paulrobinson85@googlemail.com> +Paul Roskell <blurrech@gmail.com> +Paul Sapunaru <paul.sapunaru@intel.com> +Paul Wicks <pwicks86@gmail.com> +Pavan Kumar Emani <pavan.e@samsung.com> +Pavel Golikov <paullo612@ya.ru> +Pavel Ivanov <paivanof@gmail.com> +Pawel Forysiuk 
<p.forysiuk@samsung.com> +Paweł Hajdan jr <phajdan.jr@gmail.com> +Payal Pandey <payal.pandey@samsung.com> +Peng Hu <penghu@tencent.com> +Peng Jiang <leiyi.jp@gmail.com> +Peng Xinchao <pxinchao@gmail.com> +Peter Bright <drpizza@quiscalusmexicanus.org> +Peter Brophy <pbrophy@adobe.com> +Peter Collingbourne <peter@pcc.me.uk> +Peter Gal <pgal.u-szeged@partner.samsung.com> +Peter Griffin <peter.griffin@linaro.org> +Peter Molnar <pmolnar.u-szeged@partner.samsung.com> +Peter Snyder <snyderp@gmail.com> +Peter Wong <peter.wm.wong@gmail.com> +Philip Hanson <philip.hanson@intel.com> +Philipp Hancke <fippo@andyet.net> +Philipp Hancke <philipp.hancke@googlemail.com> +Philippe Beauchamp <philippe.beauchamp@gmail.com> +Philippe Beaudoin <philippe.beaudoin@gmail.com> +PhistucK <phistuck@gmail.com> +Pierre Neter <pierreneter@gmail.com> +Pierre-Antoine LaFayette <pierre.lafayette@gmail.com> +Po-Chun Chang <pochang0403@gmail.com> +Pramod Begur Srinath <pramod.bs@samsung.com> +Pranay Kumar <pranay.kumar@samsung.com> +Pranjal Jumde <pranjal@brave.com> +Prashant Hiremath <prashhir@cisco.com> +Prashant Nevase <prashant.n@samsung.com> +Prashant Patil <prashant.patil@imgtec.com> +Praveen Akkiraju <praveen.anp@samsung.com> +Preeti Nayak <preeti.nayak@samsung.com> +Pritam Nikam <pritam.nikam@samsung.com> +Puttaraju R <puttaraju.r@samsung.com> +Qi Yang <qi1988.yang@samsung.com> +Qiankun Miao <qiankun.miao@intel.com> +Qing Zhang <qing.zhang@intel.com> +Radu Stavila <stavila@adobe.com> +Radu Velea <radu.velea@intel.com> +Rafael Antognolli <rafael.antognolli@intel.com> +Raghavendra Ghatage <r.ghatage@samsung.com> +Raghu Ram Nagaraj <r.nagaraj@samsung.com> +Rahul Gupta <rahul.g@samsung.com> +Rajneesh Rana <rajneesh.r@samsung.com> +Raman Tenneti <raman.tenneti@gmail.com> +Ramkumar Gokarnesan <ramkumar.gokarnesan@gmail.com> +Ramkumar Ramachandra <artagnon@gmail.com> +Ramya Vadlamudi <ramya.v@samsung.com> +Randy Posynick <randy.posynick@gmail.com> +Raphael Kubo da Costa 
<raphael.kubo.da.costa@intel.com> +Raul Tambre <raul@tambre.ee> +Raveendra Karu <r.karu@samsung.com> +Ravi Nanjundappa <nravi.n@samsung.com> +Ravi Phaneendra Kasibhatla <r.kasibhatla@samsung.com> +Ravi Phaneendra Kasibhatla <ravi.kasibhatla@motorola.com> +Raviraj Sitaram <raviraj.p.sitaram@intel.com> +Réda Housni Alaoui <alaoui.rda@gmail.com> +Refael Ackermann <refack@gmail.com> +Renata Hodovan <rhodovan.u-szeged@partner.samsung.com> +Rene Bolldorf <rb@radix.io> +Rene Ladan <r.c.ladan@gmail.com> +Richard Baranyi <lordprotector@gmail.com> +Richard Li <richard.li@intel.com> +Rijubrata Bhaumik <rijubrata.bhaumik@intel.com> +Riku Voipio <riku.voipio@linaro.org> +Rob Buis <rob.buis@samsung.com> +Rob Wu <rob@robwu.nl> +Robert Bear Travis <bear.travis@gmail.com> +Robert Bear Travis <betravis@adobe.com> +Robert Bradford <robert.bradford@intel.com> +Robert Goldberg <goldberg@adobe.com> +Robert Hogan <robhogan@gmail.com> +Robert Nagy <robert.nagy@gmail.com> +Robert Sesek <rsesek@bluestatic.org> +Roland Takacs <rtakacs.u-szeged@partner.samsung.com> +Romain Pokrzywka <romain.pokrzywka@gmail.com> +Rosen Dash <nqk836@motorola.com> +Rosen Dash <rosen.dash@gmail.com> +Ross Kirsling <rkirsling@gmail.com> +ruben <chromium@hybridsource.org> +Ruben Bridgewater <ruben@bridgewater.de> +Ruben Terrazas <rubentopo@gmail.com> +Rufus Hamade <rufus.hamade@imgtec.com> +Ruiyi Luo <luoruiyi2008@gmail.com> +Ryan Ackley <ryanackley@gmail.com> +Ryan Norton <rnorton10@gmail.com> +Ryan Sleevi <ryan-chromium-dev@sleevi.com> +Ryan Yoakum <ryoakum@skobalt.com> +Ryuan Choi <ryuan.choi@samsung.com> +Saikrishna Arcot <saiarcot895@gmail.com> +Sajal Khandelwal <skhandelwa22@bloomberg.net> +Salvatore Iovene <salvatore.iovene@intel.com> +Sam Larison <qufighter@gmail.com> +Sam McDonald <sam@sammcd.com> +Samuel Attard <samuel.r.attard@gmail.com> +Sanggi Hong <sanggi.hong11@gmail.com> +Sanghee Lee <sanghee.lee1992@gmail.com> +Sanghyun Park <sh919.park@samsung.com> +Sanghyup Lee <sh53.lee@samsung.com> +Sangjoon Je 
<htamop@gmail.com> +Sangseok Jang <sangseok.jang@navercorp.com> +Sangwoo Ko <sangwoo.ko@navercorp.com> +Sangwoo Ko <sangwoo108@gmail.com> +Sanjoy Pal <ncj674@motorola.com> +Sanjoy Pal <sanjoy.pal@samsung.com> +Sanne Wouda <sanne.wouda@gmail.com> +Santosh Mahto <samahto@cisco.com> +Sarath Singapati <s.singapati@gmail.com> +Sarath Singapati <s.singapati@samsung.com> +Sarath Singapati <sarath.singapati@huawei.com> +Saravanan KR <sramajay@cisco.com> +Sathish Kuppuswamy <sathish.kuppuswamy@intel.com> +Satoshi Matsuzaki <satoshi.matsuzaki@gmail.com> +Satyajit Sahu <satyajit.sahu@amd.com> +Sayan Nayak <sayan.nayak@samsung.com> +Scott D Phillips <scott.d.phillips@intel.com> +Sean Bryant <sean@cyberwang.net> +Sean DuBois <seaduboi@amazon.com> +Sebastian Amend <sebastian.amend@googlemail.com> +Sebastian Krzyszkowiak <dos@dosowisko.net> +Seo Sanghyeon <sanxiyn@gmail.com> +Seokju Kwon <seokju.kwon@gmail.com> +SeongTae Jeong <ferendevelop.gl@gmail.com> +Sergey Kipet <sergey.kipet@gmail.com> +Sergey Putilin <p.sergey@samsung.com> +Sergey Shekyan <shekyan@gmail.com> +Sergio Carlos Morales Angeles <carloschilazo@gmail.com> +Sergiy Belozorov <rryk.ua@gmail.com> +Seshadri Mahalingam <seshadri.mahalingam@gmail.com> +Seungkyu Lee <zx6658@gmail.com> +Sevan Janiyan <venture37@geeklan.co.uk> +Shahriar Rostami <shahriar.rostami@gmail.com> +Shail Singhal <shail.s@samsung.com> +Shane Hansen <shanemhansen@gmail.com> +ShankarGanesh K <blr.bmlab@gmail.com> +Shanmuga Pandi M <shanmuga.m@samsung.com> +Shaobo Yan <shaobo.yan@intel.com> +Shashi Kumar <sk.kumar@samsung.com> +Shawn Anastasio <shawnanastasio@gmail.com> +Shelley Vohr <shelley.vohr@gmail.com> +Shen Yu <shenyu.tcv@gmail.com> +Sherry Mou <wenjinm@amazon.com> +Shez Baig <sbaig1@bloomberg.net> +Shigeki Ohtsu <shigeki.ohtsu@gmail.com> +Shiliu Wang <aofdwsl@gmail.com> +Shiliu Wang <shiliu.wang@intel.com> +Shilpa Shri <shilpa.shri@samsung.com> +Shirish S <shirish.s@amd.com> +Shiva Kumar <shiva.k1@samsung.com> +Shivakumar JM 
<shiva.jm@samsung.com> +Shouqun Liu <liushouqun@xiaomi.com> +Shouqun Liu <shouqun.liu@intel.com> +Shreeram Kushwaha <shreeram.k@samsung.com> +Shreyas Gopal <shreyas.g@samsung.com> +Shreyas VA <v.a.shreyas@gmail.com> +Shubham Agrawal <shubag@amazon.com> +Siba Samal <siba.samal@samsung.com> +Siddharth Bagai <b.siddharth@samsung.com> +Siddharth Shankar <funkysidd@gmail.com> +Simon Arlott <simon.arlott@gmail.com> +Simon La Macchia <smacchia@amazon.com> +Siva Kumar Gunturi <siva.gunturi@samsung.com> +Sohan Jyoti Ghosh <sohan.jyoti@huawei.com> +Sohan Jyoti Ghosh <sohan.jyoti@samsung.com> +Song YeWen <ffmpeg@gmail.com> +Sooho Park <sooho1000@gmail.com> +Soojung Choi <crystal2840@gmail.com> +Soorya R <soorya.r@samsung.com> +Soren Dreijer <dreijerbit@gmail.com> +Sreerenj Balachandran <sreerenj.balachandran@intel.com> +Srirama Chandra Sekhar Mogali <srirama.m@samsung.com> +Staphany Park <stapark008@gmail.com> +Stephen Searles <stephen.searles@gmail.com> +Steve Sanders <steve@zanderz.com> +Steven Pennington <spenn@engr.uvic.ca> +Steven Roussey <sroussey@gmail.com> +Subrahmanya Praveen Munukutla <sataya.m@samsung.com> +Suchit Agrawal <a.suchit@samsung.com> +Sudarsana Babu Nagineni <sudarsana.nagineni@intel.com> +Sudarshan Parthasarathy <sudarshan.p@samsung.com> +Sujae Jo <sujae33.jo@gmail.com> +Sujith S S <sujiths.s@samsung.com> +Sunchang Li <johnstonli@tencent.com> +Suneel Kota <suneel.kota@samsung.com> +Sungguk Lim <limasdf@gmail.com> +Sungmann Cho <sungmann.cho@gmail.com> +Sungmann Cho <sungmann.cho@navercorp.com> +Sunil Ratnu <sunil.ratnu@samsung.com> +Sunitha Srivatsa <srivats@amazon.com> +Suvanjan Mukherjee <suvanjanmukherjee@gmail.com> +Suyambulingam R M <suyambu.rm@samsung.com> +Suyash Sengar <suyash.s@samsung.com> +Swarali Raut <swarali.sr@samsung.com> +Swati Jaiswal <swa.jaiswal@samsung.com> +Sylvain Zimmer <sylvinus@gmail.com> +Sylvestre Ledru <sylvestre.ledru@gmail.com> +Synthia Islam <synthia.is@samsung.com> +Szabolcs David <davidsz@inf.u-szeged.hu> +Szymon 
Piechowicz <szymonpiechowicz@o2.pl> +Taeheon Kim <skyrabbits1@gmail.com> +Taehoon Lee <taylor.hoon@gmail.com> +Takashi Fujita <tgfjt.mail@gmail.com> +Takeshi Kurosawa <taken.spc@gmail.com> +Tanay Chowdhury <tanay.c@samsung.com> +Tanvir Rizvi <tanvir.rizvi@samsung.com> +Tapu Kumar Ghose <ghose.tapu@gmail.com> +Taylor Price <trprice@gmail.com> +Ted Kim <neot0000@gmail.com> +Ted Vessenes <tedvessenes@gmail.com> +Teodora Novkovic <teodora.petrovic@gmail.com> +Thiago Farina <thiago.farina@gmail.com> +Thiago Marcos P. Santos <thiago.santos@intel.com> +Thomas Butter <tbutter@gmail.com> +Thomas Conti <tomc@amazon.com> +Thomas White <im.toms.inbox@gmail.com> +Tiago Vignatti <tiago.vignatti@intel.com> +Tibor Dusnoki <tibor.dusnoki.91@gmail.com> +Tim Ansell <mithro@mithis.com> +Tim Niederhausen <tim@rnc-ag.de> +Timo Gurr <timo.gurr@gmail.com> +Timo Reimann <ttr314@googlemail.com> +Timo Witte <timo.witte@gmail.com> +Ting Shao <ting.shao@intel.com> +Tom Callaway <tcallawa@redhat.com> +Tom Harwood <tfh@skip.org> +Tomas Popela <tomas.popela@gmail.com> +Torsten Kurbad <google@tk-webart.de> +Trent Willis <trentmwillis@gmail.com> +Trevor Perrin <unsafe@trevp.net> +Tripta Gupta <tripta.g@samsung.com> +U. 
Artie Eoff <ullysses.a.eoff@intel.com> +Umar Hansa <umar.hansa@gmail.com> +Upendra Gowda <upendrag.gowda@gmail.com> +Uzair Jaleel <uzair.jaleel@samsung.com> +Vadim Gorbachev <bmsdave@gmail.com> +Vaibhav Agrawal <vaibhav1.a@samsung.com> +Valentin Ilie <valentin.ilie@intel.com> +Vamshikrishna Yellenki <vamshi@motorola.com> +Vani Hegde <vani.hegde@samsung.com> +Varun Chowdhary Paturi <v.paturi@samsung.com> +Vartul Katiyar <vartul.k@samsung.com> +Vedran Šajatović <vedran.sajatovic@gmail.com> +Vernon Tang <vt@foilhead.net> +Viatcheslav Ostapenko <sl.ostapenko@samsung.com> +Victor Costan <costan@gmail.com> +Viet-Trung Luu <viettrungluu@gmail.com> +Vinay Anantharaman <vinaya@adobe.com> +Vipul Bhasin <vipul.bhasin@gmail.com> +Visa Putkinen <v.putkinen@partner.samsung.com> +Vishal Bhatnagar <vishal.b@samsung.com> +Vitaliy Kharin <kvserr@gmail.com> +Vivek Galatage <vivek.vg@samsung.com> +Volker Sorge <volker.sorge@gmail.com> +Waihung Fu <fufranci@amazon.com> +Wanming Lin <wanming.lin@intel.com> +Wei Li <wei.c.li@intel.com> +WenSheng He <wensheng.he@samsung.com> +Wesley Lancel <wesleylancel@gmail.com> +Wesley Wigham <wwigham@gmail.com> +Will Hirsch <chromium@willhirsch.co.uk> +Will Shackleton <w.shackleton@gmail.com> +William Xie <william.xie@intel.com> +Xiang Long <xiang.long@intel.com> +Xiangze Zhang <xiangze.zhang@intel.com> +Xiaofeng Zhang <xiaofeng.zhang@intel.com> +Xiaolei Yu <dreifachstein@gmail.com> +Xiaoshu Zhang <xiaoshu@amazon.com> +Xiaoyin Liu <xiaoyin.l@outlook.com> +Xinchao He <hexinchao@gmail.com> +Xing Zhang <xzhang@adobe.com> +Xinghua Cao <xinghua.cao@intel.com> +Xu Samuel <samuel.xu@intel.com> +Xu Xing <xing.xu@intel.com> +Xuefei Ren <xrenishere@gmail.com> +Xueqing Huang <huangxueqing@xiaomi.com> +Xun Sun <xun.sun@intel.com> +Xunran Ding <xunran.ding@samsung.com> +Xunran Ding <dingxunran@gmail.com> +Yael Aharon <yael.aharon@intel.com> +Yan Wang <yan0422.wang@samsung.com> +Yang Gu <yang.gu@intel.com> +Yannic Bonenberger <contact@yannic-bonenberger.com> +Yarin 
Kaul <yarin.kaul@gmail.com> +Yash Vempati <vempatiy@amazon.com> +Ye Liu <cbakgly@gmail.com> +Yeol Park <peary2@gmail.com> +Yeonwoo Jo <yeonwoo.jo.92@gmail.com> +Yi Shen <yi.shen@samsung.com> +Yi Sun <ratsunny@gmail.com> +Yichen Jiang <jiangyichen123@gmail.com> +Yifei Yu <yuyifei@xiaomi.com> +Yizhou Jiang <yizhou.jiang@intel.com> +Yoav Weiss <yoav@yoav.ws> +Yoav Zilberberg <yoav.zilberberg@gmail.com> +Yong Shin <sy3620@gmail.com> +Yong Wang <ccyongwang@tencent.com> +Yongha Lee <yongha78.lee@samsung.com> +Yongseok Choi <yongseok.choi@navercorp.com> +Yongsheng Zhu <yongsheng.zhu@intel.com> +Yoonjae Cho <yoonjae.cho92@gmail.com> +Yoshinori Sano <yoshinori.sano@gmail.com> +Youngho Seo <hazivoo@gmail.com> +Youngjin Choi <cyjin9.yc@gmail.com> +YoungKi Hong <simon.hong81@gmail.com> +Youngmin Yoo <youngmin.yoo@samsung.com> +Youngsoo Choi <kenshin.choi@samsung.com> +Youngsun Suh <zard17@gmail.com> +Yuhong Sha <yuhong.sha@samsung.com> +Yumikiyo Osanai <yumios.art@gmail.com> +Yunchao He <yunchao.he@intel.com> +Yupei Lin <yplam@yplam.com> +Yupei Wang <perryuwang@tencent.com> +Yura Yaroshevich <yura.yaroshevich@gmail.com> +Yuri Gorobets <yuri.gorobets@gmail.com> +Yuriy Taraday <yorik.sar@gmail.com> +Yuvanesh Natarajan <yuvanesh.n1@samsung.com> +Zeno Albisser <zeno.albisser@digia.com> +Zeqin Chen <talonchen@tencent.com> +Zhaoze Zhou <zhaoze.zhou@partner.samsung.com> +Zheda Chen <zheda.chen@intel.com> +Zheng Chuang <zhengchuangscu@gmail.com> +Zhengkun Li <zhengkli@amazon.com> +Zhenyu Liang <zhenyu.liang@intel.com> +Zhenyu Shan <zhenyu.shan@intel.com> +Zhifei Fang <facetothefate@gmail.com> +Zhuoyu Qian <zhuoyu.qian@samsung.com> +Ziran Sun <ziran.sun@samsung.com> +Zoltan Herczeg <zherczeg.u-szeged@partner.samsung.com> +Zoltan Kuscsik <zoltan.kuscsik@linaro.org> +Zsolt Borbely <zsborbely.u-szeged@partner.samsung.com> +方觉 (Fang Jue) <fangjue23303@gmail.com> +Rajesh Mahindra <rmahindra@uber.com> +Yuan-Pin Yu <yjames@uber.com> +Vinoth Chandar <vinoth@uber.com> +Zheng Xu <zxu@kobo.com> 
+Junsong Li <ljs.darkfish@gmail.com> + +ACCESS CO., LTD. <*@access-company.com> +Akamai Inc. <*@akamai.com> +ARM Holdings <*@arm.com> +BlackBerry Limited <*@blackberry.com> +Bocoup <*@bocoup.com> +Canonical Limited <*@canonical.com> +Cloudflare, Inc. <*@cloudflare.com> +Code Aurora Forum <*@codeaurora.org> +Collabora Limited <*@collabora.com> +Comodo CA Limited +Cosium <*@cosium.com> +Duck Duck Go, Inc. <*@duckduckgo.com> +Endless Mobile, Inc. <*@endlessm.com> +Estimote, Inc. <*@estimote.com> +Facebook, Inc. <*@fb.com> +Facebook, Inc. <*@oculus.com> +Google Inc. <*@google.com> +Hewlett-Packard Development Company, L.P. <*@hp.com> +IBM Inc. <*@*.ibm.com> +IBM Inc. <*@ibm.com> +Igalia S.L. <*@igalia.com> +Imagination Technologies Limited <*@imagination.corp-partner.google.com> +Impossible Dreams Network <*@impossibledreams.net> +Intel Corporation <*@intel.com> +LG Electronics, Inc. <*@lge.com> +Loongson Technology Corporation Limited. <*@loongson.cn> +Macadamian <*@macadamian.com> +Mediatek <*@mediatek.com> +Microsoft <*@microsoft.com> +MIPS Technologies, Inc. <*@mips.com> +Mozilla Corporation <*@mozilla.com> +Neverware Inc. <*@neverware.com> +NIKE, Inc. <*@nike.com> +NVIDIA Corporation <*@nvidia.com> +Opera Software ASA <*@opera.com> +Optical Tone Ltd <*@opticaltone.com> +Pengutronix e.K. <*@pengutronix.de> +Rakuten Kobo Inc. <*@kobo.com> +Rakuten Kobo Inc. <*@rakuten.com> +Seznam.cz, a.s. <*@firma.seznam.cz> +Slack Technologies Inc. <*@slack-corp.com> +Spotify AB <*@spotify.com> +Tableau Software <*@tableau.com> +TeamSpeak Systems GmbH <*@teamspeak.com> +The Chromium Authors <*@chromium.org> +The MathWorks, Inc. <binod.pant@mathworks.com> +Torchmobile Inc. +Upwork <*@cloud.upwork.com> +Venture 3 Systems LLC <*@venture3systems.com> +Vewd Software AS <*@vewd.com> +Vivaldi Technologies AS <*@vivaldi.com> +Yandex LLC <*@yandex-team.ru> +Make Positive Provar Limited <*@provartesting.com>
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a32e00c --- /dev/null +++ b/LICENSE
@@ -0,0 +1,27 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/WORKSPACE b/WORKSPACE new file mode 100644 index 0000000..2823b98 --- /dev/null +++ b/WORKSPACE
@@ -0,0 +1,5 @@ +# Copyright 2019 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +workspace(name = "com_google_googleurl")
diff --git a/base/BUILD b/base/BUILD new file mode 100644 index 0000000..a9ca0e6 --- /dev/null +++ b/base/BUILD
@@ -0,0 +1,21 @@ +# Copyright 2019 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +cc_library( + name = "base", + hdrs = [ + "compiler_specific.h", + "debug/leak_annotations.h", + "macros.h", + "no_destructor.h", + "optional.h", + "stl_util.h", + "template_util.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//build:build_config", + "//polyfills", + ], +)
diff --git a/base/compiler_specific.h b/base/compiler_specific.h new file mode 100644 index 0000000..7e2c510 --- /dev/null +++ b/base/compiler_specific.h
@@ -0,0 +1,263 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_COMPILER_SPECIFIC_H_ +#define BASE_COMPILER_SPECIFIC_H_ + +#include "build/build_config.h" + +#if defined(COMPILER_MSVC) + +#if !defined(__clang__) +#error "Only clang-cl is supported on Windows, see https://crbug.com/988071" +#endif + +// Macros for suppressing and disabling warnings on MSVC. +// +// Warning numbers are enumerated at: +// http://msdn.microsoft.com/en-us/library/8x5x43k7(VS.80).aspx +// +// The warning pragma: +// http://msdn.microsoft.com/en-us/library/2c8f766e(VS.80).aspx +// +// Using __pragma instead of #pragma inside macros: +// http://msdn.microsoft.com/en-us/library/d9x1s805.aspx + +// MSVC_PUSH_DISABLE_WARNING pushes |n| onto a stack of warnings to be disabled. +// The warning remains disabled until popped by MSVC_POP_WARNING. +#define MSVC_PUSH_DISABLE_WARNING(n) __pragma(warning(push)) \ + __pragma(warning(disable:n)) + +// Pop effects of innermost MSVC_PUSH_* macro. +#define MSVC_POP_WARNING() __pragma(warning(pop)) + +#else // Not MSVC + +#define MSVC_PUSH_DISABLE_WARNING(n) +#define MSVC_POP_WARNING() +#define MSVC_DISABLE_OPTIMIZE() +#define MSVC_ENABLE_OPTIMIZE() + +#endif // COMPILER_MSVC + +// These macros can be helpful when investigating compiler bugs or when +// investigating issues in local optimized builds, by temporarily disabling +// optimizations for a single function or file. These macros should never be +// used to permanently work around compiler bugs or other mysteries, and should +// not be used in landed changes. 
+#if !defined(OFFICIAL_BUILD) +#if defined(__clang__) +#define DISABLE_OPTIMIZE() __pragma(clang optimize off) +#define ENABLE_OPTIMIZE() __pragma(clang optimize on) +#elif defined(COMPILER_MSVC) +#define DISABLE_OPTIMIZE() __pragma(optimize("", off)) +#define ENABLE_OPTIMIZE() __pragma(optimize("", on)) +#else +// These macros are not currently available for other compiler options. +#endif +// These macros are not available in official builds. +#endif // !defined(OFFICIAL_BUILD) + +// Annotate a variable indicating it's ok if the variable is not used. +// (Typically used to silence a compiler warning when the assignment +// is important for some other reason.) +// Use like: +// int x = ...; +// ALLOW_UNUSED_LOCAL(x); +#define ALLOW_UNUSED_LOCAL(x) (void)x + +// Annotate a typedef or function indicating it's ok if it's not used. +// Use like: +// typedef Foo Bar ALLOW_UNUSED_TYPE; +#if defined(COMPILER_GCC) || defined(__clang__) +#define ALLOW_UNUSED_TYPE __attribute__((unused)) +#else +#define ALLOW_UNUSED_TYPE +#endif + +// Annotate a function indicating it should not be inlined. +// Use like: +// NOINLINE void DoStuff() { ... } +#if defined(COMPILER_GCC) +#define NOINLINE __attribute__((noinline)) +#elif defined(COMPILER_MSVC) +#define NOINLINE __declspec(noinline) +#else +#define NOINLINE +#endif + +#if defined(COMPILER_GCC) && defined(NDEBUG) +#define ALWAYS_INLINE inline __attribute__((__always_inline__)) +#elif defined(COMPILER_MSVC) && defined(NDEBUG) +#define ALWAYS_INLINE __forceinline +#else +#define ALWAYS_INLINE inline +#endif + +// Specify memory alignment for structs, classes, etc. +// Use like: +// class ALIGNAS(16) MyClass { ... } +// ALIGNAS(16) int array[4]; +// +// In most places you can use the C++11 keyword "alignas", which is preferred. +// +// But compilers have trouble mixing __attribute__((...)) syntax with +// alignas(...) syntax. 
+// +// Doesn't work in clang or gcc: +// struct alignas(16) __attribute__((packed)) S { char c; }; +// Works in clang but not gcc: +// struct __attribute__((packed)) alignas(16) S2 { char c; }; +// Works in clang and gcc: +// struct alignas(16) S3 { char c; } __attribute__((packed)); +// +// There are also some attributes that must be specified *before* a class +// definition: visibility (used for exporting functions/classes) is one of +// these attributes. This means that it is not possible to use alignas() with a +// class that is marked as exported. +#if defined(COMPILER_MSVC) +#define ALIGNAS(byte_alignment) __declspec(align(byte_alignment)) +#elif defined(COMPILER_GCC) +#define ALIGNAS(byte_alignment) __attribute__((aligned(byte_alignment))) +#endif + +// Annotate a function indicating the caller must examine the return value. +// Use like: +// int foo() WARN_UNUSED_RESULT; +// To explicitly ignore a result, see |ignore_result()| in base/macros.h. +#undef WARN_UNUSED_RESULT +#if defined(COMPILER_GCC) || defined(__clang__) +#define WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#else +#define WARN_UNUSED_RESULT +#endif + +// Tell the compiler a function is using a printf-style format string. +// |format_param| is the one-based index of the format string parameter; +// |dots_param| is the one-based index of the "..." parameter. +// For v*printf functions (which take a va_list), pass 0 for dots_param. +// (This is undocumented but matches what the system C headers do.) +// For member functions, the implicit this parameter counts as index 1. +#if defined(COMPILER_GCC) || defined(__clang__) +#define PRINTF_FORMAT(format_param, dots_param) \ + __attribute__((format(printf, format_param, dots_param))) +#else +#define PRINTF_FORMAT(format_param, dots_param) +#endif + +// WPRINTF_FORMAT is the same, but for wide format strings. +// This doesn't appear to yet be implemented in any compiler. +// See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38308 . 
+#define WPRINTF_FORMAT(format_param, dots_param) +// If available, it would look like: +// __attribute__((format(wprintf, format_param, dots_param))) + +// Sanitizers annotations. +#if defined(__has_attribute) +#if __has_attribute(no_sanitize) +#define NO_SANITIZE(what) __attribute__((no_sanitize(what))) +#endif +#endif +#if !defined(NO_SANITIZE) +#define NO_SANITIZE(what) +#endif + +// MemorySanitizer annotations. +#if defined(MEMORY_SANITIZER) && !defined(OS_NACL) +#include <sanitizer/msan_interface.h> + +// Mark a memory region fully initialized. +// Use this to annotate code that deliberately reads uninitialized data, for +// example a GC scavenging root set pointers from the stack. +#define MSAN_UNPOISON(p, size) __msan_unpoison(p, size) + +// Check a memory region for initializedness, as if it was being used here. +// If any bits are uninitialized, crash with an MSan report. +// Use this to sanitize data which MSan won't be able to track, e.g. before +// passing data to another process via shared memory. +#define MSAN_CHECK_MEM_IS_INITIALIZED(p, size) \ + __msan_check_mem_is_initialized(p, size) +#else // MEMORY_SANITIZER +#define MSAN_UNPOISON(p, size) +#define MSAN_CHECK_MEM_IS_INITIALIZED(p, size) +#endif // MEMORY_SANITIZER + +// DISABLE_CFI_PERF -- Disable Control Flow Integrity for perf reasons. +#if !defined(DISABLE_CFI_PERF) +#if defined(__clang__) && defined(OFFICIAL_BUILD) +#define DISABLE_CFI_PERF __attribute__((no_sanitize("cfi"))) +#else +#define DISABLE_CFI_PERF +#endif +#endif + +// Macro useful for writing cross-platform function pointers. +#if !defined(CDECL) +#if defined(OS_WIN) +#define CDECL __cdecl +#else // defined(OS_WIN) +#define CDECL +#endif // defined(OS_WIN) +#endif // !defined(CDECL) + +// Macro for hinting that an expression is likely to be false. 
+#if !defined(UNLIKELY) +#if defined(COMPILER_GCC) || defined(__clang__) +#define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +#define UNLIKELY(x) (x) +#endif // defined(COMPILER_GCC) +#endif // !defined(UNLIKELY) + +#if !defined(LIKELY) +#if defined(COMPILER_GCC) || defined(__clang__) +#define LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define LIKELY(x) (x) +#endif // defined(COMPILER_GCC) +#endif // !defined(LIKELY) + +// Compiler feature-detection. +// clang.llvm.org/docs/LanguageExtensions.html#has-feature-and-has-extension +#if defined(__has_feature) +#define HAS_FEATURE(FEATURE) __has_feature(FEATURE) +#else +#define HAS_FEATURE(FEATURE) 0 +#endif + +// Macro for telling -Wimplicit-fallthrough that a fallthrough is intentional. +#if defined(__clang__) +#define FALLTHROUGH [[clang::fallthrough]] +#else +#define FALLTHROUGH +#endif + +#if defined(COMPILER_GCC) +#define PRETTY_FUNCTION __PRETTY_FUNCTION__ +#elif defined(COMPILER_MSVC) +#define PRETTY_FUNCTION __FUNCSIG__ +#else +// See https://en.cppreference.com/w/c/language/function_definition#func +#define PRETTY_FUNCTION __func__ +#endif + +#if !defined(CPU_ARM_NEON) +#if defined(__arm__) +#if !defined(__ARMEB__) && !defined(__ARM_EABI__) && !defined(__EABI__) && \ + !defined(__VFP_FP__) && !defined(_WIN32_WCE) && !defined(ANDROID) +#error Chromium does not support middle endian architecture +#endif +#if defined(__ARM_NEON__) +#define CPU_ARM_NEON 1 +#endif +#endif // defined(__arm__) +#endif // !defined(CPU_ARM_NEON) + +#if !defined(HAVE_MIPS_MSA_INTRINSICS) +#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5) +#define HAVE_MIPS_MSA_INTRINSICS 1 +#endif +#endif + +#endif // BASE_COMPILER_SPECIFIC_H_
diff --git a/base/debug/leak_annotations.h b/base/debug/leak_annotations.h new file mode 100644 index 0000000..dc50246 --- /dev/null +++ b/base/debug/leak_annotations.h
@@ -0,0 +1,46 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_DEBUG_LEAK_ANNOTATIONS_H_ +#define BASE_DEBUG_LEAK_ANNOTATIONS_H_ + +#include "base/macros.h" +#include "build/build_config.h" + +// This file defines macros which can be used to annotate intentional memory +// leaks. Support for annotations is implemented in LeakSanitizer. Annotated +// objects will be treated as a source of live pointers, i.e. any heap objects +// reachable by following pointers from an annotated object will not be +// reported as leaks. +// +// ANNOTATE_SCOPED_MEMORY_LEAK: all allocations made in the current scope +// will be annotated as leaks. +// ANNOTATE_LEAKING_OBJECT_PTR(X): the heap object referenced by pointer X will +// be annotated as a leak. + +#if defined(LEAK_SANITIZER) && !defined(OS_NACL) + +#include <sanitizer/lsan_interface.h> + +class ScopedLeakSanitizerDisabler { + public: + ScopedLeakSanitizerDisabler() { __lsan_disable(); } + ~ScopedLeakSanitizerDisabler() { __lsan_enable(); } + private: + DISALLOW_COPY_AND_ASSIGN(ScopedLeakSanitizerDisabler); +}; + +#define ANNOTATE_SCOPED_MEMORY_LEAK \ + ScopedLeakSanitizerDisabler leak_sanitizer_disabler; static_cast<void>(0) + +#define ANNOTATE_LEAKING_OBJECT_PTR(X) __lsan_ignore_object(X); + +#else + +#define ANNOTATE_SCOPED_MEMORY_LEAK ((void)0) +#define ANNOTATE_LEAKING_OBJECT_PTR(X) ((void)0) + +#endif + +#endif // BASE_DEBUG_LEAK_ANNOTATIONS_H_
diff --git a/base/macros.h b/base/macros.h new file mode 100644 index 0000000..cda8e3a --- /dev/null +++ b/base/macros.h
@@ -0,0 +1,44 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This file contains macros and macro-like constructs (e.g., templates) that +// are commonly used throughout Chromium source. (It may also contain things +// that are closely related to things that are commonly used that belong in this +// file.) + +#ifndef BASE_MACROS_H_ +#define BASE_MACROS_H_ + +// Put this in the declarations for a class to be uncopyable. +#define DISALLOW_COPY(TypeName) \ + TypeName(const TypeName&) = delete + +// Put this in the declarations for a class to be unassignable. +#define DISALLOW_ASSIGN(TypeName) TypeName& operator=(const TypeName&) = delete + +// Put this in the declarations for a class to be uncopyable and unassignable. +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + DISALLOW_COPY(TypeName); \ + DISALLOW_ASSIGN(TypeName) + +// A macro to disallow all the implicit constructors, namely the +// default constructor, copy constructor and operator= functions. +// This is especially useful for classes containing only static methods. +#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ + TypeName() = delete; \ + DISALLOW_COPY_AND_ASSIGN(TypeName) + +// Used to explicitly mark the return value of a function as unused. If you are +// really sure you don't want to do anything with the return value of a function +// that has been marked WARN_UNUSED_RESULT, wrap it with this. Example: +// +// std::unique_ptr<MyType> my_var = ...; +// if (TakeOwnership(my_var.get()) == SUCCESS) +// ignore_result(my_var.release()); +// +template<typename T> +inline void ignore_result(const T&) { +} + +#endif // BASE_MACROS_H_
diff --git a/base/no_destructor.h b/base/no_destructor.h new file mode 100644 index 0000000..3d7a85c --- /dev/null +++ b/base/no_destructor.h
@@ -0,0 +1,98 @@ +// Copyright 2018 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_NO_DESTRUCTOR_H_ +#define BASE_NO_DESTRUCTOR_H_ + +#include <new> +#include <utility> + +namespace gurl_base { + +// A wrapper that makes it easy to create an object of type T with static +// storage duration that: +// - is only constructed on first access +// - never invokes the destructor +// in order to satisfy the styleguide ban on global constructors and +// destructors. +// +// Runtime constant example: +// const std::string& GetLineSeparator() { +// // Forwards to std::string(size_t, char, const Allocator&) constructor. +// static const gurl_base::NoDestructor<std::string> s(5, '-'); +// return *s; +// } +// +// More complex initialization with a lambda: +// const std::string& GetSessionNonce() { +// static const gurl_base::NoDestructor<std::string> nonce([] { +// std::string s(16, '\0'); +// crypto::RandString(s.data(), s.size()); +// return s; +// }()); +// return *nonce; +// } +// +// NoDestructor<T> stores the object inline, so it also avoids a pointer +// indirection and a malloc. Also note that since C++11 static local variable +// initialization is thread-safe and so is this pattern. Code should prefer to +// use NoDestructor<T> over: +// - A function scoped static T* or T& that is dynamically initialized. +// - A global gurl_base::LazyInstance<T>. +// +// Note that since the destructor is never run, this *will* leak memory if used +// as a stack or member variable. Furthermore, a NoDestructor<T> should never +// have global scope as that may require a static initializer. +template <typename T> +class NoDestructor { + public: + // Not constexpr; just write static constexpr T x = ...; if the value should + // be a constexpr. + template <typename... Args> + explicit NoDestructor(Args&&... 
args) { + new (storage_) T(std::forward<Args>(args)...); + } + + // Allows copy and move construction of the contained type, to allow + // construction from an initializer list, e.g. for std::vector. + explicit NoDestructor(const T& x) { new (storage_) T(x); } + explicit NoDestructor(T&& x) { new (storage_) T(std::move(x)); } + + NoDestructor(const NoDestructor&) = delete; + NoDestructor& operator=(const NoDestructor&) = delete; + + ~NoDestructor() = default; + + const T& operator*() const { return *get(); } + T& operator*() { return *get(); } + + const T* operator->() const { return get(); } + T* operator->() { return get(); } + + const T* get() const { return reinterpret_cast<const T*>(storage_); } + T* get() { return reinterpret_cast<T*>(storage_); } + + private: + alignas(T) char storage_[sizeof(T)]; + +#if defined(LEAK_SANITIZER) + // TODO(https://crbug.com/812277): This is a hack to work around the fact + // that LSan doesn't seem to treat NoDestructor as a root for reachability + // analysis. This means that code like this: + // static gurl_base::NoDestructor<std::vector<int>> v({1, 2, 3}); + // is considered a leak. Using the standard leak sanitizer annotations to + // suppress leaks doesn't work: std::vector is implicitly constructed before + // calling the gurl_base::NoDestructor constructor. + // + // Unfortunately, I haven't been able to demonstrate this issue in simpler + // reproductions: until that's resolved, hold an explicit pointer to the + // placement-new'd object in leak sanitizer mode to help LSan realize that + // objects allocated by the contained type are still reachable. + T* storage_ptr_ = reinterpret_cast<T*>(storage_); +#endif // defined(LEAK_SANITIZER) +}; + +} // namespace gurl_base + +#endif // BASE_NO_DESTRUCTOR_H_
diff --git a/base/optional.h b/base/optional.h new file mode 100644 index 0000000..345147c --- /dev/null +++ b/base/optional.h
@@ -0,0 +1,937 @@ +// Copyright 2016 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_OPTIONAL_H_ +#define BASE_OPTIONAL_H_ + +#include <functional> +#include <type_traits> +#include <utility> + +#include "polyfills/base/logging.h" +#include "base/template_util.h" + +namespace gurl_base { + +// Specification: +// http://en.cppreference.com/w/cpp/utility/optional/nullopt_t +struct nullopt_t { + constexpr explicit nullopt_t(int) {} +}; + +// Specification: +// http://en.cppreference.com/w/cpp/utility/optional/nullopt +constexpr nullopt_t nullopt(0); + +// Forward declaration, which is refered by following helpers. +template <typename T> +class Optional; + +namespace internal { + +template <typename T, bool = std::is_trivially_destructible<T>::value> +struct OptionalStorageBase { + // Initializing |empty_| here instead of using default member initializing + // to avoid errors in g++ 4.8. + constexpr OptionalStorageBase() : empty_('\0') {} + + template <class... Args> + constexpr explicit OptionalStorageBase(in_place_t, Args&&... args) + : is_populated_(true), value_(std::forward<Args>(args)...) {} + + // When T is not trivially destructible we must call its + // destructor before deallocating its memory. + // Note that this hides the (implicitly declared) move constructor, which + // would be used for constexpr move constructor in OptionalStorage<T>. + // It is needed iff T is trivially move constructible. However, the current + // is_trivially_{copy,move}_constructible implementation requires + // is_trivially_destructible (which looks a bug, cf: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51452 and + // http://cplusplus.github.io/LWG/lwg-active.html#2116), so it is not + // necessary for this case at the moment. Please see also the destructor + // comment in "is_trivially_destructible = true" specialization below. 
+ ~OptionalStorageBase() { + if (is_populated_) + value_.~T(); + } + + template <class... Args> + void Init(Args&&... args) { + GURL_DCHECK(!is_populated_); + ::new (&value_) T(std::forward<Args>(args)...); + is_populated_ = true; + } + + bool is_populated_ = false; + union { + // |empty_| exists so that the union will always be initialized, even when + // it doesn't contain a value. Union members must be initialized for the + // constructor to be 'constexpr'. + char empty_; + T value_; + }; +}; + +template <typename T> +struct OptionalStorageBase<T, true /* trivially destructible */> { + // Initializing |empty_| here instead of using default member initializing + // to avoid errors in g++ 4.8. + constexpr OptionalStorageBase() : empty_('\0') {} + + template <class... Args> + constexpr explicit OptionalStorageBase(in_place_t, Args&&... args) + : is_populated_(true), value_(std::forward<Args>(args)...) {} + + // When T is trivially destructible (i.e. its destructor does nothing) there + // is no need to call it. Implicitly defined destructor is trivial, because + // both members (bool and union containing only variants which are trivially + // destructible) are trivially destructible. + // Explicitly-defaulted destructor is also trivial, but do not use it here, + // because it hides the implicit move constructor. It is needed to implement + // constexpr move constructor in OptionalStorage iff T is trivially move + // constructible. Note that, if T is trivially move constructible, the move + // constructor of OptionalStorageBase<T> is also implicitly defined and it is + // trivially move constructor. If T is not trivially move constructible, + // "not declaring move constructor without destructor declaration" here means + // "delete move constructor", which works because any move constructor of + // OptionalStorage will not refer to it in that case. + + template <class... Args> + void Init(Args&&... 
args) { + GURL_DCHECK(!is_populated_); + ::new (&value_) T(std::forward<Args>(args)...); + is_populated_ = true; + } + + bool is_populated_ = false; + union { + // |empty_| exists so that the union will always be initialized, even when + // it doesn't contain a value. Union members must be initialized for the + // constructor to be 'constexpr'. + char empty_; + T value_; + }; +}; + +// Implement conditional constexpr copy and move constructors. These are +// constexpr if is_trivially_{copy,move}_constructible<T>::value is true +// respectively. If each is true, the corresponding constructor is defined as +// "= default;", which generates a constexpr constructor (In this case, +// the condition of constexpr-ness is satisfied because the base class also has +// compiler generated constexpr {copy,move} constructors). Note that +// placement-new is prohibited in constexpr. +template <typename T, + bool = is_trivially_copy_constructible<T>::value, + bool = std::is_trivially_move_constructible<T>::value> +struct OptionalStorage : OptionalStorageBase<T> { + // This is no trivially {copy,move} constructible case. Other cases are + // defined below as specializations. + + // Accessing the members of template base class requires explicit + // declaration. + using OptionalStorageBase<T>::is_populated_; + using OptionalStorageBase<T>::value_; + using OptionalStorageBase<T>::Init; + + // Inherit constructors (specifically, the in_place constructor). + using OptionalStorageBase<T>::OptionalStorageBase; + + // User defined constructor deletes the default constructor. + // Define it explicitly. 
+ OptionalStorage() = default; + + OptionalStorage(const OptionalStorage& other) { + if (other.is_populated_) + Init(other.value_); + } + + OptionalStorage(OptionalStorage&& other) noexcept( + std::is_nothrow_move_constructible<T>::value) { + if (other.is_populated_) + Init(std::move(other.value_)); + } +}; + +template <typename T> +struct OptionalStorage<T, + true /* trivially copy constructible */, + false /* trivially move constructible */> + : OptionalStorageBase<T> { + using OptionalStorageBase<T>::is_populated_; + using OptionalStorageBase<T>::value_; + using OptionalStorageBase<T>::Init; + using OptionalStorageBase<T>::OptionalStorageBase; + + OptionalStorage() = default; + OptionalStorage(const OptionalStorage& other) = default; + + OptionalStorage(OptionalStorage&& other) noexcept( + std::is_nothrow_move_constructible<T>::value) { + if (other.is_populated_) + Init(std::move(other.value_)); + } +}; + +template <typename T> +struct OptionalStorage<T, + false /* trivially copy constructible */, + true /* trivially move constructible */> + : OptionalStorageBase<T> { + using OptionalStorageBase<T>::is_populated_; + using OptionalStorageBase<T>::value_; + using OptionalStorageBase<T>::Init; + using OptionalStorageBase<T>::OptionalStorageBase; + + OptionalStorage() = default; + OptionalStorage(OptionalStorage&& other) = default; + + OptionalStorage(const OptionalStorage& other) { + if (other.is_populated_) + Init(other.value_); + } +}; + +template <typename T> +struct OptionalStorage<T, + true /* trivially copy constructible */, + true /* trivially move constructible */> + : OptionalStorageBase<T> { + // If both trivially {copy,move} constructible are true, it is not necessary + // to use user-defined constructors. So, just inheriting constructors + // from the base class works. + using OptionalStorageBase<T>::OptionalStorageBase; +}; + +// Base class to support conditionally usable copy-/move- constructors +// and assign operators. 
+template <typename T> +class OptionalBase { + // This class provides implementation rather than public API, so everything + // should be hidden. Often we use composition, but we cannot in this case + // because of C++ language restriction. + protected: + constexpr OptionalBase() = default; + constexpr OptionalBase(const OptionalBase& other) = default; + constexpr OptionalBase(OptionalBase&& other) = default; + + template <class... Args> + constexpr explicit OptionalBase(in_place_t, Args&&... args) + : storage_(in_place, std::forward<Args>(args)...) {} + + // Implementation of converting constructors. + template <typename U> + explicit OptionalBase(const OptionalBase<U>& other) { + if (other.storage_.is_populated_) + storage_.Init(other.storage_.value_); + } + + template <typename U> + explicit OptionalBase(OptionalBase<U>&& other) { + if (other.storage_.is_populated_) + storage_.Init(std::move(other.storage_.value_)); + } + + ~OptionalBase() = default; + + OptionalBase& operator=(const OptionalBase& other) { + CopyAssign(other); + return *this; + } + + OptionalBase& operator=(OptionalBase&& other) noexcept( + std::is_nothrow_move_assignable<T>::value&& + std::is_nothrow_move_constructible<T>::value) { + MoveAssign(std::move(other)); + return *this; + } + + template <typename U> + void CopyAssign(const OptionalBase<U>& other) { + if (other.storage_.is_populated_) + InitOrAssign(other.storage_.value_); + else + FreeIfNeeded(); + } + + template <typename U> + void MoveAssign(OptionalBase<U>&& other) { + if (other.storage_.is_populated_) + InitOrAssign(std::move(other.storage_.value_)); + else + FreeIfNeeded(); + } + + template <typename U> + void InitOrAssign(U&& value) { + if (storage_.is_populated_) + storage_.value_ = std::forward<U>(value); + else + storage_.Init(std::forward<U>(value)); + } + + void FreeIfNeeded() { + if (!storage_.is_populated_) + return; + storage_.value_.~T(); + storage_.is_populated_ = false; + } + + // For implementing conversion, allow 
access to other typed OptionalBase + // class. + template <typename U> + friend class OptionalBase; + + OptionalStorage<T> storage_; +}; + +// The following {Copy,Move}{Constructible,Assignable} structs are helpers to +// implement constructor/assign-operator overloading. Specifically, if T is +// is not movable but copyable, Optional<T>'s move constructor should not +// participate in overload resolution. This inheritance trick implements that. +template <bool is_copy_constructible> +struct CopyConstructible {}; + +template <> +struct CopyConstructible<false> { + constexpr CopyConstructible() = default; + constexpr CopyConstructible(const CopyConstructible&) = delete; + constexpr CopyConstructible(CopyConstructible&&) = default; + CopyConstructible& operator=(const CopyConstructible&) = default; + CopyConstructible& operator=(CopyConstructible&&) = default; +}; + +template <bool is_move_constructible> +struct MoveConstructible {}; + +template <> +struct MoveConstructible<false> { + constexpr MoveConstructible() = default; + constexpr MoveConstructible(const MoveConstructible&) = default; + constexpr MoveConstructible(MoveConstructible&&) = delete; + MoveConstructible& operator=(const MoveConstructible&) = default; + MoveConstructible& operator=(MoveConstructible&&) = default; +}; + +template <bool is_copy_assignable> +struct CopyAssignable {}; + +template <> +struct CopyAssignable<false> { + constexpr CopyAssignable() = default; + constexpr CopyAssignable(const CopyAssignable&) = default; + constexpr CopyAssignable(CopyAssignable&&) = default; + CopyAssignable& operator=(const CopyAssignable&) = delete; + CopyAssignable& operator=(CopyAssignable&&) = default; +}; + +template <bool is_move_assignable> +struct MoveAssignable {}; + +template <> +struct MoveAssignable<false> { + constexpr MoveAssignable() = default; + constexpr MoveAssignable(const MoveAssignable&) = default; + constexpr MoveAssignable(MoveAssignable&&) = default; + MoveAssignable& operator=(const 
MoveAssignable&) = default; + MoveAssignable& operator=(MoveAssignable&&) = delete; +}; + +// Helper to conditionally enable converting constructors and assign operators. +template <typename T, typename U> +struct IsConvertibleFromOptional + : std::integral_constant< + bool, + std::is_constructible<T, Optional<U>&>::value || + std::is_constructible<T, const Optional<U>&>::value || + std::is_constructible<T, Optional<U>&&>::value || + std::is_constructible<T, const Optional<U>&&>::value || + std::is_convertible<Optional<U>&, T>::value || + std::is_convertible<const Optional<U>&, T>::value || + std::is_convertible<Optional<U>&&, T>::value || + std::is_convertible<const Optional<U>&&, T>::value> {}; + +template <typename T, typename U> +struct IsAssignableFromOptional + : std::integral_constant< + bool, + IsConvertibleFromOptional<T, U>::value || + std::is_assignable<T&, Optional<U>&>::value || + std::is_assignable<T&, const Optional<U>&>::value || + std::is_assignable<T&, Optional<U>&&>::value || + std::is_assignable<T&, const Optional<U>&&>::value> {}; + +// Forward compatibility for C++17. +// Introduce one more deeper nested namespace to avoid leaking using std::swap. +namespace swappable_impl { +using std::swap; + +struct IsSwappableImpl { + // Tests if swap can be called. Check<T&>(0) returns true_type iff swap + // is available for T. Otherwise, Check's overload resolution falls back + // to Check(...) declared below thanks to SFINAE, so returns false_type. + template <typename T> + static auto Check(int) + -> decltype(swap(std::declval<T>(), std::declval<T>()), std::true_type()); + + template <typename T> + static std::false_type Check(...); +}; +} // namespace swappable_impl + +template <typename T> +struct IsSwappable : decltype(swappable_impl::IsSwappableImpl::Check<T&>(0)) {}; + +// Forward compatibility for C++20. 
template <typename T>
using RemoveCvRefT = std::remove_cv_t<std::remove_reference_t<T>>;

}  // namespace internal

// On Windows, by default, empty-base class optimization does not work,
// which means even if the base class is empty struct, it still consumes one
// byte for its body. __declspec(empty_bases) enables the optimization.
// cf)
// https://blogs.msdn.microsoft.com/vcblog/2016/03/30/optimizing-the-layout-of-empty-base-classes-in-vs2015-update-2-3/
// NOTE(review): OS_WIN is presumably provided by the build configuration or
// by one of the project headers included above - confirm.
#ifdef OS_WIN
#define OPTIONAL_DECLSPEC_EMPTY_BASES __declspec(empty_bases)
#else
#define OPTIONAL_DECLSPEC_EMPTY_BASES
#endif

// gurl_base::Optional is a Chromium version of the C++17 optional class:
// std::optional documentation:
// http://en.cppreference.com/w/cpp/utility/optional
// Chromium documentation:
// https://chromium.googlesource.com/chromium/src/+/master/docs/optional.md
//
// These are the differences between the specification and the implementation:
// - Constructors do not use 'constexpr' as it is a C++14 extension.
// - 'constexpr' might be missing in some places for reasons specified locally.
// - No exceptions are thrown, because they are banned from Chromium.
//   Marked noexcept for only move constructor and move assign operators.
// - All the non-members are in the 'gurl_base' namespace instead of 'std'.
//
// Note that T cannot have a constructor T(Optional<T>) etc. Optional<T> checks
// T's constructor (specifically via IsConvertibleFromOptional), and in the
// check whether T can be constructible from Optional<T>, which is recursive
// so it does not work. As of Feb 2018, std::optional C++17 implementation in
// both clang and gcc has same limitation. MSVC SFINAE looks to have different
// behavior, but anyway it reports an error, too.
+template <typename T> +class OPTIONAL_DECLSPEC_EMPTY_BASES Optional + : public internal::OptionalBase<T>, + public internal::CopyConstructible<std::is_copy_constructible<T>::value>, + public internal::MoveConstructible<std::is_move_constructible<T>::value>, + public internal::CopyAssignable<std::is_copy_constructible<T>::value && + std::is_copy_assignable<T>::value>, + public internal::MoveAssignable<std::is_move_constructible<T>::value && + std::is_move_assignable<T>::value> { + private: + // Disable some versions of T that are ill-formed. + // See: https://timsong-cpp.github.io/cppwp/n4659/optional#syn-1 + static_assert( + !std::is_same<internal::RemoveCvRefT<T>, in_place_t>::value, + "instantiation of gurl_base::Optional with in_place_t is ill-formed"); + static_assert(!std::is_same<internal::RemoveCvRefT<T>, nullopt_t>::value, + "instantiation of gurl_base::Optional with nullopt_t is ill-formed"); + static_assert( + !std::is_reference<T>::value, + "instantiation of gurl_base::Optional with a reference type is ill-formed"); + // See: https://timsong-cpp.github.io/cppwp/n4659/optional#optional-3 + static_assert(std::is_destructible<T>::value, + "instantiation of gurl_base::Optional with a non-destructible type " + "is ill-formed"); + // Arrays are explicitly disallowed because for arrays of known bound + // is_destructible is of undefined value. + // See: https://en.cppreference.com/w/cpp/types/is_destructible + static_assert( + !std::is_array<T>::value, + "instantiation of gurl_base::Optional with an array type is ill-formed"); + + public: +#undef OPTIONAL_DECLSPEC_EMPTY_BASES + using value_type = T; + + // Defer default/copy/move constructor implementation to OptionalBase. 
+ constexpr Optional() = default; + constexpr Optional(const Optional& other) = default; + constexpr Optional(Optional&& other) noexcept( + std::is_nothrow_move_constructible<T>::value) = default; + + constexpr Optional(nullopt_t) {} // NOLINT(runtime/explicit) + + // Converting copy constructor. "explicit" only if + // std::is_convertible<const U&, T>::value is false. It is implemented by + // declaring two almost same constructors, but that condition in enable_if_t + // is different, so that either one is chosen, thanks to SFINAE. + template < + typename U, + std::enable_if_t<std::is_constructible<T, const U&>::value && + !internal::IsConvertibleFromOptional<T, U>::value && + std::is_convertible<const U&, T>::value, + bool> = false> + Optional(const Optional<U>& other) : internal::OptionalBase<T>(other) {} + + template < + typename U, + std::enable_if_t<std::is_constructible<T, const U&>::value && + !internal::IsConvertibleFromOptional<T, U>::value && + !std::is_convertible<const U&, T>::value, + bool> = false> + explicit Optional(const Optional<U>& other) + : internal::OptionalBase<T>(other) {} + + // Converting move constructor. Similar to converting copy constructor, + // declaring two (explicit and non-explicit) constructors. + template < + typename U, + std::enable_if_t<std::is_constructible<T, U&&>::value && + !internal::IsConvertibleFromOptional<T, U>::value && + std::is_convertible<U&&, T>::value, + bool> = false> + Optional(Optional<U>&& other) : internal::OptionalBase<T>(std::move(other)) {} + + template < + typename U, + std::enable_if_t<std::is_constructible<T, U&&>::value && + !internal::IsConvertibleFromOptional<T, U>::value && + !std::is_convertible<U&&, T>::value, + bool> = false> + explicit Optional(Optional<U>&& other) + : internal::OptionalBase<T>(std::move(other)) {} + + template <class... Args> + constexpr explicit Optional(in_place_t, Args&&... args) + : internal::OptionalBase<T>(in_place, std::forward<Args>(args)...) 
{} + + template < + class U, + class... Args, + class = std::enable_if_t<std::is_constructible<value_type, + std::initializer_list<U>&, + Args...>::value>> + constexpr explicit Optional(in_place_t, + std::initializer_list<U> il, + Args&&... args) + : internal::OptionalBase<T>(in_place, il, std::forward<Args>(args)...) {} + + // Forward value constructor. Similar to converting constructors, + // conditionally explicit. + template < + typename U = value_type, + std::enable_if_t< + std::is_constructible<T, U&&>::value && + !std::is_same<internal::RemoveCvRefT<U>, in_place_t>::value && + !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value && + std::is_convertible<U&&, T>::value, + bool> = false> + constexpr Optional(U&& value) + : internal::OptionalBase<T>(in_place, std::forward<U>(value)) {} + + template < + typename U = value_type, + std::enable_if_t< + std::is_constructible<T, U&&>::value && + !std::is_same<internal::RemoveCvRefT<U>, in_place_t>::value && + !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value && + !std::is_convertible<U&&, T>::value, + bool> = false> + constexpr explicit Optional(U&& value) + : internal::OptionalBase<T>(in_place, std::forward<U>(value)) {} + + ~Optional() = default; + + // Defer copy-/move- assign operator implementation to OptionalBase. + Optional& operator=(const Optional& other) = default; + Optional& operator=(Optional&& other) noexcept( + std::is_nothrow_move_assignable<T>::value&& + std::is_nothrow_move_constructible<T>::value) = default; + + Optional& operator=(nullopt_t) { + FreeIfNeeded(); + return *this; + } + + // Perfect-forwarded assignment. 
+ template <typename U> + std::enable_if_t< + !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value && + std::is_constructible<T, U>::value && + std::is_assignable<T&, U>::value && + (!std::is_scalar<T>::value || + !std::is_same<std::decay_t<U>, T>::value), + Optional&> + operator=(U&& value) { + InitOrAssign(std::forward<U>(value)); + return *this; + } + + // Copy assign the state of other. + template <typename U> + std::enable_if_t<!internal::IsAssignableFromOptional<T, U>::value && + std::is_constructible<T, const U&>::value && + std::is_assignable<T&, const U&>::value, + Optional&> + operator=(const Optional<U>& other) { + CopyAssign(other); + return *this; + } + + // Move assign the state of other. + template <typename U> + std::enable_if_t<!internal::IsAssignableFromOptional<T, U>::value && + std::is_constructible<T, U>::value && + std::is_assignable<T&, U>::value, + Optional&> + operator=(Optional<U>&& other) { + MoveAssign(std::move(other)); + return *this; + } + + constexpr const T* operator->() const { + GURL_CHECK(storage_.is_populated_); + return &storage_.value_; + } + + constexpr T* operator->() { + GURL_CHECK(storage_.is_populated_); + return &storage_.value_; + } + + constexpr const T& operator*() const & { + GURL_CHECK(storage_.is_populated_); + return storage_.value_; + } + + constexpr T& operator*() & { + GURL_CHECK(storage_.is_populated_); + return storage_.value_; + } + + constexpr const T&& operator*() const && { + GURL_CHECK(storage_.is_populated_); + return std::move(storage_.value_); + } + + constexpr T&& operator*() && { + GURL_CHECK(storage_.is_populated_); + return std::move(storage_.value_); + } + + constexpr explicit operator bool() const { return storage_.is_populated_; } + + constexpr bool has_value() const { return storage_.is_populated_; } + + constexpr T& value() & { + GURL_CHECK(storage_.is_populated_); + return storage_.value_; + } + + constexpr const T& value() const & { + GURL_CHECK(storage_.is_populated_); + return 
storage_.value_; + } + + constexpr T&& value() && { + GURL_CHECK(storage_.is_populated_); + return std::move(storage_.value_); + } + + constexpr const T&& value() const && { + GURL_CHECK(storage_.is_populated_); + return std::move(storage_.value_); + } + + template <class U> + constexpr T value_or(U&& default_value) const& { + // TODO(mlamouri): add the following assert when possible: + // static_assert(std::is_copy_constructible<T>::value, + // "T must be copy constructible"); + static_assert(std::is_convertible<U, T>::value, + "U must be convertible to T"); + return storage_.is_populated_ + ? storage_.value_ + : static_cast<T>(std::forward<U>(default_value)); + } + + template <class U> + constexpr T value_or(U&& default_value) && { + // TODO(mlamouri): add the following assert when possible: + // static_assert(std::is_move_constructible<T>::value, + // "T must be move constructible"); + static_assert(std::is_convertible<U, T>::value, + "U must be convertible to T"); + return storage_.is_populated_ + ? std::move(storage_.value_) + : static_cast<T>(std::forward<U>(default_value)); + } + + void swap(Optional& other) { + if (!storage_.is_populated_ && !other.storage_.is_populated_) + return; + + if (storage_.is_populated_ != other.storage_.is_populated_) { + if (storage_.is_populated_) { + other.storage_.Init(std::move(storage_.value_)); + FreeIfNeeded(); + } else { + storage_.Init(std::move(other.storage_.value_)); + other.FreeIfNeeded(); + } + return; + } + + GURL_DCHECK(storage_.is_populated_ && other.storage_.is_populated_); + using std::swap; + swap(**this, *other); + } + + void reset() { FreeIfNeeded(); } + + template <class... Args> + T& emplace(Args&&... args) { + FreeIfNeeded(); + storage_.Init(std::forward<Args>(args)...); + return storage_.value_; + } + + template <class U, class... Args> + std::enable_if_t< + std::is_constructible<T, std::initializer_list<U>&, Args&&...>::value, + T&> + emplace(std::initializer_list<U> il, Args&&... 
args) { + FreeIfNeeded(); + storage_.Init(il, std::forward<Args>(args)...); + return storage_.value_; + } + + private: + // Accessing template base class's protected member needs explicit + // declaration to do so. + using internal::OptionalBase<T>::CopyAssign; + using internal::OptionalBase<T>::FreeIfNeeded; + using internal::OptionalBase<T>::InitOrAssign; + using internal::OptionalBase<T>::MoveAssign; + using internal::OptionalBase<T>::storage_; +}; + +// Here after defines comparation operators. The definition follows +// http://en.cppreference.com/w/cpp/utility/optional/operator_cmp +// while bool() casting is replaced by has_value() to meet the chromium +// style guide. +template <class T, class U> +constexpr bool operator==(const Optional<T>& lhs, const Optional<U>& rhs) { + if (lhs.has_value() != rhs.has_value()) + return false; + if (!lhs.has_value()) + return true; + return *lhs == *rhs; +} + +template <class T, class U> +constexpr bool operator!=(const Optional<T>& lhs, const Optional<U>& rhs) { + if (lhs.has_value() != rhs.has_value()) + return true; + if (!lhs.has_value()) + return false; + return *lhs != *rhs; +} + +template <class T, class U> +constexpr bool operator<(const Optional<T>& lhs, const Optional<U>& rhs) { + if (!rhs.has_value()) + return false; + if (!lhs.has_value()) + return true; + return *lhs < *rhs; +} + +template <class T, class U> +constexpr bool operator<=(const Optional<T>& lhs, const Optional<U>& rhs) { + if (!lhs.has_value()) + return true; + if (!rhs.has_value()) + return false; + return *lhs <= *rhs; +} + +template <class T, class U> +constexpr bool operator>(const Optional<T>& lhs, const Optional<U>& rhs) { + if (!lhs.has_value()) + return false; + if (!rhs.has_value()) + return true; + return *lhs > *rhs; +} + +template <class T, class U> +constexpr bool operator>=(const Optional<T>& lhs, const Optional<U>& rhs) { + if (!rhs.has_value()) + return true; + if (!lhs.has_value()) + return false; + return *lhs >= *rhs; +} + 
+template <class T> +constexpr bool operator==(const Optional<T>& opt, nullopt_t) { + return !opt; +} + +template <class T> +constexpr bool operator==(nullopt_t, const Optional<T>& opt) { + return !opt; +} + +template <class T> +constexpr bool operator!=(const Optional<T>& opt, nullopt_t) { + return opt.has_value(); +} + +template <class T> +constexpr bool operator!=(nullopt_t, const Optional<T>& opt) { + return opt.has_value(); +} + +template <class T> +constexpr bool operator<(const Optional<T>& opt, nullopt_t) { + return false; +} + +template <class T> +constexpr bool operator<(nullopt_t, const Optional<T>& opt) { + return opt.has_value(); +} + +template <class T> +constexpr bool operator<=(const Optional<T>& opt, nullopt_t) { + return !opt; +} + +template <class T> +constexpr bool operator<=(nullopt_t, const Optional<T>& opt) { + return true; +} + +template <class T> +constexpr bool operator>(const Optional<T>& opt, nullopt_t) { + return opt.has_value(); +} + +template <class T> +constexpr bool operator>(nullopt_t, const Optional<T>& opt) { + return false; +} + +template <class T> +constexpr bool operator>=(const Optional<T>& opt, nullopt_t) { + return true; +} + +template <class T> +constexpr bool operator>=(nullopt_t, const Optional<T>& opt) { + return !opt; +} + +template <class T, class U> +constexpr bool operator==(const Optional<T>& opt, const U& value) { + return opt.has_value() ? *opt == value : false; +} + +template <class T, class U> +constexpr bool operator==(const U& value, const Optional<T>& opt) { + return opt.has_value() ? value == *opt : false; +} + +template <class T, class U> +constexpr bool operator!=(const Optional<T>& opt, const U& value) { + return opt.has_value() ? *opt != value : true; +} + +template <class T, class U> +constexpr bool operator!=(const U& value, const Optional<T>& opt) { + return opt.has_value() ? 
value != *opt : true; +} + +template <class T, class U> +constexpr bool operator<(const Optional<T>& opt, const U& value) { + return opt.has_value() ? *opt < value : true; +} + +template <class T, class U> +constexpr bool operator<(const U& value, const Optional<T>& opt) { + return opt.has_value() ? value < *opt : false; +} + +template <class T, class U> +constexpr bool operator<=(const Optional<T>& opt, const U& value) { + return opt.has_value() ? *opt <= value : true; +} + +template <class T, class U> +constexpr bool operator<=(const U& value, const Optional<T>& opt) { + return opt.has_value() ? value <= *opt : false; +} + +template <class T, class U> +constexpr bool operator>(const Optional<T>& opt, const U& value) { + return opt.has_value() ? *opt > value : false; +} + +template <class T, class U> +constexpr bool operator>(const U& value, const Optional<T>& opt) { + return opt.has_value() ? value > *opt : true; +} + +template <class T, class U> +constexpr bool operator>=(const Optional<T>& opt, const U& value) { + return opt.has_value() ? *opt >= value : false; +} + +template <class T, class U> +constexpr bool operator>=(const U& value, const Optional<T>& opt) { + return opt.has_value() ? value >= *opt : true; +} + +template <class T> +constexpr Optional<std::decay_t<T>> make_optional(T&& value) { + return Optional<std::decay_t<T>>(std::forward<T>(value)); +} + +template <class T, class... Args> +constexpr Optional<T> make_optional(Args&&... args) { + return Optional<T>(in_place, std::forward<Args>(args)...); +} + +template <class T, class U, class... Args> +constexpr Optional<T> make_optional(std::initializer_list<U> il, + Args&&... args) { + return Optional<T>(in_place, il, std::forward<Args>(args)...); +} + +// Partial specialization for a function template is not allowed. Also, it is +// not allowed to add overload function to std namespace, while it is allowed +// to specialize the template in std. 
Thus, swap() (kind of) overloading is +// defined in base namespace, instead. +template <class T> +std::enable_if_t<std::is_move_constructible<T>::value && + internal::IsSwappable<T>::value> +swap(Optional<T>& lhs, Optional<T>& rhs) { + lhs.swap(rhs); +} + +} // namespace base + +namespace std { + +template <class T> +struct hash<gurl_base::Optional<T>> { + size_t operator()(const gurl_base::Optional<T>& opt) const { + return opt == gurl_base::nullopt ? 0 : std::hash<T>()(*opt); + } +}; + +} // namespace std + +#endif // BASE_OPTIONAL_H_
diff --git a/base/stl_util.h b/base/stl_util.h new file mode 100644 index 0000000..d6ca464 --- /dev/null +++ b/base/stl_util.h
@@ -0,0 +1,657 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Derived from google3/util/gtl/stl_util.h + +#ifndef BASE_STL_UTIL_H_ +#define BASE_STL_UTIL_H_ + +#include <algorithm> +#include <deque> +#include <forward_list> +#include <functional> +#include <initializer_list> +#include <iterator> +#include <list> +#include <map> +#include <set> +#include <string> +#include <type_traits> +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "polyfills/base/logging.h" +#include "base/optional.h" +#include "base/template_util.h" + +namespace gurl_base { + +namespace internal { + +// Calls erase on iterators of matching elements. +template <typename Container, typename Predicate> +void IterateAndEraseIf(Container& container, Predicate pred) { + for (auto it = container.begin(); it != container.end();) { + if (pred(*it)) + it = container.erase(it); + else + ++it; + } +} + +template <typename Iter> +constexpr bool IsRandomAccessIter = + std::is_same<typename std::iterator_traits<Iter>::iterator_category, + std::random_access_iterator_tag>::value; + +// Utility type traits used for specializing gurl_base::Contains() below. 
+template <typename Container, typename Element, typename = void> +struct HasFindWithNpos : std::false_type {}; + +template <typename Container, typename Element> +struct HasFindWithNpos< + Container, + Element, + void_t<decltype(std::declval<const Container&>().find( + std::declval<const Element&>()) != Container::npos)>> + : std::true_type {}; + +template <typename Container, typename Element, typename = void> +struct HasFindWithEnd : std::false_type {}; + +template <typename Container, typename Element> +struct HasFindWithEnd<Container, + Element, + void_t<decltype(std::declval<const Container&>().find( + std::declval<const Element&>()) != + std::declval<const Container&>().end())>> + : std::true_type {}; + +template <typename Container, typename Element, typename = void> +struct HasContains : std::false_type {}; + +template <typename Container, typename Element> +struct HasContains<Container, + Element, + void_t<decltype(std::declval<const Container&>().contains( + std::declval<const Element&>()))>> : std::true_type {}; + +} // namespace internal + +// C++14 implementation of C++17's std::size(): +// http://en.cppreference.com/w/cpp/iterator/size +template <typename Container> +constexpr auto size(const Container& c) -> decltype(c.size()) { + return c.size(); +} + +template <typename T, size_t N> +constexpr size_t size(const T (&array)[N]) noexcept { + return N; +} + +// C++14 implementation of C++17's std::empty(): +// http://en.cppreference.com/w/cpp/iterator/empty +template <typename Container> +constexpr auto empty(const Container& c) -> decltype(c.empty()) { + return c.empty(); +} + +template <typename T, size_t N> +constexpr bool empty(const T (&array)[N]) noexcept { + return false; +} + +template <typename T> +constexpr bool empty(std::initializer_list<T> il) noexcept { + return il.size() == 0; +} + +// C++14 implementation of C++17's std::data(): +// http://en.cppreference.com/w/cpp/iterator/data +template <typename Container> +constexpr auto 
data(Container& c) -> decltype(c.data()) { + return c.data(); +} + +// std::basic_string::data() had no mutable overload prior to C++17 [1]. +// Hence this overload is provided. +// Note: str[0] is safe even for empty strings, as they are guaranteed to be +// null-terminated [2]. +// +// [1] http://en.cppreference.com/w/cpp/string/basic_string/data +// [2] http://en.cppreference.com/w/cpp/string/basic_string/operator_at +template <typename CharT, typename Traits, typename Allocator> +CharT* data(std::basic_string<CharT, Traits, Allocator>& str) { + return std::addressof(str[0]); +} + +template <typename Container> +constexpr auto data(const Container& c) -> decltype(c.data()) { + return c.data(); +} + +template <typename T, size_t N> +constexpr T* data(T (&array)[N]) noexcept { + return array; +} + +template <typename T> +constexpr const T* data(std::initializer_list<T> il) noexcept { + return il.begin(); +} + +// Returns a const reference to the underlying container of a container adapter. +// Works for std::priority_queue, std::queue, and std::stack. +template <class A> +const typename A::container_type& GetUnderlyingContainer(const A& adapter) { + struct ExposedAdapter : A { + using A::c; + }; + return adapter.*&ExposedAdapter::c; +} + +// Clears internal memory of an STL object. +// STL clear()/reserve(0) does not always free internal memory allocated +// This function uses swap/destructor to ensure the internal memory is freed. +template<class T> +void STLClearObject(T* obj) { + T tmp; + tmp.swap(*obj); + // Sometimes "T tmp" allocates objects with memory (arena implementation?). + // Hence using additional reserve(0) even if it doesn't always work. + obj->reserve(0); +} + +// Counts the number of instances of val in a container. 
+template <typename Container, typename T> +typename std::iterator_traits< + typename Container::const_iterator>::difference_type +STLCount(const Container& container, const T& val) { + return std::count(container.begin(), container.end(), val); +} + +// General purpose implementation to check if |container| contains |value|. +template <typename Container, + typename Value, + std::enable_if_t< + !internal::HasFindWithNpos<Container, Value>::value && + !internal::HasFindWithEnd<Container, Value>::value && + !internal::HasContains<Container, Value>::value>* = nullptr> +bool Contains(const Container& container, const Value& value) { + using std::begin; + using std::end; + return std::find(begin(container), end(container), value) != end(container); +} + +// Specialized Contains() implementation for when |container| has a find() +// member function and a static npos member, but no contains() member function. +template <typename Container, + typename Value, + std::enable_if_t<internal::HasFindWithNpos<Container, Value>::value && + !internal::HasContains<Container, Value>::value>* = + nullptr> +bool Contains(const Container& container, const Value& value) { + return container.find(value) != Container::npos; +} + +// Specialized Contains() implementation for when |container| has a find() +// and end() member function, but no contains() member function. +template <typename Container, + typename Value, + std::enable_if_t<internal::HasFindWithEnd<Container, Value>::value && + !internal::HasContains<Container, Value>::value>* = + nullptr> +bool Contains(const Container& container, const Value& value) { + return container.find(value) != container.end(); +} + +// Specialized Contains() implementation for when |container| has a contains() +// member function. 
+template < + typename Container, + typename Value, + std::enable_if_t<internal::HasContains<Container, Value>::value>* = nullptr> +bool Contains(const Container& container, const Value& value) { + return container.contains(value); +} + +// O(1) implementation of const casting an iterator for any sequence, +// associative or unordered associative container in the STL. +// +// Reference: https://stackoverflow.com/a/10669041 +template <typename Container, + typename ConstIter, + std::enable_if_t<!internal::IsRandomAccessIter<ConstIter>>* = nullptr> +constexpr auto ConstCastIterator(Container& c, ConstIter it) { + return c.erase(it, it); +} + +// Explicit overload for std::forward_list where erase() is named erase_after(). +template <typename T, typename Allocator> +constexpr auto ConstCastIterator( + std::forward_list<T, Allocator>& c, + typename std::forward_list<T, Allocator>::const_iterator it) { +// The erase_after(it, it) trick used below does not work for libstdc++ [1], +// thus we need a different way. +// TODO(crbug.com/972541): Remove this workaround once libstdc++ is fixed on all +// platforms. +// +// [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90857 +#if defined(__GLIBCXX__) + return c.insert_after(it, {}); +#else + return c.erase_after(it, it); +#endif +} + +// Specialized O(1) const casting for random access iterators. This is +// necessary, because erase() is either not available (e.g. array-like +// containers), or has O(n) complexity (e.g. std::deque or std::vector). 
+template <typename Container, + typename ConstIter, + std::enable_if_t<internal::IsRandomAccessIter<ConstIter>>* = nullptr> +constexpr auto ConstCastIterator(Container& c, ConstIter it) { + using std::begin; + using std::cbegin; + return begin(c) + (it - cbegin(c)); +} + +namespace internal { + +template <typename Map, typename Key, typename Value> +std::pair<typename Map::iterator, bool> InsertOrAssignImpl(Map& map, + Key&& key, + Value&& value) { + auto lower = map.lower_bound(key); + if (lower != map.end() && !map.key_comp()(key, lower->first)) { + // key already exists, perform assignment. + lower->second = std::forward<Value>(value); + return {lower, false}; + } + + // key did not yet exist, insert it. + return {map.emplace_hint(lower, std::forward<Key>(key), + std::forward<Value>(value)), + true}; +} + +template <typename Map, typename Key, typename Value> +typename Map::iterator InsertOrAssignImpl(Map& map, + typename Map::const_iterator hint, + Key&& key, + Value&& value) { + auto&& key_comp = map.key_comp(); + if ((hint == map.begin() || key_comp(std::prev(hint)->first, key))) { + if (hint == map.end() || key_comp(key, hint->first)) { + // *(hint - 1) < key < *hint => key did not exist and hint is correct. + return map.emplace_hint(hint, std::forward<Key>(key), + std::forward<Value>(value)); + } + + if (!key_comp(hint->first, key)) { + // key == *hint => key already exists and hint is correct. + auto mutable_hint = ConstCastIterator(map, hint); + mutable_hint->second = std::forward<Value>(value); + return mutable_hint; + } + } + + // hint was not helpful, dispatch to hintless version. + return InsertOrAssignImpl(map, std::forward<Key>(key), + std::forward<Value>(value)) + .first; +} + +template <typename Map, typename Key, typename... Args> +std::pair<typename Map::iterator, bool> TryEmplaceImpl(Map& map, + Key&& key, + Args&&... 
args) { + auto lower = map.lower_bound(key); + if (lower != map.end() && !map.key_comp()(key, lower->first)) { + // key already exists, do nothing. + return {lower, false}; + } + + // key did not yet exist, insert it. + return {map.emplace_hint(lower, std::piecewise_construct, + std::forward_as_tuple(std::forward<Key>(key)), + std::forward_as_tuple(std::forward<Args>(args)...)), + true}; +} + +template <typename Map, typename Key, typename... Args> +typename Map::iterator TryEmplaceImpl(Map& map, + typename Map::const_iterator hint, + Key&& key, + Args&&... args) { + auto&& key_comp = map.key_comp(); + if ((hint == map.begin() || key_comp(std::prev(hint)->first, key))) { + if (hint == map.end() || key_comp(key, hint->first)) { + // *(hint - 1) < key < *hint => key did not exist and hint is correct. + return map.emplace_hint( + hint, std::piecewise_construct, + std::forward_as_tuple(std::forward<Key>(key)), + std::forward_as_tuple(std::forward<Args>(args)...)); + } + + if (!key_comp(hint->first, key)) { + // key == *hint => no-op, return correct hint. + return ConstCastIterator(map, hint); + } + } + + // hint was not helpful, dispatch to hintless version. + return TryEmplaceImpl(map, std::forward<Key>(key), + std::forward<Args>(args)...) + .first; +} + +} // namespace internal + +// Implementation of C++17's std::map::insert_or_assign as a free function. +template <typename Map, typename Value> +std::pair<typename Map::iterator, bool> +InsertOrAssign(Map& map, const typename Map::key_type& key, Value&& value) { + return internal::InsertOrAssignImpl(map, key, std::forward<Value>(value)); +} + +template <typename Map, typename Value> +std::pair<typename Map::iterator, bool> +InsertOrAssign(Map& map, typename Map::key_type&& key, Value&& value) { + return internal::InsertOrAssignImpl(map, std::move(key), + std::forward<Value>(value)); +} + +// Implementation of C++17's std::map::insert_or_assign with hint as a free +// function. 
+template <typename Map, typename Value> +typename Map::iterator InsertOrAssign(Map& map, + typename Map::const_iterator hint, + const typename Map::key_type& key, + Value&& value) { + return internal::InsertOrAssignImpl(map, hint, key, + std::forward<Value>(value)); +} + +template <typename Map, typename Value> +typename Map::iterator InsertOrAssign(Map& map, + typename Map::const_iterator hint, + typename Map::key_type&& key, + Value&& value) { + return internal::InsertOrAssignImpl(map, hint, std::move(key), + std::forward<Value>(value)); +} + +// Implementation of C++17's std::map::try_emplace as a free function. +template <typename Map, typename... Args> +std::pair<typename Map::iterator, bool> +TryEmplace(Map& map, const typename Map::key_type& key, Args&&... args) { + return internal::TryEmplaceImpl(map, key, std::forward<Args>(args)...); +} + +template <typename Map, typename... Args> +std::pair<typename Map::iterator, bool> TryEmplace(Map& map, + typename Map::key_type&& key, + Args&&... args) { + return internal::TryEmplaceImpl(map, std::move(key), + std::forward<Args>(args)...); +} + +// Implementation of C++17's std::map::try_emplace with hint as a free +// function. +template <typename Map, typename... Args> +typename Map::iterator TryEmplace(Map& map, + typename Map::const_iterator hint, + const typename Map::key_type& key, + Args&&... args) { + return internal::TryEmplaceImpl(map, hint, key, std::forward<Args>(args)...); +} + +template <typename Map, typename... Args> +typename Map::iterator TryEmplace(Map& map, + typename Map::const_iterator hint, + typename Map::key_type&& key, + Args&&... args) { + return internal::TryEmplaceImpl(map, hint, std::move(key), + std::forward<Args>(args)...); +} + +// Returns true if the container is sorted. 
template <typename Container>
bool STLIsSorted(const Container& cont) {
  const auto first = std::begin(cont);
  const auto last = std::end(cont);
  return std::is_sorted(first, last);
}

// Builds a ResultType holding the elements of sorted |a1| that are absent
// from sorted |a2|.
template <typename ResultType, typename Arg1, typename Arg2>
ResultType STLSetDifference(const Arg1& a1, const Arg2& a2) {
  GURL_DCHECK(STLIsSorted(a1));
  GURL_DCHECK(STLIsSorted(a2));
  ResultType result;
  auto sink = std::inserter(result, result.end());
  std::set_difference(a1.begin(), a1.end(), a2.begin(), a2.end(), sink);
  return result;
}

// Builds a ResultType holding the union of the two sorted containers.
template <typename ResultType, typename Arg1, typename Arg2>
ResultType STLSetUnion(const Arg1& a1, const Arg2& a2) {
  GURL_DCHECK(STLIsSorted(a1));
  GURL_DCHECK(STLIsSorted(a2));
  ResultType result;
  auto sink = std::inserter(result, result.end());
  std::set_union(a1.begin(), a1.end(), a2.begin(), a2.end(), sink);
  return result;
}

// Builds a ResultType holding the intersection of the two sorted containers.
template <typename ResultType, typename Arg1, typename Arg2>
ResultType STLSetIntersection(const Arg1& a1, const Arg2& a2) {
  GURL_DCHECK(STLIsSorted(a1));
  GURL_DCHECK(STLIsSorted(a2));
  ResultType result;
  auto sink = std::inserter(result, result.end());
  std::set_intersection(a1.begin(), a1.end(), a2.begin(), a2.end(), sink);
  return result;
}

// Returns true if every element of the sorted container |a2| is also present
// in the sorted container |a1|.
template <typename Arg1, typename Arg2>
bool STLIncludes(const Arg1& a1, const Arg2& a2) {
  GURL_DCHECK(STLIsSorted(a1));
  GURL_DCHECK(STLIsSorted(a2));
  return std::includes(a1.begin(), a1.end(), a2.begin(), a2.end());
}

// Erase/EraseIf are based on library fundamentals ts v2 erase/erase_if
// http://en.cppreference.com/w/cpp/experimental/lib_extensions_2
// They provide a generic way to erase elements from a container.
+// The functions here implement these for the standard containers until those +// functions are available in the C++ standard. +// For Chromium containers overloads should be defined in their own headers +// (like standard containers). +// Note: there is no std::erase for standard associative containers so we don't +// have it either. + +template <typename CharT, typename Traits, typename Allocator, typename Value> +void Erase(std::basic_string<CharT, Traits, Allocator>& container, + const Value& value) { + container.erase(std::remove(container.begin(), container.end(), value), + container.end()); +} + +template <typename CharT, typename Traits, typename Allocator, class Predicate> +void EraseIf(std::basic_string<CharT, Traits, Allocator>& container, + Predicate pred) { + container.erase(std::remove_if(container.begin(), container.end(), pred), + container.end()); +} + +template <class T, class Allocator, class Value> +void Erase(std::deque<T, Allocator>& container, const Value& value) { + container.erase(std::remove(container.begin(), container.end(), value), + container.end()); +} + +template <class T, class Allocator, class Predicate> +void EraseIf(std::deque<T, Allocator>& container, Predicate pred) { + container.erase(std::remove_if(container.begin(), container.end(), pred), + container.end()); +} + +template <class T, class Allocator, class Value> +void Erase(std::vector<T, Allocator>& container, const Value& value) { + container.erase(std::remove(container.begin(), container.end(), value), + container.end()); +} + +template <class T, class Allocator, class Predicate> +void EraseIf(std::vector<T, Allocator>& container, Predicate pred) { + container.erase(std::remove_if(container.begin(), container.end(), pred), + container.end()); +} + +template <class T, class Allocator, class Value> +void Erase(std::forward_list<T, Allocator>& container, const Value& value) { + // Unlike std::forward_list::remove, this function template accepts + // heterogeneous types and 
does not force a conversion to the container's + // value type before invoking the == operator. + container.remove_if([&](const T& cur) { return cur == value; }); +} + +template <class T, class Allocator, class Predicate> +void EraseIf(std::forward_list<T, Allocator>& container, Predicate pred) { + container.remove_if(pred); +} + +template <class T, class Allocator, class Value> +void Erase(std::list<T, Allocator>& container, const Value& value) { + // Unlike std::list::remove, this function template accepts heterogeneous + // types and does not force a conversion to the container's value type before + // invoking the == operator. + container.remove_if([&](const T& cur) { return cur == value; }); +} + +template <class T, class Allocator, class Predicate> +void EraseIf(std::list<T, Allocator>& container, Predicate pred) { + container.remove_if(pred); +} + +template <class Key, class T, class Compare, class Allocator, class Predicate> +void EraseIf(std::map<Key, T, Compare, Allocator>& container, Predicate pred) { + internal::IterateAndEraseIf(container, pred); +} + +template <class Key, class T, class Compare, class Allocator, class Predicate> +void EraseIf(std::multimap<Key, T, Compare, Allocator>& container, + Predicate pred) { + internal::IterateAndEraseIf(container, pred); +} + +template <class Key, class Compare, class Allocator, class Predicate> +void EraseIf(std::set<Key, Compare, Allocator>& container, Predicate pred) { + internal::IterateAndEraseIf(container, pred); +} + +template <class Key, class Compare, class Allocator, class Predicate> +void EraseIf(std::multiset<Key, Compare, Allocator>& container, + Predicate pred) { + internal::IterateAndEraseIf(container, pred); +} + +template <class Key, + class T, + class Hash, + class KeyEqual, + class Allocator, + class Predicate> +void EraseIf(std::unordered_map<Key, T, Hash, KeyEqual, Allocator>& container, + Predicate pred) { + internal::IterateAndEraseIf(container, pred); +} + +template <class Key, + class 
T, + class Hash, + class KeyEqual, + class Allocator, + class Predicate> +void EraseIf( + std::unordered_multimap<Key, T, Hash, KeyEqual, Allocator>& container, + Predicate pred) { + internal::IterateAndEraseIf(container, pred); +} + +template <class Key, + class Hash, + class KeyEqual, + class Allocator, + class Predicate> +void EraseIf(std::unordered_set<Key, Hash, KeyEqual, Allocator>& container, + Predicate pred) { + internal::IterateAndEraseIf(container, pred); +} + +template <class Key, + class Hash, + class KeyEqual, + class Allocator, + class Predicate> +void EraseIf(std::unordered_multiset<Key, Hash, KeyEqual, Allocator>& container, + Predicate pred) { + internal::IterateAndEraseIf(container, pred); +} + +// A helper class to be used as the predicate with |EraseIf| to implement +// in-place set intersection. Helps implement the algorithm of going through +// each container an element at a time, erasing elements from the first +// container if they aren't in the second container. Requires each container be +// sorted. Note that the logic below appears inverted since it is returning +// whether an element should be erased. +template <class Collection> +class IsNotIn { + public: + explicit IsNotIn(const Collection& collection) + : i_(collection.begin()), end_(collection.end()) {} + + bool operator()(const typename Collection::value_type& x) { + while (i_ != end_ && *i_ < x) + ++i_; + if (i_ == end_) + return true; + if (*i_ == x) { + ++i_; + return false; + } + return true; + } + + private: + typename Collection::const_iterator i_; + const typename Collection::const_iterator end_; +}; + +// Helper for returning the optional value's address, or nullptr. +template <class T> +T* OptionalOrNullptr(gurl_base::Optional<T>& optional) { + return optional.has_value() ? &optional.value() : nullptr; +} + +template <class T> +const T* OptionalOrNullptr(const gurl_base::Optional<T>& optional) { + return optional.has_value() ? 
&optional.value() : nullptr; +} + +} // namespace base + +#endif // BASE_STL_UTIL_H_
diff --git a/base/strings/BUILD b/base/strings/BUILD new file mode 100644 index 0000000..60aebcf --- /dev/null +++ b/base/strings/BUILD
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Library target "strings": the string16, string_piece, string_util and UTF
# conversion sources of this directory together with their headers.
# NOTE(review): latin1_string_conversions.{h,cc} and nullable_string16.{h,cc}
# are added to this directory by the same change but are not listed in srcs or
# hdrs below -- confirm that leaving them out of the target is intentional.
cc_library(
    name = "strings",
    srcs = [
        "string16.cc",
        "string_piece.cc",
        "string_util.cc",
        "string_util_constants.cc",
        "utf_string_conversion_utils.cc",
        "utf_string_conversions.cc",
    ],
    hdrs = [
        "char_traits.h",
        "string16.h",
        "string_piece.h",
        "string_piece_forward.h",
        "string_util.h",
        "string_util_posix.h",
        "utf_string_conversion_utils.h",
        "utf_string_conversions.h",
    ],
    # Visible to every package.
    visibility = ["//visibility:public"],
    deps = [
        "//base",
        "//base/third_party/icu",
        "//build:build_config",
        "//polyfills",
    ],
)
diff --git a/base/strings/char_traits.h b/base/strings/char_traits.h new file mode 100644 index 0000000..0fe9f26 --- /dev/null +++ b/base/strings/char_traits.h
@@ -0,0 +1,92 @@ +// Copyright 2018 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_CHAR_TRAITS_H_ +#define BASE_STRINGS_CHAR_TRAITS_H_ + +#include <stddef.h> + +#include "base/compiler_specific.h" + +namespace gurl_base { + +// constexpr version of http://en.cppreference.com/w/cpp/string/char_traits. +// This currently just implements the bits needed to support a (mostly) +// constexpr StringPiece. +// +// TODO(dcheng): Once we switch to C++17, most methods will become constexpr and +// we can switch over to using the one in the standard library. +template <typename T> +struct CharTraits { + // Performs a lexographical comparison of the first N characters of |s1| and + // |s2|. Returns 0 if equal, -1 if |s1| is less than |s2|, and 1 if |s1| is + // greater than |s2|. + static constexpr int compare(const T* s1, const T* s2, size_t n) noexcept; + + // Returns the length of |s|, assuming null termination (and not including the + // terminating null). + static constexpr size_t length(const T* s) noexcept; +}; + +template <typename T> +constexpr int CharTraits<T>::compare(const T* s1, + const T* s2, + size_t n) noexcept { + for (; n; --n, ++s1, ++s2) { + if (*s1 < *s2) + return -1; + if (*s1 > *s2) + return 1; + } + return 0; +} + +template <typename T> +constexpr size_t CharTraits<T>::length(const T* s) noexcept { + size_t i = 0; + for (; *s; ++s) + ++i; + return i; +} + +// char specialization of CharTraits that can use clang's constexpr instrinsics, +// where available. 
+template <> +struct CharTraits<char> { + static constexpr int compare(const char* s1, + const char* s2, + size_t n) noexcept; + static constexpr size_t length(const char* s) noexcept; +}; + +constexpr int CharTraits<char>::compare(const char* s1, + const char* s2, + size_t n) noexcept { +#if HAS_FEATURE(cxx_constexpr_string_builtins) + return __builtin_memcmp(s1, s2, n); +#else + for (; n; --n, ++s1, ++s2) { + if (*s1 < *s2) + return -1; + if (*s1 > *s2) + return 1; + } + return 0; +#endif +} + +constexpr size_t CharTraits<char>::length(const char* s) noexcept { +#if defined(__clang__) + return __builtin_strlen(s); +#else + size_t i = 0; + for (; *s; ++s) + ++i; + return i; +#endif +} + +} // namespace base + +#endif // BASE_STRINGS_CHAR_TRAITS_H_
diff --git a/base/strings/char_traits_unittest.cc b/base/strings/char_traits_unittest.cc new file mode 100644 index 0000000..d0fdc07 --- /dev/null +++ b/base/strings/char_traits_unittest.cc
@@ -0,0 +1,32 @@ +// Copyright 2018 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/char_traits.h" +#include "base/strings/string16.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace gurl_base { + +TEST(CharTraitsTest, CharCompare) { + static_assert(CharTraits<char>::compare("abc", "def", 3) == -1, ""); + static_assert(CharTraits<char>::compare("def", "def", 3) == 0, ""); + static_assert(CharTraits<char>::compare("ghi", "def", 3) == 1, ""); +} + +TEST(CharTraitsTest, CharLength) { + static_assert(CharTraits<char>::length("") == 0, ""); + static_assert(CharTraits<char>::length("abc") == 3, ""); +} + +TEST(CharTraitsTest, Char16TCompare) { + static_assert(CharTraits<char16_t>::compare(u"abc", u"def", 3) == -1, ""); + static_assert(CharTraits<char16_t>::compare(u"def", u"def", 3) == 0, ""); + static_assert(CharTraits<char16_t>::compare(u"ghi", u"def", 3) == 1, ""); +} + +TEST(CharTraitsTest, Char16TLength) { + static_assert(CharTraits<char16_t>::length(u"abc") == 3, ""); +} + +} // namespace base
diff --git a/base/strings/latin1_string_conversions.cc b/base/strings/latin1_string_conversions.cc new file mode 100644 index 0000000..5569015 --- /dev/null +++ b/base/strings/latin1_string_conversions.cc
@@ -0,0 +1,19 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/latin1_string_conversions.h" + +namespace gurl_base { + +string16 Latin1OrUTF16ToUTF16(size_t length, + const Latin1Char* latin1, + const char16* utf16) { + if (!length) + return string16(); + if (latin1) + return string16(latin1, latin1 + length); + return string16(utf16, utf16 + length); +} + +} // namespace base
diff --git a/base/strings/latin1_string_conversions.h b/base/strings/latin1_string_conversions.h new file mode 100644 index 0000000..3d60980 --- /dev/null +++ b/base/strings/latin1_string_conversions.h
@@ -0,0 +1,34 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_LATIN1_STRING_CONVERSIONS_H_ +#define BASE_STRINGS_LATIN1_STRING_CONVERSIONS_H_ + +#include <stddef.h> + +#include <string> + +#include "polyfills/base/base_export.h" +#include "base/strings/string16.h" + +namespace gurl_base { + +// This definition of Latin1Char matches the definition of LChar in Blink. We +// use unsigned char rather than char to make less tempting to mix and match +// Latin-1 and UTF-8 characters.. +typedef unsigned char Latin1Char; + +// This somewhat odd function is designed to help us convert from Blink Strings +// to string16. A Blink string is either backed by an array of Latin-1 +// characters or an array of UTF-16 characters. This function is called by +// WebString::operator string16() to convert one or the other character array +// to string16. This function is defined here rather than in WebString.h to +// avoid binary bloat in all the callers of the conversion operator. +BASE_EXPORT string16 Latin1OrUTF16ToUTF16(size_t length, + const Latin1Char* latin1, + const char16* utf16); + +} // namespace base + +#endif // BASE_STRINGS_LATIN1_STRING_CONVERSIONS_H_
diff --git a/base/strings/nullable_string16.cc b/base/strings/nullable_string16.cc new file mode 100644 index 0000000..618800d --- /dev/null +++ b/base/strings/nullable_string16.cc
@@ -0,0 +1,33 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/nullable_string16.h" + +#include <ostream> +#include <utility> + +namespace gurl_base { +NullableString16::NullableString16() = default; +NullableString16::NullableString16(const NullableString16& other) = default; +NullableString16::NullableString16(NullableString16&& other) = default; + +NullableString16::NullableString16(const string16& string, bool is_null) { + if (!is_null) + string_.emplace(string); +} + +NullableString16::NullableString16(Optional<string16> optional_string16) + : string_(std::move(optional_string16)) {} + +NullableString16::~NullableString16() = default; +NullableString16& NullableString16::operator=(const NullableString16& other) = + default; +NullableString16& NullableString16::operator=(NullableString16&& other) = + default; + +std::ostream& operator<<(std::ostream& out, const NullableString16& value) { + return value.is_null() ? out << "(null)" : out << value.string(); +} + +} // namespace base
diff --git a/base/strings/nullable_string16.h b/base/strings/nullable_string16.h new file mode 100644 index 0000000..f2ca7bd --- /dev/null +++ b/base/strings/nullable_string16.h
@@ -0,0 +1,55 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_NULLABLE_STRING16_H_ +#define BASE_STRINGS_NULLABLE_STRING16_H_ + +#include <iosfwd> + +#include "polyfills/base/base_export.h" +#include "base/optional.h" +#include "base/strings/string16.h" +#include "base/strings/string_util.h" + +namespace gurl_base { + +// This class is a simple wrapper for string16 which also contains a null +// state. This should be used only where the difference between null and +// empty is meaningful. +class BASE_EXPORT NullableString16 { + public: + NullableString16(); + NullableString16(const NullableString16& other); + NullableString16(NullableString16&& other); + NullableString16(const string16& string, bool is_null); + explicit NullableString16(Optional<string16> optional_string16); + ~NullableString16(); + + NullableString16& operator=(const NullableString16& other); + NullableString16& operator=(NullableString16&& other); + + const string16& string() const { + return string_ ? *string_ : EmptyString16(); + } + bool is_null() const { return !string_; } + const Optional<string16>& as_optional_string16() const { return string_; } + + private: + Optional<string16> string_; +}; + +inline bool operator==(const NullableString16& a, const NullableString16& b) { + return a.as_optional_string16() == b.as_optional_string16(); +} + +inline bool operator!=(const NullableString16& a, const NullableString16& b) { + return !(a == b); +} + +BASE_EXPORT std::ostream& operator<<(std::ostream& out, + const NullableString16& value); + +} // namespace base + +#endif // BASE_STRINGS_NULLABLE_STRING16_H_
diff --git a/base/strings/nullable_string16_unittest.cc b/base/strings/nullable_string16_unittest.cc new file mode 100644 index 0000000..e3d063f --- /dev/null +++ b/base/strings/nullable_string16_unittest.cc
@@ -0,0 +1,35 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/nullable_string16.h" +#include "base/strings/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace gurl_base { + +TEST(NullableString16Test, DefaultConstructor) { + NullableString16 s; + EXPECT_TRUE(s.is_null()); + EXPECT_EQ(string16(), s.string()); +} + +TEST(NullableString16Test, Equals) { + NullableString16 a(ASCIIToUTF16("hello"), false); + NullableString16 b(ASCIIToUTF16("hello"), false); + EXPECT_EQ(a, b); +} + +TEST(NullableString16Test, NotEquals) { + NullableString16 a(ASCIIToUTF16("hello"), false); + NullableString16 b(ASCIIToUTF16("world"), false); + EXPECT_NE(a, b); +} + +TEST(NullableString16Test, NotEqualsNull) { + NullableString16 a(ASCIIToUTF16("hello"), false); + NullableString16 b; + EXPECT_NE(a, b); +} + +} // namespace base
diff --git a/base/strings/pattern.cc b/base/strings/pattern.cc new file mode 100644 index 0000000..65ec075 --- /dev/null +++ b/base/strings/pattern.cc
@@ -0,0 +1,155 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/pattern.h" + +#include "base/third_party/icu/icu_utf.h" + +namespace gurl_base { + +namespace { + +constexpr bool IsWildcard(base_icu::UChar32 character) { + return character == '*' || character == '?'; +} + +// Searches for the next subpattern of |pattern| in |string|, up to the given +// |maximum_distance|. The subpattern extends from the start of |pattern| up to +// the first wildcard character (or the end of the string). If the value of +// |maximum_distance| is negative, the maximum distance is considered infinite. +template <typename CHAR, typename NEXT> +constexpr bool SearchForChars(const CHAR** pattern, + const CHAR* pattern_end, + const CHAR** string, + const CHAR* string_end, + int maximum_distance, + NEXT next) { + const CHAR* pattern_start = *pattern; + const CHAR* string_start = *string; + bool escape = false; + while (true) { + if (*pattern == pattern_end) { + // If this is the end of the pattern, only accept the end of the string; + // anything else falls through to the mismatch case. + if (*string == string_end) + return true; + } else { + // If we have found a wildcard, we're done. + if (!escape && IsWildcard(**pattern)) + return true; + + // Check if the escape character is found. If so, skip it and move to the + // next character. + if (!escape && **pattern == '\\') { + escape = true; + next(pattern, pattern_end); + continue; + } + + escape = false; + + if (*string == string_end) + return false; + + // Check if the chars match, if so, increment the ptrs. 
+ const CHAR* pattern_next = *pattern; + const CHAR* string_next = *string; + base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end); + if (pattern_char == next(&string_next, string_end) && + pattern_char != CBU_SENTINEL) { + *pattern = pattern_next; + *string = string_next; + continue; + } + } + + // Mismatch. If we have reached the maximum distance, return false, + // otherwise restart at the beginning of the pattern with the next character + // in the string. + // TODO(bauerb): This is a naive implementation of substring search, which + // could be implemented with a more efficient algorithm, e.g. + // Knuth-Morris-Pratt (at the expense of requiring preprocessing). + if (maximum_distance == 0) + return false; + + // Because unlimited distance is represented as -1, this will never reach 0 + // and therefore fail the match above. + maximum_distance--; + *pattern = pattern_start; + next(&string_start, string_end); + *string = string_start; + } +} + +// Consumes consecutive wildcard characters (? or *). Returns the maximum number +// of characters matched by the sequence of wildcards, or -1 if the wildcards +// match an arbitrary number of characters (which is the case if it contains at +// least one *). +template <typename CHAR, typename NEXT> +constexpr int EatWildcards(const CHAR** pattern, const CHAR* end, NEXT next) { + int num_question_marks = 0; + bool has_asterisk = false; + while (*pattern != end) { + if (**pattern == '?') { + num_question_marks++; + } else if (**pattern == '*') { + has_asterisk = true; + } else { + break; + } + + next(pattern, end); + } + return has_asterisk ? 
-1 : num_question_marks; +} + +template <typename CHAR, typename NEXT> +constexpr bool MatchPatternT(const CHAR* eval, + const CHAR* eval_end, + const CHAR* pattern, + const CHAR* pattern_end, + NEXT next) { + do { + int maximum_wildcard_length = EatWildcards(&pattern, pattern_end, next); + if (!SearchForChars(&pattern, pattern_end, &eval, eval_end, + maximum_wildcard_length, next)) { + return false; + } + } while (pattern != pattern_end); + return true; +} + +struct NextCharUTF8 { + base_icu::UChar32 operator()(const char** p, const char* end) { + base_icu::UChar32 c; + int offset = 0; + CBU8_NEXT(*p, offset, end - *p, c); + *p += offset; + return c; + } +}; + +struct NextCharUTF16 { + base_icu::UChar32 operator()(const char16** p, const char16* end) { + base_icu::UChar32 c; + int offset = 0; + CBU16_NEXT(*p, offset, end - *p, c); + *p += offset; + return c; + } +}; + +} // namespace + +bool MatchPattern(StringPiece eval, StringPiece pattern) { + return MatchPatternT(eval.data(), eval.data() + eval.size(), pattern.data(), + pattern.data() + pattern.size(), NextCharUTF8()); +} + +bool MatchPattern(StringPiece16 eval, StringPiece16 pattern) { + return MatchPatternT(eval.data(), eval.data() + eval.size(), pattern.data(), + pattern.data() + pattern.size(), NextCharUTF16()); +} + +} // namespace base
diff --git a/base/strings/pattern.h b/base/strings/pattern.h new file mode 100644 index 0000000..3d280d0 --- /dev/null +++ b/base/strings/pattern.h
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_STRINGS_PATTERN_H_
#define BASE_STRINGS_PATTERN_H_

#include "polyfills/base/base_export.h"
#include "base/strings/string_piece.h"

namespace gurl_base {

// Returns true if the whole |string| passed in matches the |pattern|. The
// pattern string can contain the wildcards * and ?:
//   ? matches 0 or 1 character,
//   * matches 0 or more characters.
//
// The backslash character (\) is an escape character for * and ?, i.e. an
// escaped wildcard is matched literally.
//
// The first overload treats both arguments as UTF-8, the second as UTF-16.
BASE_EXPORT bool MatchPattern(StringPiece string, StringPiece pattern);
BASE_EXPORT bool MatchPattern(StringPiece16 string, StringPiece16 pattern);

}  // namespace gurl_base

#endif  // BASE_STRINGS_PATTERN_H_
diff --git a/base/strings/pattern_unittest.cc b/base/strings/pattern_unittest.cc new file mode 100644 index 0000000..540f784 --- /dev/null +++ b/base/strings/pattern_unittest.cc
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/pattern.h"
#include "base/strings/utf_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace gurl_base {

// Exercises MatchPattern() with ASCII, UTF-8 and UTF-16 inputs.
TEST(StringUtilTest, MatchPatternTest) {
  // Basic wildcard behavior, including a case-sensitivity check ("*.COM") and
  // an escaped asterisk ("\\*").
  EXPECT_TRUE(MatchPattern("www.google.com", "*.com"));
  EXPECT_TRUE(MatchPattern("www.google.com", "*"));
  EXPECT_FALSE(MatchPattern("www.google.com", "www*.g*.org"));
  EXPECT_TRUE(MatchPattern("Hello", "H?l?o"));
  EXPECT_FALSE(MatchPattern("www.google.com", "http://*)"));
  EXPECT_FALSE(MatchPattern("www.msn.com", "*.COM"));
  EXPECT_TRUE(MatchPattern("Hello*1234", "He??o\\*1*"));
  // Empty strings and empty patterns.
  EXPECT_FALSE(MatchPattern("", "*.*"));
  EXPECT_TRUE(MatchPattern("", "*"));
  EXPECT_TRUE(MatchPattern("", "?"));
  EXPECT_TRUE(MatchPattern("", ""));
  EXPECT_FALSE(MatchPattern("Hello", ""));
  EXPECT_TRUE(MatchPattern("Hello*", "Hello*"));
  // A run of '?' bounds how many characters may be skipped, unless the run
  // also contains a '*'.
  EXPECT_TRUE(MatchPattern("abcd", "*???"));
  EXPECT_FALSE(MatchPattern("abcd", "???"));
  EXPECT_TRUE(MatchPattern("abcb", "a*b"));
  EXPECT_FALSE(MatchPattern("abcb", "a?b"));

  // Test UTF8 matching: a multi-byte sequence counts as one character.
  EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0", "*\xe2\x99\xa0"));
  EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0.", "heart: ?."));
  EXPECT_TRUE(MatchPattern("hearts: \xe2\x99\xa0\xe2\x99\xa0", "*"));
  // Invalid sequences should be handled as a single invalid character.
  EXPECT_TRUE(MatchPattern("invalid: \xef\xbf\xbe", "invalid: ?"));
  // If the pattern has invalid characters, it shouldn't match anything.
  EXPECT_FALSE(MatchPattern("\xf4\x90\x80\x80", "\xf4\x90\x80\x80"));

  // Test UTF16 character matching.
  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("www.google.com"),
                           UTF8ToUTF16("*.com")));
  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello*1234"),
                           UTF8ToUTF16("He??o\\*1*")));

  // Some test cases that might cause naive implementations to exhibit
  // exponential run time or fail.
  EXPECT_TRUE(MatchPattern("Hello", "He********************************o"));
  EXPECT_TRUE(MatchPattern("123456789012345678", "?????????????????*"));
  EXPECT_TRUE(MatchPattern("aaaaaaaaaaab", "a*a*a*a*a*a*a*a*a*a*a*b"));
}

}  // namespace gurl_base
diff --git a/base/strings/safe_sprintf.cc b/base/strings/safe_sprintf.cc new file mode 100644 index 0000000..ab6e112 --- /dev/null +++ b/base/strings/safe_sprintf.cc
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/safe_sprintf.h"

#include <errno.h>
#include <string.h>

#include <algorithm>
#include <limits>

#include "base/macros.h"
#include "build/build_config.h"

#if !defined(NDEBUG)
// In debug builds, we use RAW_CHECK() to print useful error messages, if
// SafeSPrintf() is called with broken arguments.
// As our contract promises that SafeSPrintf() can be called from any
// restricted run-time context, it is not actually safe to call logging
// functions from it; and we only ever do so for debug builds and hope for the
// best. We should _never_ call any logging function other than RAW_CHECK(),
// and we should _never_ include any logging code that is active in production
// builds. Most notably, we should not include these logging functions in
// unofficial release builds, even though those builds would otherwise have
// DCHECKS() enabled.
// In other words; please do not remove the #ifdef around this #include.
// Instead, in production builds we opt for returning a degraded result,
// whenever an error is encountered.
// E.g. The broken function call
//   SafeSPrintf("errno = %d (%x)", errno, strerror(errno))
// will print something like
//   errno = 13 (%x)
// instead of
//   errno = 13 (Access denied)
// In most of the anticipated use cases, that's probably the preferred
// behavior.
#include "polyfills/base/logging.h"
#define DEBUG_CHECK RAW_CHECK
#else
// In release builds the condition is still evaluated, but its result is
// ignored.
#define DEBUG_CHECK(x) do { if (x) { } } while (0)
#endif

namespace gurl_base {
namespace strings {

// The code in this file is extremely careful to be async-signal-safe.
//
// Most obviously, we avoid calling any code that could dynamically allocate
// memory. Doing so would almost certainly result in bugs and dead-locks.
// We also avoid calling any other STL functions that could have unintended
// side-effects involving memory allocation or access to other shared
// resources.
//
// But on top of that, we also avoid calling other library functions, as many
// of them have the side-effect of calling getenv() (in order to deal with
// localization) or accessing errno. The latter sounds benign, but there are
// several execution contexts where it isn't even possible to safely read let
// alone write errno.
//
// The stated design goal of the SafeSPrintf() function is that it can be
// called from any context that can safely call C or C++ code (i.e. anything
// that doesn't require assembly code).
//
// For a brief overview of some but not all of the issues with async-signal-
// safety, refer to:
// http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html

namespace {
// All bits of a size_t set, shifted right by one: the largest value that an
// ssize_t can hold (verified by the static_assert in Buffer's constructor).
const size_t kSSizeMaxConst = ((size_t)(ssize_t)-1) >> 1;

// Digit lookup tables, indexed by digit value (0..15).
const char kUpCaseHexDigits[] = "0123456789ABCDEF";
const char kDownCaseHexDigits[] = "0123456789abcdef";
}

#if defined(NDEBUG)
// We would like to define kSSizeMax as std::numeric_limits<ssize_t>::max(),
// but C++ doesn't allow us to do that for constants. Instead, we have to
// use careful casting and shifting. We later use a static_assert to
// verify that this worked correctly.
namespace {
const size_t kSSizeMax = kSSizeMaxConst;
}
#else  // defined(NDEBUG)
// For efficiency, we really need kSSizeMax to be a constant. But for unit
// tests, it should be adjustable. This allows us to verify edge cases without
// having to fill the entire available address space. As a compromise, we make
// kSSizeMax adjustable in debug builds, and then only compile that particular
// part of the unit test in debug builds.
namespace {
static size_t kSSizeMax = kSSizeMaxConst;
}

namespace internal {
// Test-only hook: overrides the limit used by all size checks in this file.
void SetSafeSPrintfSSizeMaxForTest(size_t max) {
  kSSizeMax = max;
}

// Test-only hook: returns the limit currently in effect.
size_t GetSafeSPrintfSSizeMaxForTest() {
  return kSSizeMax;
}
}
#endif  // defined(NDEBUG)

namespace {
// Bounded output cursor over a caller-provided character array. It writes
// characters while space remains, keeps counting the characters that did not
// fit, and NUL-terminates the output when it is destroyed.
class Buffer {
 public:
  // |buffer| is caller-allocated storage that SafeSPrintf() writes to. It
  // has |size| bytes of writable storage. It is the caller's responsibility
  // to ensure that the buffer is at least one byte in size, so that it fits
  // the trailing NUL that will be added by the destructor. The buffer also
  // must be smaller or equal to kSSizeMax in size.
  Buffer(char* buffer, size_t size)
      : buffer_(buffer),
        size_(size - 1),  // Account for trailing NUL byte
        count_(0) {
// MSVS2013's standard library doesn't mark max() as constexpr yet. cl.exe
// supports static_cast but doesn't really implement constexpr yet so it doesn't
// complain, but clang does.
#if __cplusplus >= 201103 && !(defined(__clang__) && defined(OS_WIN))
    static_assert(kSSizeMaxConst ==
                      static_cast<size_t>(std::numeric_limits<ssize_t>::max()),
                  "kSSizeMaxConst should be the max value of an ssize_t");
#endif
    DEBUG_CHECK(size > 0);
    DEBUG_CHECK(size <= kSSizeMax);
  }

  ~Buffer() {
    // The code calling the constructor guaranteed that there was enough space
    // to store a trailing NUL -- and in debug builds, we are actually
    // verifying this with DEBUG_CHECK()s in the constructor. So, we can
    // always unconditionally write the NUL byte in the destructor. We do not
    // need to adjust the count_, as SafeSPrintf() copies snprintf() in not
    // including the NUL byte in its return code.
    *GetInsertionPoint() = '\000';
  }

  // Returns true, iff the buffer is filled all the way to |kSSizeMax-1|. The
  // caller can now stop adding more data, as GetCount() has reached its
  // maximum possible value.
  inline bool OutOfAddressableSpace() const {
    return count_ == static_cast<size_t>(kSSizeMax - 1);
  }

  // Returns the number of bytes that would have been emitted to |buffer_|
  // if it was sized sufficiently large. This number can be larger than
  // |size_|, if the caller provided an insufficiently large output buffer.
  // But it will never be bigger than |kSSizeMax-1|.
  inline ssize_t GetCount() const {
    DEBUG_CHECK(count_ < kSSizeMax);
    return static_cast<ssize_t>(count_);
  }

  // Emits one |ch| character into the |buffer_| and updates the |count_| of
  // characters that are currently supposed to be in the buffer.
  // Returns "false", iff the buffer was already full.
  // N.B. |count_| increases even if no characters have been written. This is
  // needed so that GetCount() can return the number of bytes that should
  // have been allocated for the |buffer_|.
  inline bool Out(char ch) {
    if (size_ >= 1 && count_ < size_) {
      buffer_[count_] = ch;
      return IncrementCountByOne();
    }
    // |count_| still needs to be updated, even if the buffer has been
    // filled completely. This allows SafeSPrintf() to return the number of
    // bytes that should have been emitted.
    IncrementCountByOne();
    return false;
  }

  // Inserts |padding|-|len| bytes worth of padding into the |buffer_|.
  // |count_| will also be incremented by the number of bytes that were meant
  // to be emitted. The |pad| character is typically either a ' ' space
  // or a '0' zero, but other non-NUL values are legal.
  // Returns "false", iff the |buffer_| filled up (i.e. |count_|
  // overflowed |size_|) at any time during padding.
  inline bool Pad(char pad, size_t padding, size_t len) {
    DEBUG_CHECK(pad);
    DEBUG_CHECK(padding <= kSSizeMax);
    for (; padding > len; --padding) {
      if (!Out(pad)) {
        // The failed Out() call has already counted one byte. Account for the
        // remaining padding bytes arithmetically instead of looping.
        if (--padding) {
          IncrementCount(padding-len);
        }
        return false;
      }
    }
    return true;
  }

  // POSIX doesn't define any async-signal-safe function for converting
  // an integer to ASCII. Define our own version.
  //
  // This also gives us the ability to make the function a little more
  // powerful and have it deal with |padding|, with truncation, and with
  // predicting the length of the untruncated output.
  //
  // IToASCII() converts an integer |i| to ASCII.
  //
  // Unlike similar functions in the standard C library, it never appends a
  // NUL character. This is left for the caller to do.
  //
  // While the function signature takes a signed int64_t, the code decides at
  // run-time whether to treat the argument as signed (int64_t) or as unsigned
  // (uint64_t) based on the value of |sign|.
  //
  // It supports |base|s 2 through 16. Only a |base| of 10 is allowed to have
  // a |sign|. Otherwise, |i| is treated as unsigned.
  //
  // For bases larger than 10, |upcase| decides whether lower-case or upper-
  // case letters should be used to designate digits greater than 10.
  //
  // Padding can be done with either '0' zeros or ' ' spaces. Padding has to
  // be positive and will always be applied to the left of the output.
  //
  // Prepends a |prefix| to the number (e.g. "0x"). This prefix goes to
  // the left of |padding|, if |pad| is '0'; and to the right of |padding|
  // if |pad| is ' '.
  //
  // Returns "false", if the |buffer_| overflowed at any time.
  bool IToASCII(bool sign, bool upcase, int64_t i, int base,
                char pad, size_t padding, const char* prefix);

 private:
  // Increments |count_| by |inc| unless this would cause |count_| to
  // overflow |kSSizeMax-1|. Returns "false", iff an overflow was detected;
  // it then clamps |count_| to |kSSizeMax-1|.
  inline bool IncrementCount(size_t inc) {
    // "inc" is either 1 or a "padding" value. Padding is clamped at
    // run-time to at most kSSizeMax-1. So, we know that "inc" is always in
    // the range 1..kSSizeMax-1.
    // This allows us to compute "kSSizeMax - 1 - inc" without incurring any
    // integer overflows.
    DEBUG_CHECK(inc <= kSSizeMax - 1);
    if (count_ > kSSizeMax - 1 - inc) {
      count_ = kSSizeMax - 1;
      return false;
    }
    count_ += inc;
    return true;
  }

  // Convenience method for the common case of incrementing |count_| by one.
  inline bool IncrementCountByOne() {
    return IncrementCount(1);
  }

  // Return the current insertion point into the buffer. This is typically
  // at |buffer_| + |count_|, but could be before that if truncation
  // happened. It always points to one byte past the last byte that was
  // successfully placed into the |buffer_|.
  inline char* GetInsertionPoint() const {
    size_t idx = count_;
    if (idx > size_) {
      idx = size_;
    }
    return buffer_ + idx;
  }

  // User-provided buffer that will receive the fully formatted output string.
  char* buffer_;

  // Number of bytes that are available in the buffer excluding the trailing
  // NUL byte that will be added by the destructor.
  const size_t size_;

  // Number of bytes that would have been emitted to the buffer, if the buffer
  // was sufficiently big. This number always excludes the trailing NUL byte
  // and it is guaranteed to never grow bigger than kSSizeMax-1.
  size_t count_;

  DISALLOW_COPY_AND_ASSIGN(Buffer);
};


// See the declaration inside class Buffer for the full contract. The digits
// are produced least-significant first and reversed in place at the end.
bool Buffer::IToASCII(bool sign, bool upcase, int64_t i, int base,
                      char pad, size_t padding, const char* prefix) {
  // Sanity check for parameters. None of these should ever fail, but see
  // above for the rationale why we can't call GURL_CHECK().
  DEBUG_CHECK(base >= 2);
  DEBUG_CHECK(base <= 16);
  DEBUG_CHECK(!sign || base == 10);
  DEBUG_CHECK(pad == '0' || pad == ' ');
  DEBUG_CHECK(padding <= kSSizeMax);
  DEBUG_CHECK(!(sign && prefix && *prefix));

  // Handle negative numbers, if the caller indicated that |i| should be
  // treated as a signed number; otherwise treat |i| as unsigned (even if the
  // MSB is set!)
  // Details are tricky, because of limited data-types, but equivalent pseudo-
  // code would look like:
  //   if (sign && i < 0)
  //     prefix = "-";
  //   num = abs(i);
  int minint = 0;
  uint64_t num;
  if (sign && i < 0) {
    prefix = "-";

    // Turn our number positive.
    if (i == std::numeric_limits<int64_t>::min()) {
      // The most negative integer needs special treatment: negate (i + 1)
      // instead of |i| and add the missing 1 back to the first emitted digit
      // via |minint|.
      minint = 1;
      num = static_cast<uint64_t>(-(i + 1));
    } else {
      // "Normal" negative numbers are easy.
      num = static_cast<uint64_t>(-i);
    }
  } else {
    num = static_cast<uint64_t>(i);
  }

  // If padding with '0' zero, emit the prefix or '-' character now. Otherwise,
  // make the prefix accessible in reverse order, so that we can later output
  // it right between padding and the number.
  // We cannot choose the easier approach of just reversing the number, as that
  // fails in situations where we need to truncate numbers that have padding
  // and/or prefixes.
  const char* reverse_prefix = nullptr;
  if (prefix && *prefix) {
    if (pad == '0') {
      while (*prefix) {
        // Each prefix character uses up one column of the requested width.
        if (padding) {
          --padding;
        }
        Out(*prefix++);
      }
      prefix = nullptr;
    } else {
      // Advance |reverse_prefix| to the terminating NUL of |prefix|.
      for (reverse_prefix = prefix; *reverse_prefix; ++reverse_prefix) {
      }
    }
  } else
    prefix = nullptr;
  // Zero unless the prefix is still pending, i.e. unless |pad| is ' '.
  const size_t prefix_length = reverse_prefix - prefix;

  // Loop until we have converted the entire number. Output at least one
  // character (i.e. '0').
  size_t start = count_;
  size_t discarded = 0;
  bool started = false;
  do {
    // Make sure there is still enough space left in our output buffer.
    if (count_ >= size_) {
      if (start < size_) {
        // It is rare that we need to output a partial number. But if asked
        // to do so, we will still make sure we output the correct number of
        // leading digits.
        // Since we are generating the digits in reverse order, we actually
        // have to discard digits in the order that we have already emitted
        // them. This is essentially equivalent to:
        //   memmove(buffer_ + start, buffer_ + start + 1, size_ - start - 1)
        for (char* move = buffer_ + start, *end = buffer_ + size_ - 1;
             move < end;
             ++move) {
          *move = move[1];
        }
        ++discarded;
        --count_;
      } else if (count_ - size_ > 1) {
        // Need to increment either |count_| or |discarded| to make progress.
        // The latter is more efficient, as it eventually triggers fast
        // handling of padding. But we have to ensure we don't accidentally
        // change the overall state (i.e. switch the state-machine from
        // discarding to non-discarding). |count_| needs to always stay
        // bigger than |size_|.
        --count_;
        ++discarded;
      }
    }

    // Output the next digit and (if necessary) compensate for the most
    // negative integer needing special treatment. This works because,
    // no matter the bit width of the integer, the lowest-most decimal
    // integer always ends in 2, 4, 6, or 8.
    if (!num && started) {
      // All digits are out; what remains is the pending prefix (emitted
      // back to front) followed by padding.
      if (reverse_prefix > prefix) {
        Out(*--reverse_prefix);
      } else {
        Out(pad);
      }
    } else {
      started = true;
      Out((upcase ? kUpCaseHexDigits : kDownCaseHexDigits)[num%base + minint]);
    }

    minint = 0;
    num /= base;

    // Add padding, if requested.
    if (padding > 0) {
      --padding;

      // Performance optimization for when we are asked to output excessive
      // padding, but our output buffer is limited in size. Even if we output
      // a 64bit number in binary, we would never write more than 64 plus
      // prefix non-padding characters. So, once this limit has been passed,
      // any further state change can be computed arithmetically; we know that
      // by this time, our entire final output consists of padding characters
      // that have all already been output.
      if (discarded > 8*sizeof(num) + prefix_length) {
        IncrementCount(padding);
        padding = 0;
      }
    }
  } while (num || padding || (reverse_prefix > prefix));

  // Conversion to ASCII actually resulted in the digits being in reverse
  // order. We can't easily generate them in forward order, as we can't tell
  // the number of characters needed until we are done converting.
  // So, now, we reverse the string (except for the possible '-' sign).
  char* front = buffer_ + start;
  char* back = GetInsertionPoint();
  while (--back > front) {
    char ch = *back;
    *back = *front;
    *front++ = ch;
  }

  // Characters dropped during truncation still count towards the length of
  // the untruncated output.
  IncrementCount(discarded);
  return !discarded;
}

}  // anonymous namespace

namespace internal {

// Formats |fmt| into |buf|, which has room for |sz| bytes including the
// trailing NUL, expanding '%' directives from the first |max_args| entries of
// |args|. Directives whose argument is missing or has the wrong type are
// copied to the output unexpanded.
// Returns -1 if |sz|, interpreted as an ssize_t, is less than 1; otherwise
// returns the number of bytes the untruncated output needs, not counting the
// trailing NUL.
ssize_t SafeSNPrintf(char* buf, size_t sz, const char* fmt, const Arg* args,
                     const size_t max_args) {
  // Make sure that at least one NUL byte can be written, and that the buffer
  // never overflows kSSizeMax. Not only does that use up most or all of the
  // address space, it also would result in a return code that cannot be
  // represented.
  if (static_cast<ssize_t>(sz) < 1)
    return -1;
  sz = std::min(sz, kSSizeMax);

  // Iterate over format string and interpret '%' arguments as they are
  // encountered.
  Buffer buffer(buf, sz);
  size_t padding;
  char pad;
  for (unsigned int cur_arg = 0; *fmt && !buffer.OutOfAddressableSpace(); ) {
    if (*fmt++ == '%') {
      padding = 0;
      pad = ' ';
      char ch = *fmt++;
    format_character_found:
      switch (ch) {
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
        // Found a width parameter. Convert to an integer value and store in
        // "padding". If the leading digit is a zero, change the padding
        // character from a space ' ' to a zero '0'.
        pad = ch == '0' ? '0' : ' ';
        for (;;) {
          // The maximum allowed padding fills all the available address
          // space and leaves just enough space to insert the trailing NUL.
          const size_t max_padding = kSSizeMax - 1;
          if (padding > max_padding/10 ||
              10*padding > max_padding - (ch - '0')) {
            DEBUG_CHECK(padding <= max_padding/10 &&
                        10*padding <= max_padding - (ch - '0'));
            // Integer overflow detected. Skip the rest of the width until
            // we find the format character, then do the normal error handling.
          padding_overflow:
            padding = max_padding;
            while ((ch = *fmt++) >= '0' && ch <= '9') {
            }
            // The skipped directive still consumes its argument, if any.
            if (cur_arg < max_args) {
              ++cur_arg;
            }
            goto fail_to_expand;
          }
          padding = 10*padding + ch - '0';
          if (padding > max_padding) {
            // This doesn't happen for "sane" values of kSSizeMax. But once
            // kSSizeMax gets smaller than about 10, our earlier range checks
            // are incomplete. Unittests do trigger this artificial corner
            // case.
            DEBUG_CHECK(padding <= max_padding);
            goto padding_overflow;
          }
          ch = *fmt++;
          if (ch < '0' || ch > '9') {
            // Reached the end of the width parameter. This is where the format
            // character is found.
            goto format_character_found;
          }
        }
        break;
      case 'c': {  // Output an ASCII character.
        // Check that there are arguments left to be inserted.
        if (cur_arg >= max_args) {
          DEBUG_CHECK(cur_arg < max_args);
          goto fail_to_expand;
        }

        // Check that the argument has the expected type.
        const Arg& arg = args[cur_arg++];
        if (arg.type != Arg::INT && arg.type != Arg::UINT) {
          DEBUG_CHECK(arg.type == Arg::INT || arg.type == Arg::UINT);
          goto fail_to_expand;
        }

        // Apply padding, if needed. Characters are always padded with
        // spaces, regardless of |pad|.
        buffer.Pad(' ', padding, 1);

        // Convert the argument to an ASCII character and output it. A NUL
        // character ends all further output.
        char as_char = static_cast<char>(arg.integer.i);
        if (!as_char) {
          goto end_of_output_buffer;
        }
        buffer.Out(as_char);
        break; }
      case 'd':    // Output a possibly signed decimal value.
      case 'o':    // Output an unsigned octal value.
      case 'x':    // Output an unsigned hexadecimal value.
      case 'X':
      case 'p': {  // Output a pointer value.
        // Check that there are arguments left to be inserted.
        if (cur_arg >= max_args) {
          DEBUG_CHECK(cur_arg < max_args);
          goto fail_to_expand;
        }

        const Arg& arg = args[cur_arg++];
        int64_t i;
        const char* prefix = nullptr;
        if (ch != 'p') {
          // Check that the argument has the expected type.
          if (arg.type != Arg::INT && arg.type != Arg::UINT) {
            DEBUG_CHECK(arg.type == Arg::INT || arg.type == Arg::UINT);
            goto fail_to_expand;
          }
          i = arg.integer.i;

          if (ch != 'd') {
            // The Arg() constructor automatically performed sign expansion on
            // signed parameters. This is great when outputting a %d decimal
            // number, but can result in unexpected leading 0xFF bytes when
            // outputting a %x hexadecimal number. Mask bits, if necessary.
            // We have to do this here, instead of in the Arg() constructor, as
            // the Arg() constructor cannot tell whether we will output a %d
            // or a %x. Only the latter should experience masking.
            if (arg.integer.width < sizeof(int64_t)) {
              i &= (1LL << (8*arg.integer.width)) - 1;
            }
          }
        } else {
          // Pointer values require an actual pointer or a string.
          if (arg.type == Arg::POINTER) {
            i = reinterpret_cast<uintptr_t>(arg.ptr);
          } else if (arg.type == Arg::STRING) {
            i = reinterpret_cast<uintptr_t>(arg.str);
          } else if (arg.type == Arg::INT &&
                     arg.integer.width == sizeof(NULL) &&
                     arg.integer.i == 0) {  // Allow C++'s version of NULL
            i = 0;
          } else {
            DEBUG_CHECK(arg.type == Arg::POINTER || arg.type == Arg::STRING);
            goto fail_to_expand;
          }

          // Pointers always include the "0x" prefix.
          prefix = "0x";
        }

        // Use IToASCII() to convert to ASCII representation. For decimal
        // numbers, optionally print a sign. For hexadecimal numbers,
        // distinguish between upper and lower case. %p addresses are always
        // printed as upcase. Supports base 8, 10, and 16. Prints padding
        // and/or prefixes, if so requested.
        buffer.IToASCII(ch == 'd' && arg.type == Arg::INT,
                        ch != 'x', i,
                        ch == 'o' ? 8 : ch == 'd' ? 10 : 16,
                        pad, padding, prefix);
        break; }
      case 's': {
        // Check that there are arguments left to be inserted.
        if (cur_arg >= max_args) {
          DEBUG_CHECK(cur_arg < max_args);
          goto fail_to_expand;
        }

        // Check that the argument has the expected type.
        const Arg& arg = args[cur_arg++];
        const char *s;
        if (arg.type == Arg::STRING) {
          s = arg.str ? arg.str : "<NULL>";
        } else if (arg.type == Arg::INT && arg.integer.width == sizeof(NULL) &&
                   arg.integer.i == 0) {  // Allow C++'s version of NULL
          s = "<NULL>";
        } else {
          DEBUG_CHECK(arg.type == Arg::STRING);
          goto fail_to_expand;
        }

        // Apply padding, if needed. This requires us to first check the
        // length of the string that we are outputting.
        if (padding) {
          size_t len = 0;
          for (const char* src = s; *src++; ) {
            ++len;
          }
          buffer.Pad(' ', padding, len);
        }

        // Printing a string involves nothing more than copying it into the
        // output buffer and making sure we don't output more bytes than
        // available space; Out() takes care of doing that.
        for (const char* src = s; *src; ) {
          buffer.Out(*src++);
        }
        break; }
      case '%':
        // Quoted percent '%' character.
        goto copy_verbatim;
      fail_to_expand:
        // C++ gives us tools to do type checking -- something that snprintf()
        // could never really do. So, whenever we see arguments that don't
        // match up with the format string, we refuse to output them. But
        // since we have to be extremely conservative about being async-
        // signal-safe, we are limited in the type of error handling that we
        // can do in production builds (in debug builds we can use
        // DEBUG_CHECK() and hope for the best). So, all we do is pass the
        // format string unchanged. That should eventually get the user's
        // attention; and in the meantime, it hopefully doesn't lose too much
        // data.
      default:
        // Unknown or unsupported format character. Just copy verbatim to
        // output.
        buffer.Out('%');
        DEBUG_CHECK(ch);
        if (!ch) {
          // The format string ended right after the '%'.
          goto end_of_format_string;
        }
        buffer.Out(ch);
        break;
      }
    } else {
  copy_verbatim:
    // |fmt| has already been advanced, so fmt[-1] is the character that was
    // just examined.
    buffer.Out(fmt[-1]);
    }
  }
 end_of_format_string:
 end_of_output_buffer:
  return buffer.GetCount();
}

}  // namespace internal

// Overload for format strings without arguments: copies |fmt| into |buf|,
// collapsing each "%%" into a single '%'. Returns -1 if |sz|, interpreted as
// an ssize_t, is less than 1; otherwise returns the number of bytes the
// untruncated output needs, not counting the trailing NUL.
ssize_t SafeSNPrintf(char* buf, size_t sz, const char* fmt) {
  // Make sure that at least one NUL byte can be written, and that the buffer
  // never overflows kSSizeMax. Not only does that use up most or all of the
  // address space, it also would result in a return code that cannot be
  // represented.
  if (static_cast<ssize_t>(sz) < 1)
    return -1;
  sz = std::min(sz, kSSizeMax);

  Buffer buffer(buf, sz);

  // In the slow-path, we deal with errors by copying the contents of
  // "fmt" unexpanded. This means, if there are no arguments passed, the
  // SafeSPrintf() function always degenerates to a version of strncpy() that
  // de-duplicates '%' characters.
  const char* src = fmt;
  for (; *src; ++src) {
    buffer.Out(*src);
    // A lone '%' is a caller error (there are no arguments to expand); it is
    // flagged in debug builds and copied through unchanged otherwise.
    DEBUG_CHECK(src[0] != '%' || src[1] == '%');
    if (src[0] == '%' && src[1] == '%') {
      ++src;
    }
  }
  return buffer.GetCount();
}

}  // namespace strings
}  // namespace gurl_base
diff --git a/base/strings/safe_sprintf.h b/base/strings/safe_sprintf.h new file mode 100644 index 0000000..92f8c59 --- /dev/null +++ b/base/strings/safe_sprintf.h
@@ -0,0 +1,246 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_SAFE_SPRINTF_H_ +#define BASE_STRINGS_SAFE_SPRINTF_H_ + +#include "build/build_config.h" + +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> + +#if defined(OS_POSIX) || defined(OS_FUCHSIA) +// For ssize_t +#include <unistd.h> +#endif + +#include "polyfills/base/base_export.h" + +namespace gurl_base { +namespace strings { + +#if defined(COMPILER_MSVC) +// Define ssize_t inside of our namespace. +#if defined(_WIN64) +typedef __int64 ssize_t; +#else +typedef long ssize_t; +#endif +#endif + +// SafeSPrintf() is a type-safe and completely self-contained version of +// snprintf(). +// +// SafeSNPrintf() is an alternative function signature that can be used when +// not dealing with fixed-sized buffers. When possible, SafeSPrintf() should +// always be used instead of SafeSNPrintf() +// +// These functions allow for formatting complicated messages from contexts that +// require strict async-signal-safety. In fact, it is safe to call them from +// any low-level execution context, as they are guaranteed to make no library +// or system calls. It deliberately never touches "errno", either. +// +// The only exception to this rule is that in debug builds the code calls +// RAW_CHECK() to help diagnose problems when the format string does not +// match the rest of the arguments. In release builds, no GURL_CHECK()s are used, +// and SafeSPrintf() instead returns an output string that expands only +// those arguments that match their format characters. Mismatched arguments +// are ignored. +// +// The code currently only supports a subset of format characters: +// %c, %o, %d, %x, %X, %p, and %s. +// +// SafeSPrintf() aims to be as liberal as reasonably possible. 
Integer-like +// values of arbitrary width can be passed to all of the format characters +// that expect integers. Thus, it is explicitly legal to pass an "int" to +// "%c", and output will automatically look at the LSB only. It is also +// explicitly legal to pass either signed or unsigned values, and the format +// characters will automatically interpret the arguments accordingly. +// +// It is still not legal to mix-and-match integer-like values with pointer +// values. For instance, you cannot pass a pointer to %x, nor can you pass an +// integer to %p. +// +// The one exception is "0" zero being accepted by "%p". This works around +// the problem of C++ defining NULL as an integer-like value. +// +// All format characters take an optional width parameter. This must be a +// positive integer. For %d, %o, %x, %X and %p, if the width starts with +// a leading '0', padding is done with '0' instead of ' ' characters. +// +// There are a few features of snprintf()-style format strings that +// SafeSPrintf() does not support at this time. +// +// If an actual user showed up, there is no particularly strong reason they +// couldn't be added. But that assumes that the trade-offs between complexity +// and utility are favorable. +// +// For example, adding support for negative padding widths, and for %n are all +// likely to be viewed positively. They are all clearly useful, low-risk, easy +// to test, don't jeopardize the async-signal-safety of the code, and overall +// have little impact on other parts of the SafeSPrintf() function. +// +// On the other hand, support for alternate forms, positional +// arguments, grouping, wide characters, localization or floating point numbers +// is unlikely to ever be added. +// +// SafeSPrintf() and SafeSNPrintf() mimic the behavior of snprintf() and they +// return the number of bytes needed to store the untruncated output. This +// does *not* include the terminating NUL byte.
+// +// They return -1 iff a fatal error happened. This typically can only happen +// if the buffer size is a) negative, or b) zero (i.e. not even the NUL byte +// can be written). The return value can never be larger than SSIZE_MAX-1. +// This ensures that the caller can always add one to the signed return code +// in order to determine the amount of storage that needs to be allocated. +// +// While the code supports type checking and while it is generally very careful +// to avoid printing incorrect values, it tends to be conservative in printing +// as much as possible, even when given incorrect parameters. Typically, in +// case of an error, the format string will not be expanded (i.e. something +// like SafeSPrintf(buf, "%p %d", 1, 2) results in "%p 2"). See above for +// the use of RAW_CHECK() in debug builds, though. +// +// Basic example: +// char buf[20]; +// gurl_base::strings::SafeSPrintf(buf, "The answer: %2d", 42); +// +// Example with dynamically sized buffer (async-signal-safe). This code won't +// work on Visual Studio, as it requires dynamically allocating arrays on the +// stack. Consider picking a smaller value for |kMaxSize| if stack size is +// limited and known. On the other hand, if the parameters to SafeSNPrintf() +// are trusted and not controllable by the user, you can consider eliminating +// the check for |kMaxSize| altogether.
The current value of SSIZE_MAX is +// essentially a no-op that just illustrates how to implement an upper bound: +// const size_t kInitialSize = 128; +// const size_t kMaxSize = std::numeric_limits<ssize_t>::max(); +// size_t size = kInitialSize; +// for (;;) { +// char buf[size]; +// size = SafeSNPrintf(buf, size, "Error message \"%s\"\n", err) + 1; +// if (sizeof(buf) < kMaxSize && size > kMaxSize) { +// size = kMaxSize; +// continue; +// } else if (size > sizeof(buf)) +// continue; +// write(2, buf, size-1); +// break; +// } + +namespace internal { +// Helpers that use C++ overloading, templates, and specializations to deduce +// and record type information from function arguments. This allows us to +// later write a type-safe version of snprintf(). + +struct Arg { + enum Type { INT, UINT, STRING, POINTER }; + + // Any integer-like value. + // Each of these constructors stores the value in integer.i and records the + // size of the corresponding integer type in integer.width. Signed types are + // tagged INT, unsigned types UINT. + Arg(signed char c) : type(INT) { + integer.i = c; + integer.width = sizeof(char); + } + Arg(unsigned char c) : type(UINT) { + integer.i = c; + integer.width = sizeof(char); + } + Arg(signed short j) : type(INT) { + integer.i = j; + integer.width = sizeof(short); + } + Arg(unsigned short j) : type(UINT) { + integer.i = j; + integer.width = sizeof(short); + } + Arg(signed int j) : type(INT) { + integer.i = j; + integer.width = sizeof(int); + } + Arg(unsigned int j) : type(UINT) { + integer.i = j; + integer.width = sizeof(int); + } + Arg(signed long j) : type(INT) { + integer.i = j; + integer.width = sizeof(long); + } + Arg(unsigned long j) : type(UINT) { + integer.i = j; + integer.width = sizeof(long); + } + Arg(signed long long j) : type(INT) { + integer.i = j; + integer.width = sizeof(long long); + } + Arg(unsigned long long j) : type(UINT) { + integer.i = j; + integer.width = sizeof(long long); + } + + // A C-style text string. + Arg(const char* s) : str(s), type(STRING) { } + Arg(char* s) : str(s), type(STRING) { } + + // Any pointer value that can be cast to a "void*".
+ template<class T> Arg(T* p) : ptr((void*)p), type(POINTER) { } + + union { + // An integer-like value. + struct { + int64_t i; + unsigned char width; + } integer; + + // A C-style text string. + const char* str; + + // A pointer to an arbitrary object. + const void* ptr; + }; + // Tag set by the constructor; tells which member of the union above holds + // the value. + const enum Type type; +}; + +// This is the internal function that performs the actual formatting of +// an snprintf()-style format string. |args| points to |max_args| recorded +// arguments. +BASE_EXPORT ssize_t SafeSNPrintf(char* buf, size_t sz, const char* fmt, + const Arg* args, size_t max_args); + +#if !defined(NDEBUG) +// In debug builds, allow unit tests to artificially lower the kSSizeMax +// constant that is used as a hard upper-bound for all buffers. In normal +// use, this constant should always be std::numeric_limits<ssize_t>::max(). +BASE_EXPORT void SetSafeSPrintfSSizeMaxForTest(size_t max); +BASE_EXPORT size_t GetSafeSPrintfSSizeMaxForTest(); +#endif + +} // namespace internal + +// Formats |fmt| into |buf| (of size |N|) using the given arguments; forwards to +// internal::SafeSNPrintf() and returns its result. +template<typename... Args> +ssize_t SafeSNPrintf(char* buf, size_t N, const char* fmt, Args... args) { + // Use Arg() object to record type information and then copy arguments to an + // array to make it easier to iterate over them. + const internal::Arg arg_array[] = { args... }; + return internal::SafeSNPrintf(buf, N, fmt, arg_array, sizeof...(args)); +} + +// Same as SafeSNPrintf() above, but deduces the buffer size |N| from the array +// type of |buf|. +template<size_t N, typename... Args> +ssize_t SafeSPrintf(char (&buf)[N], const char* fmt, Args... args) { + // Use Arg() object to record type information and then copy arguments to an + // array to make it easier to iterate over them. + const internal::Arg arg_array[] = { args... }; + return internal::SafeSNPrintf(buf, N, fmt, arg_array, sizeof...(args)); +} + +// Fast-path when we don't actually need to substitute any arguments.
+BASE_EXPORT ssize_t SafeSNPrintf(char* buf, size_t N, const char* fmt); +template<size_t N> +inline ssize_t SafeSPrintf(char (&buf)[N], const char* fmt) { + return SafeSNPrintf(buf, N, fmt); +} + +} // namespace strings +} // namespace gurl_base + +#endif // BASE_STRINGS_SAFE_SPRINTF_H_
diff --git a/base/strings/safe_sprintf_unittest.cc b/base/strings/safe_sprintf_unittest.cc new file mode 100644 index 0000000..b7a67fa --- /dev/null +++ b/base/strings/safe_sprintf_unittest.cc
@@ -0,0 +1,765 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/safe_sprintf.h" + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include <limits> +#include <memory> + +#include "polyfills/base/logging.h" +#include "base/macros.h" +#include "build/build_config.h" +#include "testing/gtest/include/gtest/gtest.h" + +// Death tests on Android are currently very flaky. No need to add more flaky +// tests, as they just make it hard to spot real problems. +// TODO(markus): See if the restrictions on Android can eventually be lifted. +#if defined(GTEST_HAS_DEATH_TEST) && !defined(OS_ANDROID) +#define ALLOW_DEATH_TEST +#endif + +namespace gurl_base { +namespace strings { + +TEST(SafeSPrintfTest, Empty) { + char buf[2] = { 'X', 'X' }; + + // Negative buffer size should always result in an error. + EXPECT_EQ(-1, SafeSNPrintf(buf, static_cast<size_t>(-1), "")); + EXPECT_EQ('X', buf[0]); + EXPECT_EQ('X', buf[1]); + + // Zero buffer size should always result in an error. + EXPECT_EQ(-1, SafeSNPrintf(buf, 0, "")); + EXPECT_EQ('X', buf[0]); + EXPECT_EQ('X', buf[1]); + + // A one-byte buffer should always print a single NUL byte. + EXPECT_EQ(0, SafeSNPrintf(buf, 1, "")); + EXPECT_EQ(0, buf[0]); + EXPECT_EQ('X', buf[1]); + buf[0] = 'X'; + + // A larger buffer should leave the trailing bytes unchanged. + EXPECT_EQ(0, SafeSNPrintf(buf, 2, "")); + EXPECT_EQ(0, buf[0]); + EXPECT_EQ('X', buf[1]); + buf[0] = 'X'; + + // The same test using SafeSPrintf() instead of SafeSNPrintf(). + EXPECT_EQ(0, SafeSPrintf(buf, "")); + EXPECT_EQ(0, buf[0]); + EXPECT_EQ('X', buf[1]); + buf[0] = 'X'; +} + +TEST(SafeSPrintfTest, NoArguments) { + // Output a text message that doesn't require any substitutions. 
This + // is roughly equivalent to calling strncpy() (but unlike strncpy(), it does + // always add a trailing NUL; it always deduplicates '%' characters). + static const char text[] = "hello world"; + char ref[20], buf[20]; + memset(ref, 'X', sizeof(ref)); + memcpy(buf, ref, sizeof(buf)); + + // A negative buffer size should always result in an error. + EXPECT_EQ(-1, SafeSNPrintf(buf, static_cast<size_t>(-1), text)); + EXPECT_TRUE(!memcmp(buf, ref, sizeof(buf))); + + // Zero buffer size should always result in an error. + EXPECT_EQ(-1, SafeSNPrintf(buf, 0, text)); + EXPECT_TRUE(!memcmp(buf, ref, sizeof(buf))); + + // A one-byte buffer should always print a single NUL byte. + EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, SafeSNPrintf(buf, 1, text)); + EXPECT_EQ(0, buf[0]); + EXPECT_TRUE(!memcmp(buf+1, ref+1, sizeof(buf)-1)); + memcpy(buf, ref, sizeof(buf)); + + // A larger (but limited) buffer should always leave the trailing bytes + // unchanged. + EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, SafeSNPrintf(buf, 2, text)); + EXPECT_EQ(text[0], buf[0]); + EXPECT_EQ(0, buf[1]); + EXPECT_TRUE(!memcmp(buf+2, ref+2, sizeof(buf)-2)); + memcpy(buf, ref, sizeof(buf)); + + // A unrestricted buffer length should always leave the trailing bytes + // unchanged. + EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, + SafeSNPrintf(buf, sizeof(buf), text)); + EXPECT_EQ(std::string(text), std::string(buf)); + EXPECT_TRUE(!memcmp(buf + sizeof(text), ref + sizeof(text), + sizeof(buf) - sizeof(text))); + memcpy(buf, ref, sizeof(buf)); + + // The same test using SafeSPrintf() instead of SafeSNPrintf(). + EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, SafeSPrintf(buf, text)); + EXPECT_EQ(std::string(text), std::string(buf)); + EXPECT_TRUE(!memcmp(buf + sizeof(text), ref + sizeof(text), + sizeof(buf) - sizeof(text))); + memcpy(buf, ref, sizeof(buf)); + + // Check for deduplication of '%' percent characters. 
+ EXPECT_EQ(1, SafeSPrintf(buf, "%%")); + EXPECT_EQ(2, SafeSPrintf(buf, "%%%%")); + EXPECT_EQ(2, SafeSPrintf(buf, "%%X")); + EXPECT_EQ(3, SafeSPrintf(buf, "%%%%X")); +#if defined(NDEBUG) + EXPECT_EQ(1, SafeSPrintf(buf, "%")); + EXPECT_EQ(2, SafeSPrintf(buf, "%%%")); + EXPECT_EQ(2, SafeSPrintf(buf, "%X")); + EXPECT_EQ(3, SafeSPrintf(buf, "%%%X")); +#elif defined(ALLOW_DEATH_TEST) + EXPECT_DEATH(SafeSPrintf(buf, "%"), "src.1. == '%'"); + EXPECT_DEATH(SafeSPrintf(buf, "%%%"), "src.1. == '%'"); + EXPECT_DEATH(SafeSPrintf(buf, "%X"), "src.1. == '%'"); + EXPECT_DEATH(SafeSPrintf(buf, "%%%X"), "src.1. == '%'"); +#endif +} + +TEST(SafeSPrintfTest, OneArgument) { + // Test basic single-argument single-character substitution. + const char text[] = "hello world"; + const char fmt[] = "hello%cworld"; + char ref[20], buf[20]; + memset(ref, 'X', sizeof(buf)); + memcpy(buf, ref, sizeof(buf)); + + // A negative buffer size should always result in an error. + EXPECT_EQ(-1, SafeSNPrintf(buf, static_cast<size_t>(-1), fmt, ' ')); + EXPECT_TRUE(!memcmp(buf, ref, sizeof(buf))); + + // Zero buffer size should always result in an error. + EXPECT_EQ(-1, SafeSNPrintf(buf, 0, fmt, ' ')); + EXPECT_TRUE(!memcmp(buf, ref, sizeof(buf))); + + // A one-byte buffer should always print a single NUL byte. + EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, + SafeSNPrintf(buf, 1, fmt, ' ')); + EXPECT_EQ(0, buf[0]); + EXPECT_TRUE(!memcmp(buf+1, ref+1, sizeof(buf)-1)); + memcpy(buf, ref, sizeof(buf)); + + // A larger (but limited) buffer should always leave the trailing bytes + // unchanged. + EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, + SafeSNPrintf(buf, 2, fmt, ' ')); + EXPECT_EQ(text[0], buf[0]); + EXPECT_EQ(0, buf[1]); + EXPECT_TRUE(!memcmp(buf+2, ref+2, sizeof(buf)-2)); + memcpy(buf, ref, sizeof(buf)); + + // A unrestricted buffer length should always leave the trailing bytes + // unchanged. 
+ EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, + SafeSNPrintf(buf, sizeof(buf), fmt, ' ')); + EXPECT_EQ(std::string(text), std::string(buf)); + EXPECT_TRUE(!memcmp(buf + sizeof(text), ref + sizeof(text), + sizeof(buf) - sizeof(text))); + memcpy(buf, ref, sizeof(buf)); + + // The same test using SafeSPrintf() instead of SafeSNPrintf(). + EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, SafeSPrintf(buf, fmt, ' ')); + EXPECT_EQ(std::string(text), std::string(buf)); + EXPECT_TRUE(!memcmp(buf + sizeof(text), ref + sizeof(text), + sizeof(buf) - sizeof(text))); + memcpy(buf, ref, sizeof(buf)); + + // Check for deduplication of '%' percent characters. + EXPECT_EQ(1, SafeSPrintf(buf, "%%", 0)); + EXPECT_EQ(2, SafeSPrintf(buf, "%%%%", 0)); + EXPECT_EQ(2, SafeSPrintf(buf, "%Y", 0)); + EXPECT_EQ(2, SafeSPrintf(buf, "%%Y", 0)); + EXPECT_EQ(3, SafeSPrintf(buf, "%%%Y", 0)); + EXPECT_EQ(3, SafeSPrintf(buf, "%%%%Y", 0)); +#if defined(NDEBUG) + EXPECT_EQ(1, SafeSPrintf(buf, "%", 0)); + EXPECT_EQ(2, SafeSPrintf(buf, "%%%", 0)); +#elif defined(ALLOW_DEATH_TEST) + EXPECT_DEATH(SafeSPrintf(buf, "%", 0), "ch"); + EXPECT_DEATH(SafeSPrintf(buf, "%%%", 0), "ch"); +#endif +} + +TEST(SafeSPrintfTest, MissingArg) { +#if defined(NDEBUG) + char buf[20]; + EXPECT_EQ(3, SafeSPrintf(buf, "%c%c", 'A')); + EXPECT_EQ("A%c", std::string(buf)); +#elif defined(ALLOW_DEATH_TEST) + char buf[20]; + EXPECT_DEATH(SafeSPrintf(buf, "%c%c", 'A'), "cur_arg < max_args"); +#endif +} + +TEST(SafeSPrintfTest, ASANFriendlyBufferTest) { + // Print into a buffer that is sized exactly to size. ASAN can verify that + // nobody attempts to write past the end of the buffer. + // There is a more complicated test in PrintLongString() that covers a lot + // more edge case, but it is also harder to debug in case of a failure. 
+ const char kTestString[] = "This is a test"; + std::unique_ptr<char[]> buf(new char[sizeof(kTestString)]); + EXPECT_EQ(static_cast<ssize_t>(sizeof(kTestString) - 1), + SafeSNPrintf(buf.get(), sizeof(kTestString), kTestString)); + EXPECT_EQ(std::string(kTestString), std::string(buf.get())); + EXPECT_EQ(static_cast<ssize_t>(sizeof(kTestString) - 1), + SafeSNPrintf(buf.get(), sizeof(kTestString), "%s", kTestString)); + EXPECT_EQ(std::string(kTestString), std::string(buf.get())); +} + +TEST(SafeSPrintfTest, NArgs) { + // Pre-C++11 compilers have a different code path, that can only print + // up to ten distinct arguments. + // We test both SafeSPrintf() and SafeSNPrintf(). This makes sure we don't + // have typos in the copy-n-pasted code that is needed to deal with various + // numbers of arguments. + char buf[12]; + EXPECT_EQ(1, SafeSPrintf(buf, "%c", 1)); + EXPECT_EQ("\1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%c%c", 1, 2)); + EXPECT_EQ("\1\2", std::string(buf)); + EXPECT_EQ(3, SafeSPrintf(buf, "%c%c%c", 1, 2, 3)); + EXPECT_EQ("\1\2\3", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%c%c%c%c", 1, 2, 3, 4)); + EXPECT_EQ("\1\2\3\4", std::string(buf)); + EXPECT_EQ(5, SafeSPrintf(buf, "%c%c%c%c%c", 1, 2, 3, 4, 5)); + EXPECT_EQ("\1\2\3\4\5", std::string(buf)); + EXPECT_EQ(6, SafeSPrintf(buf, "%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6)); + EXPECT_EQ("\1\2\3\4\5\6", std::string(buf)); + EXPECT_EQ(7, SafeSPrintf(buf, "%c%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6, 7)); + EXPECT_EQ("\1\2\3\4\5\6\7", std::string(buf)); + EXPECT_EQ(8, SafeSPrintf(buf, "%c%c%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6, 7, 8)); + EXPECT_EQ("\1\2\3\4\5\6\7\10", std::string(buf)); + EXPECT_EQ(9, SafeSPrintf(buf, "%c%c%c%c%c%c%c%c%c", + 1, 2, 3, 4, 5, 6, 7, 8, 9)); + EXPECT_EQ("\1\2\3\4\5\6\7\10\11", std::string(buf)); + EXPECT_EQ(10, SafeSPrintf(buf, "%c%c%c%c%c%c%c%c%c%c", + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)); + + // Repeat all the tests with SafeSNPrintf() instead of SafeSPrintf(). 
+ EXPECT_EQ("\1\2\3\4\5\6\7\10\11\12", std::string(buf)); + EXPECT_EQ(1, SafeSNPrintf(buf, 11, "%c", 1)); + EXPECT_EQ("\1", std::string(buf)); + EXPECT_EQ(2, SafeSNPrintf(buf, 11, "%c%c", 1, 2)); + EXPECT_EQ("\1\2", std::string(buf)); + EXPECT_EQ(3, SafeSNPrintf(buf, 11, "%c%c%c", 1, 2, 3)); + EXPECT_EQ("\1\2\3", std::string(buf)); + EXPECT_EQ(4, SafeSNPrintf(buf, 11, "%c%c%c%c", 1, 2, 3, 4)); + EXPECT_EQ("\1\2\3\4", std::string(buf)); + EXPECT_EQ(5, SafeSNPrintf(buf, 11, "%c%c%c%c%c", 1, 2, 3, 4, 5)); + EXPECT_EQ("\1\2\3\4\5", std::string(buf)); + EXPECT_EQ(6, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6)); + EXPECT_EQ("\1\2\3\4\5\6", std::string(buf)); + EXPECT_EQ(7, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6, 7)); + EXPECT_EQ("\1\2\3\4\5\6\7", std::string(buf)); + EXPECT_EQ(8, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c%c%c", + 1, 2, 3, 4, 5, 6, 7, 8)); + EXPECT_EQ("\1\2\3\4\5\6\7\10", std::string(buf)); + EXPECT_EQ(9, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c%c%c%c", + 1, 2, 3, 4, 5, 6, 7, 8, 9)); + EXPECT_EQ("\1\2\3\4\5\6\7\10\11", std::string(buf)); + EXPECT_EQ(10, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c%c%c%c%c", + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)); + EXPECT_EQ("\1\2\3\4\5\6\7\10\11\12", std::string(buf)); + + EXPECT_EQ(11, SafeSPrintf(buf, "%c%c%c%c%c%c%c%c%c%c%c", + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)); + EXPECT_EQ("\1\2\3\4\5\6\7\10\11\12\13", std::string(buf)); + EXPECT_EQ(11, SafeSNPrintf(buf, 12, "%c%c%c%c%c%c%c%c%c%c%c", + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)); + EXPECT_EQ("\1\2\3\4\5\6\7\10\11\12\13", std::string(buf)); +} + +TEST(SafeSPrintfTest, DataTypes) { + char buf[40]; + + // Bytes + EXPECT_EQ(1, SafeSPrintf(buf, "%d", (uint8_t)1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(3, SafeSPrintf(buf, "%d", (uint8_t)-1)); + EXPECT_EQ("255", std::string(buf)); + EXPECT_EQ(1, SafeSPrintf(buf, "%d", (int8_t)1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%d", (int8_t)-1)); + EXPECT_EQ("-1", std::string(buf)); + 
EXPECT_EQ(4, SafeSPrintf(buf, "%d", (int8_t)-128)); + EXPECT_EQ("-128", std::string(buf)); + + // Half-words + EXPECT_EQ(1, SafeSPrintf(buf, "%d", (uint16_t)1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(5, SafeSPrintf(buf, "%d", (uint16_t)-1)); + EXPECT_EQ("65535", std::string(buf)); + EXPECT_EQ(1, SafeSPrintf(buf, "%d", (int16_t)1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%d", (int16_t)-1)); + EXPECT_EQ("-1", std::string(buf)); + EXPECT_EQ(6, SafeSPrintf(buf, "%d", (int16_t)-32768)); + EXPECT_EQ("-32768", std::string(buf)); + + // Words + EXPECT_EQ(1, SafeSPrintf(buf, "%d", (uint32_t)1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(10, SafeSPrintf(buf, "%d", (uint32_t)-1)); + EXPECT_EQ("4294967295", std::string(buf)); + EXPECT_EQ(1, SafeSPrintf(buf, "%d", (int32_t)1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%d", (int32_t)-1)); + EXPECT_EQ("-1", std::string(buf)); + // Work-around for an limitation of C90 + EXPECT_EQ(11, SafeSPrintf(buf, "%d", (int32_t)-2147483647-1)); + EXPECT_EQ("-2147483648", std::string(buf)); + + // Quads + EXPECT_EQ(1, SafeSPrintf(buf, "%d", (uint64_t)1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(20, SafeSPrintf(buf, "%d", (uint64_t)-1)); + EXPECT_EQ("18446744073709551615", std::string(buf)); + EXPECT_EQ(1, SafeSPrintf(buf, "%d", (int64_t)1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%d", (int64_t)-1)); + EXPECT_EQ("-1", std::string(buf)); + // Work-around for an limitation of C90 + EXPECT_EQ(20, SafeSPrintf(buf, "%d", (int64_t)-9223372036854775807LL-1)); + EXPECT_EQ("-9223372036854775808", std::string(buf)); + + // Strings (both const and mutable). 
+ EXPECT_EQ(4, SafeSPrintf(buf, "test")); + EXPECT_EQ("test", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, buf)); + EXPECT_EQ("test", std::string(buf)); + + // Pointer + char addr[20]; + sprintf(addr, "0x%llX", (unsigned long long)(uintptr_t)buf); + SafeSPrintf(buf, "%p", buf); + EXPECT_EQ(std::string(addr), std::string(buf)); + SafeSPrintf(buf, "%p", (const char *)buf); + EXPECT_EQ(std::string(addr), std::string(buf)); + sprintf(addr, "0x%llX", (unsigned long long)(uintptr_t)sprintf); + SafeSPrintf(buf, "%p", sprintf); + EXPECT_EQ(std::string(addr), std::string(buf)); + + // Padding for pointers is a little more complicated because of the "0x" + // prefix. Padding with '0' zeros is relatively straight-forward, but + // padding with ' ' spaces requires more effort. + sprintf(addr, "0x%017llX", (unsigned long long)(uintptr_t)buf); + SafeSPrintf(buf, "%019p", buf); + EXPECT_EQ(std::string(addr), std::string(buf)); + sprintf(addr, "0x%llX", (unsigned long long)(uintptr_t)buf); + memset(addr, ' ', + (char*)memmove(addr + sizeof(addr) - strlen(addr) - 1, + addr, strlen(addr)+1) - addr); + SafeSPrintf(buf, "%19p", buf); + EXPECT_EQ(std::string(addr), std::string(buf)); +} + +namespace { +void PrintLongString(char* buf, size_t sz) { + // Output a reasonably complex expression into a limited-size buffer. + // At least one byte is available for writing the NUL character. + GURL_CHECK_GT(sz, static_cast<size_t>(0)); + + // Allocate slightly more space, so that we can verify that SafeSPrintf() + // never writes past the end of the buffer. + std::unique_ptr<char[]> tmp(new char[sz + 2]); + memset(tmp.get(), 'X', sz+2); + + // Use SafeSPrintf() to output a complex list of arguments: + // - test padding and truncating %c single characters. + // - test truncating %s simple strings. + // - test mismatching arguments and truncating (for %d != %s). + // - test zero-padding and truncating %x hexadecimal numbers. + // - test outputting and truncating %d MININT. 
+ // - test outputting and truncating %p arbitrary pointer values. + // - test outputting, padding and truncating NULL-pointer %s strings. + char* out = tmp.get(); + size_t out_sz = sz; + size_t len; + for (std::unique_ptr<char[]> perfect_buf;;) { + size_t needed = + SafeSNPrintf(out, out_sz, +#if defined(NDEBUG) + "A%2cong %s: %d %010X %d %p%7s", 'l', "string", "", +#else + "A%2cong %s: %%d %010X %d %p%7s", 'l', "string", +#endif + 0xDEADBEEF, std::numeric_limits<intptr_t>::min(), + PrintLongString, static_cast<char*>(nullptr)) + + 1; + + // Various sanity checks: + // The numbered of characters needed to print the full string should always + // be bigger or equal to the bytes that have actually been output. + len = strlen(tmp.get()); + GURL_CHECK_GE(needed, len+1); + + // The number of characters output should always fit into the buffer that + // was passed into SafeSPrintf(). + GURL_CHECK_LT(len, out_sz); + + // The output is always terminated with a NUL byte (actually, this test is + // always going to pass, as strlen() already verified this) + EXPECT_FALSE(tmp[len]); + + // ASAN can check that we are not overwriting buffers, iff we make sure the + // buffer is exactly the size that we are expecting to be written. After + // running SafeSNPrintf() the first time, it is possible to compute the + // correct buffer size for this test. So, allocate a second buffer and run + // the exact same SafeSNPrintf() command again. + if (!perfect_buf.get()) { + out_sz = std::min(needed, sz); + out = new char[out_sz]; + perfect_buf.reset(out); + } else { + break; + } + } + + // All trailing bytes are unchanged. + for (size_t i = len+1; i < sz+2; ++i) + EXPECT_EQ('X', tmp[i]); + + // The text that was generated by SafeSPrintf() should always match the + // equivalent text generated by sprintf(). Please note that the format + // string for sprintf() is not complicated, as it does not have the + // benefit of getting type information from the C++ compiler. 
+ // + // N.B.: It would be so much cleaner to use snprintf(). But unfortunately, + // Visual Studio doesn't support this function, and the work-arounds + // are all really awkward. + char ref[256]; + GURL_CHECK_LE(sz, sizeof(ref)); + sprintf(ref, "A long string: %%d 00DEADBEEF %lld 0x%llX <NULL>", + static_cast<long long>(std::numeric_limits<intptr_t>::min()), + static_cast<unsigned long long>( + reinterpret_cast<uintptr_t>(PrintLongString))); + ref[sz-1] = '\000'; + +#if defined(NDEBUG) + const size_t kSSizeMax = std::numeric_limits<ssize_t>::max(); +#else + const size_t kSSizeMax = internal::GetSafeSPrintfSSizeMaxForTest(); +#endif + + // Compare the output from SafeSPrintf() to the one from sprintf(). + EXPECT_EQ(std::string(ref).substr(0, kSSizeMax-1), std::string(tmp.get())); + + // We allocated a slightly larger buffer, so that we could perform some + // extra sanity checks. Now that the tests have all passed, we copy the + // data to the output buffer that the caller provided. + memcpy(buf, tmp.get(), len+1); +} + +#if !defined(NDEBUG) +class ScopedSafeSPrintfSSizeMaxSetter { + public: + ScopedSafeSPrintfSSizeMaxSetter(size_t sz) { + old_ssize_max_ = internal::GetSafeSPrintfSSizeMaxForTest(); + internal::SetSafeSPrintfSSizeMaxForTest(sz); + } + + ~ScopedSafeSPrintfSSizeMaxSetter() { + internal::SetSafeSPrintfSSizeMaxForTest(old_ssize_max_); + } + + private: + size_t old_ssize_max_; + + DISALLOW_COPY_AND_ASSIGN(ScopedSafeSPrintfSSizeMaxSetter); +}; +#endif + +} // anonymous namespace + +TEST(SafeSPrintfTest, Truncation) { + // We use PrintLongString() to print a complex long string and then + // truncate to all possible lengths. This ends up exercising a lot of + // different code paths in SafeSPrintf() and IToASCII(), as truncation can + // happen in a lot of different states. 
+ char ref[256]; + PrintLongString(ref, sizeof(ref)); + for (size_t i = strlen(ref)+1; i; --i) { + char buf[sizeof(ref)]; + PrintLongString(buf, i); + EXPECT_EQ(std::string(ref, i - 1), std::string(buf)); + } + + // When compiling in debug mode, we have the ability to fake a small + // upper limit for the maximum value that can be stored in an ssize_t. + // SafeSPrintf() uses this upper limit to determine how many bytes it will + // write to the buffer, even if the caller claimed a bigger buffer size. + // Repeat the truncation test and verify that this other code path in + // SafeSPrintf() works correctly, too. +#if !defined(NDEBUG) + for (size_t i = strlen(ref)+1; i > 1; --i) { + ScopedSafeSPrintfSSizeMaxSetter ssize_max_setter(i); + char buf[sizeof(ref)]; + PrintLongString(buf, sizeof(buf)); + EXPECT_EQ(std::string(ref, i - 1), std::string(buf)); + } + + // kSSizeMax is also used to constrain the maximum amount of padding, before + // SafeSPrintf() detects an error in the format string. 
+ ScopedSafeSPrintfSSizeMaxSetter ssize_max_setter(100); + char buf[256]; + EXPECT_EQ(99, SafeSPrintf(buf, "%99c", ' ')); + EXPECT_EQ(std::string(99, ' '), std::string(buf)); + *buf = '\000'; +#if defined(ALLOW_DEATH_TEST) + EXPECT_DEATH(SafeSPrintf(buf, "%100c", ' '), "padding <= max_padding"); +#endif + EXPECT_EQ(0, *buf); +#endif +} + +TEST(SafeSPrintfTest, Padding) { + char buf[40], fmt[40]; + + // Chars %c + EXPECT_EQ(1, SafeSPrintf(buf, "%c", 'A')); + EXPECT_EQ("A", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%2c", 'A')); + EXPECT_EQ(" A", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%02c", 'A')); + EXPECT_EQ(" A", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%-2c", 'A')); + EXPECT_EQ("%-2c", std::string(buf)); + SafeSPrintf(fmt, "%%%dc", std::numeric_limits<ssize_t>::max() - 1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, SafeSPrintf(buf, fmt, 'A')); + SafeSPrintf(fmt, "%%%dc", + static_cast<size_t>(std::numeric_limits<ssize_t>::max())); +#if defined(NDEBUG) + EXPECT_EQ(2, SafeSPrintf(buf, fmt, 'A')); + EXPECT_EQ("%c", std::string(buf)); +#elif defined(ALLOW_DEATH_TEST) + EXPECT_DEATH(SafeSPrintf(buf, fmt, 'A'), "padding <= max_padding"); +#endif + + // Octal %o + EXPECT_EQ(1, SafeSPrintf(buf, "%o", 1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%2o", 1)); + EXPECT_EQ(" 1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%02o", 1)); + EXPECT_EQ("01", std::string(buf)); + EXPECT_EQ(12, SafeSPrintf(buf, "%12o", -1)); + EXPECT_EQ(" 37777777777", std::string(buf)); + EXPECT_EQ(12, SafeSPrintf(buf, "%012o", -1)); + EXPECT_EQ("037777777777", std::string(buf)); + EXPECT_EQ(23, SafeSPrintf(buf, "%23o", -1LL)); + EXPECT_EQ(" 1777777777777777777777", std::string(buf)); + EXPECT_EQ(23, SafeSPrintf(buf, "%023o", -1LL)); + EXPECT_EQ("01777777777777777777777", std::string(buf)); + EXPECT_EQ(3, SafeSPrintf(buf, "%2o", 0111)); + EXPECT_EQ("111", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%-2o", 1)); + 
EXPECT_EQ("%-2o", std::string(buf)); + SafeSPrintf(fmt, "%%%do", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, 1)); + EXPECT_EQ(" ", std::string(buf)); + SafeSPrintf(fmt, "%%0%do", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, 1)); + EXPECT_EQ("000", std::string(buf)); + SafeSPrintf(fmt, "%%%do", + static_cast<size_t>(std::numeric_limits<ssize_t>::max())); +#if defined(NDEBUG) + EXPECT_EQ(2, SafeSPrintf(buf, fmt, 1)); + EXPECT_EQ("%o", std::string(buf)); +#elif defined(ALLOW_DEATH_TEST) + EXPECT_DEATH(SafeSPrintf(buf, fmt, 1), "padding <= max_padding"); +#endif + + // Decimals %d + EXPECT_EQ(1, SafeSPrintf(buf, "%d", 1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%2d", 1)); + EXPECT_EQ(" 1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%02d", 1)); + EXPECT_EQ("01", std::string(buf)); + EXPECT_EQ(3, SafeSPrintf(buf, "%3d", -1)); + EXPECT_EQ(" -1", std::string(buf)); + EXPECT_EQ(3, SafeSPrintf(buf, "%03d", -1)); + EXPECT_EQ("-01", std::string(buf)); + EXPECT_EQ(3, SafeSPrintf(buf, "%2d", 111)); + EXPECT_EQ("111", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%2d", -111)); + EXPECT_EQ("-111", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%-2d", 1)); + EXPECT_EQ("%-2d", std::string(buf)); + SafeSPrintf(fmt, "%%%dd", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, 1)); + EXPECT_EQ(" ", std::string(buf)); + SafeSPrintf(fmt, "%%0%dd", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, 1)); + EXPECT_EQ("000", std::string(buf)); + SafeSPrintf(fmt, "%%%dd", + static_cast<size_t>(std::numeric_limits<ssize_t>::max())); +#if defined(NDEBUG) + EXPECT_EQ(2, SafeSPrintf(buf, fmt, 1)); + EXPECT_EQ("%d", std::string(buf)); +#elif defined(ALLOW_DEATH_TEST) + 
EXPECT_DEATH(SafeSPrintf(buf, fmt, 1), "padding <= max_padding"); +#endif + + // Hex %X + EXPECT_EQ(1, SafeSPrintf(buf, "%X", 1)); + EXPECT_EQ("1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%2X", 1)); + EXPECT_EQ(" 1", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%02X", 1)); + EXPECT_EQ("01", std::string(buf)); + EXPECT_EQ(9, SafeSPrintf(buf, "%9X", -1)); + EXPECT_EQ(" FFFFFFFF", std::string(buf)); + EXPECT_EQ(9, SafeSPrintf(buf, "%09X", -1)); + EXPECT_EQ("0FFFFFFFF", std::string(buf)); + EXPECT_EQ(17, SafeSPrintf(buf, "%17X", -1LL)); + EXPECT_EQ(" FFFFFFFFFFFFFFFF", std::string(buf)); + EXPECT_EQ(17, SafeSPrintf(buf, "%017X", -1LL)); + EXPECT_EQ("0FFFFFFFFFFFFFFFF", std::string(buf)); + EXPECT_EQ(3, SafeSPrintf(buf, "%2X", 0x111)); + EXPECT_EQ("111", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%-2X", 1)); + EXPECT_EQ("%-2X", std::string(buf)); + SafeSPrintf(fmt, "%%%dX", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, 1)); + EXPECT_EQ(" ", std::string(buf)); + SafeSPrintf(fmt, "%%0%dX", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, 1)); + EXPECT_EQ("000", std::string(buf)); + SafeSPrintf(fmt, "%%%dX", + static_cast<size_t>(std::numeric_limits<ssize_t>::max())); +#if defined(NDEBUG) + EXPECT_EQ(2, SafeSPrintf(buf, fmt, 1)); + EXPECT_EQ("%X", std::string(buf)); +#elif defined(ALLOW_DEATH_TEST) + EXPECT_DEATH(SafeSPrintf(buf, fmt, 1), "padding <= max_padding"); +#endif + + // Pointer %p + EXPECT_EQ(3, SafeSPrintf(buf, "%p", (void*)1)); + EXPECT_EQ("0x1", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%4p", (void*)1)); + EXPECT_EQ(" 0x1", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%04p", (void*)1)); + EXPECT_EQ("0x01", std::string(buf)); + EXPECT_EQ(5, SafeSPrintf(buf, "%4p", (void*)0x111)); + EXPECT_EQ("0x111", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%-2p", (void*)1)); + 
EXPECT_EQ("%-2p", std::string(buf)); + SafeSPrintf(fmt, "%%%dp", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, (void*)1)); + EXPECT_EQ(" ", std::string(buf)); + SafeSPrintf(fmt, "%%0%dp", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, (void*)1)); + EXPECT_EQ("0x0", std::string(buf)); + SafeSPrintf(fmt, "%%%dp", + static_cast<size_t>(std::numeric_limits<ssize_t>::max())); +#if defined(NDEBUG) + EXPECT_EQ(2, SafeSPrintf(buf, fmt, 1)); + EXPECT_EQ("%p", std::string(buf)); +#elif defined(ALLOW_DEATH_TEST) + EXPECT_DEATH(SafeSPrintf(buf, fmt, 1), "padding <= max_padding"); +#endif + + // String + EXPECT_EQ(1, SafeSPrintf(buf, "%s", "A")); + EXPECT_EQ("A", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%2s", "A")); + EXPECT_EQ(" A", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%02s", "A")); + EXPECT_EQ(" A", std::string(buf)); + EXPECT_EQ(3, SafeSPrintf(buf, "%2s", "AAA")); + EXPECT_EQ("AAA", std::string(buf)); + EXPECT_EQ(4, SafeSPrintf(buf, "%-2s", "A")); + EXPECT_EQ("%-2s", std::string(buf)); + SafeSPrintf(fmt, "%%%ds", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, "A")); + EXPECT_EQ(" ", std::string(buf)); + SafeSPrintf(fmt, "%%0%ds", std::numeric_limits<ssize_t>::max()-1); + EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, + SafeSNPrintf(buf, 4, fmt, "A")); + EXPECT_EQ(" ", std::string(buf)); + SafeSPrintf(fmt, "%%%ds", + static_cast<size_t>(std::numeric_limits<ssize_t>::max())); +#if defined(NDEBUG) + EXPECT_EQ(2, SafeSPrintf(buf, fmt, "A")); + EXPECT_EQ("%s", std::string(buf)); +#elif defined(ALLOW_DEATH_TEST) + EXPECT_DEATH(SafeSPrintf(buf, fmt, "A"), "padding <= max_padding"); +#endif +} + +TEST(SafeSPrintfTest, EmbeddedNul) { + char buf[] = { 'X', 'X', 'X', 'X' }; + EXPECT_EQ(2, SafeSPrintf(buf, "%3c", 0)); + EXPECT_EQ(' ', buf[0]); + 
EXPECT_EQ(' ', buf[1]); + EXPECT_EQ(0, buf[2]); + EXPECT_EQ('X', buf[3]); + + // Check handling of a NUL format character. N.B. this takes two different + // code paths depending on whether we are actually passing arguments. If + // we don't have any arguments, we are running in the fast-path code, that + // looks (almost) like a strncpy(). +#if defined(NDEBUG) + EXPECT_EQ(2, SafeSPrintf(buf, "%%%")); + EXPECT_EQ("%%", std::string(buf)); + EXPECT_EQ(2, SafeSPrintf(buf, "%%%", 0)); + EXPECT_EQ("%%", std::string(buf)); +#elif defined(ALLOW_DEATH_TEST) + EXPECT_DEATH(SafeSPrintf(buf, "%%%"), "src.1. == '%'"); + EXPECT_DEATH(SafeSPrintf(buf, "%%%", 0), "ch"); +#endif +} + +TEST(SafeSPrintfTest, EmitNULL) { + char buf[40]; +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion-null" +#endif + EXPECT_EQ(1, SafeSPrintf(buf, "%d", NULL)); + EXPECT_EQ("0", std::string(buf)); + EXPECT_EQ(3, SafeSPrintf(buf, "%p", NULL)); + EXPECT_EQ("0x0", std::string(buf)); + EXPECT_EQ(6, SafeSPrintf(buf, "%s", NULL)); + EXPECT_EQ("<NULL>", std::string(buf)); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +} + +TEST(SafeSPrintfTest, PointerSize) { + // The internal data representation is a 64bit value, independent of the + // native word size. We want to perform sign-extension for signed integers, + // but we want to avoid doing so for pointer types. This could be a + // problem on systems where pointers are only 32bit. This test verifies + // that there is no such problem. + char *str = reinterpret_cast<char *>(0x80000000u); + void *ptr = str; + char buf[40]; + EXPECT_EQ(10, SafeSPrintf(buf, "%p", str)); + EXPECT_EQ("0x80000000", std::string(buf)); + EXPECT_EQ(10, SafeSPrintf(buf, "%p", ptr)); + EXPECT_EQ("0x80000000", std::string(buf)); +} + +} // namespace strings +} // namespace base
diff --git a/base/strings/strcat.cc b/base/strings/strcat.cc new file mode 100644 index 0000000..1774a15 --- /dev/null +++ b/base/strings/strcat.cc
@@ -0,0 +1,91 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/strcat.h"
+
+#include <algorithm>
+
+namespace gurl_base {
+
+namespace {
+
+// Reserves an additional amount of size in the given string, growing by at
+// least 2x. Used by StrAppend().
+//
+// The "at least 2x" growing rule duplicates the exponential growth of
+// std::string. The problem is that most implementations of reserve() will grow
+// exactly to the requested amount instead of exponentially growing like would
+// happen when appending normally. If we didn't do this, an append after the
+// call to StrAppend() would definitely cause a reallocation, and loops with
+// StrAppend() calls would have O(n^2) complexity to execute. Instead, we want
+// StrAppend() to have the same semantics as std::string::append().
+//
+// If the current capacity already holds the result, nothing is reserved: an
+// unconditional reserve() could reallocate a string that has room to spare.
+//
+// If the string is empty, we assume that exponential growth is not necessary.
+template <typename String>
+void ReserveAdditional(String* str, typename String::size_type additional) {
+  const typename String::size_type required = str->size() + additional;
+  if (required <= str->capacity())
+    return;
+  str->reserve(std::max(required, str->size() * 2));
+}
+
+// Appends every element of |pieces| to |dest|. The combined size is computed
+// first so that the capacity |dest| needs is reserved in a single step.
+template <typename DestString, typename InputString>
+void StrAppendT(DestString* dest, span<const InputString> pieces) {
+  size_t additional_size = 0;
+  for (const auto& cur : pieces)
+    additional_size += cur.size();
+  ReserveAdditional(dest, additional_size);
+
+  for (const auto& cur : pieces)
+    dest->append(cur.data(), cur.size());
+}
+
+}  // namespace
+
+std::string StrCat(span<const StringPiece> pieces) {
+  std::string result;
+  StrAppendT(&result, pieces);
+  return result;
+}
+
+string16 StrCat(span<const StringPiece16> pieces) {
+  string16 result;
+  StrAppendT(&result, pieces);
+  return result;
+}
+
+std::string StrCat(span<const std::string> pieces) {
+  std::string result;
+  StrAppendT(&result, pieces);
+  return result;
+}
+
+string16 StrCat(span<const string16> pieces) {
+  string16 result;
+  StrAppendT(&result, pieces);
+  return result;
+}
+
+void StrAppend(std::string* dest, span<const StringPiece> pieces) {
+  StrAppendT(dest, pieces);
+}
+
+void StrAppend(string16* dest, span<const StringPiece16> pieces) {
+  StrAppendT(dest, pieces);
+}
+
+void StrAppend(std::string* dest, span<const std::string> pieces) {
+  StrAppendT(dest, pieces);
+}
+
+void StrAppend(string16* dest, span<const string16> pieces) {
+  StrAppendT(dest, pieces);
+}
+
+}  // namespace gurl_base
diff --git a/base/strings/strcat.h b/base/strings/strcat.h new file mode 100644 index 0000000..bcdfe17 --- /dev/null +++ b/base/strings/strcat.h
@@ -0,0 +1,101 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRCAT_H_
+#define BASE_STRINGS_STRCAT_H_
+
+#include <initializer_list>
+#include <string>
+
+#include "polyfills/base/base_export.h"
+#include "base/compiler_specific.h"
+#include "base/containers/span.h"
+#include "base/strings/string_piece.h"
+#include "build/build_config.h"
+
+#if defined(OS_WIN)
+// Guard against conflict with Win32 API StrCat macro:
+// check StrCat wasn't and will not be redefined.
+#define StrCat StrCat
+#endif
+
+namespace gurl_base {
+
+// StrCat ----------------------------------------------------------------------
+//
+// StrCat is a function to perform concatenation on a sequence of strings.
+// It is preferable to a sequence of "a + b + c" because it is both faster and
+// generates less code.
+//
+//   std::string result = gurl_base::StrCat({"foo ", foo, "\nbar ", bar});
+//
+// To join an array of strings with a separator, see gurl_base::JoinString in
+// base/strings/string_util.h.
+//
+// MORE INFO
+//
+// StrCat can see all arguments at once, so it can allocate one return buffer
+// of exactly the right size and copy once, as opposed to a sequence of
+// operator+ which generates a series of temporary strings, copying as it goes.
+// And by using StringPiece arguments, StrCat can avoid creating temporary
+// string objects for char* constants.
+//
+// ALTERNATIVES
+//
+// Internal Google / Abseil has a similar StrCat function. That version takes
+// an overloaded number of arguments instead of initializer list (overflowing
+// to initializer list for many arguments). We don't have any legacy
+// requirements and using only initializer_list is simpler and generates
+// roughly the same amount of code at the call sites.
+//
+// Abseil's StrCat also allows numbers by using an intermediate class that can
+// be implicitly constructed from either a string or various number types. This
+// class formats the numbers into a static buffer for increased performance,
+// and the call sites look nice.
+//
+// As-written Abseil's helper class for numbers generates slightly more code
+// than the raw StringPiece version. We can de-inline the helper class'
+// constructors which will cause the StringPiece constructors to be de-inlined
+// for this call and generate slightly less code. This is something we can
+// explore more in the future.
+
+BASE_EXPORT std::string StrCat(span<const StringPiece> pieces);
+BASE_EXPORT string16 StrCat(span<const StringPiece16> pieces);
+BASE_EXPORT std::string StrCat(span<const std::string> pieces);
+BASE_EXPORT string16 StrCat(span<const string16> pieces);
+
+// Initializer list forwards to the array version.
+inline std::string StrCat(std::initializer_list<StringPiece> pieces) {
+  return StrCat(make_span(pieces.begin(), pieces.size()));
+}
+inline string16 StrCat(std::initializer_list<StringPiece16> pieces) {
+  return StrCat(make_span(pieces.begin(), pieces.size()));
+}
+
+// StrAppend -------------------------------------------------------------------
+//
+// Appends a sequence of strings to a destination. Prefer:
+//   StrAppend(&foo, ...);
+// over:
+//   foo += StrCat(...);
+// because it avoids a temporary string allocation and copy.
+
+BASE_EXPORT void StrAppend(std::string* dest, span<const StringPiece> pieces);
+BASE_EXPORT void StrAppend(string16* dest, span<const StringPiece16> pieces);
+BASE_EXPORT void StrAppend(std::string* dest, span<const std::string> pieces);
+BASE_EXPORT void StrAppend(string16* dest, span<const string16> pieces);
+
+// Initializer list forwards to the array version.
+inline void StrAppend(std::string* dest,
+                      std::initializer_list<StringPiece> pieces) {
+  return StrAppend(dest, make_span(pieces.begin(), pieces.size()));
+}
+inline void StrAppend(string16* dest,
+                      std::initializer_list<StringPiece16> pieces) {
+  return StrAppend(dest, make_span(pieces.begin(), pieces.size()));
+}
+
+}  // namespace gurl_base
+
+#endif  // BASE_STRINGS_STRCAT_H_
diff --git a/base/strings/strcat_unittest.cc b/base/strings/strcat_unittest.cc new file mode 100644 index 0000000..d51b840 --- /dev/null +++ b/base/strings/strcat_unittest.cc
@@ -0,0 +1,67 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/strcat.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+TEST(StrCat, 8Bit) {
+  EXPECT_EQ("", StrCat({""}));
+  EXPECT_EQ("1", StrCat({"1"}));
+  EXPECT_EQ("122", StrCat({"1", "22"}));
+  EXPECT_EQ("122333", StrCat({"1", "22", "333"}));
+  EXPECT_EQ("1223334444", StrCat({"1", "22", "333", "4444"}));
+  EXPECT_EQ("122333444455555", StrCat({"1", "22", "333", "4444", "55555"}));
+}
+
+TEST(StrCat, 16Bit) {
+  string16 arg1 = ASCIIToUTF16("1");
+  string16 arg2 = ASCIIToUTF16("22");
+  string16 arg3 = ASCIIToUTF16("333");
+
+  EXPECT_EQ(ASCIIToUTF16(""), StrCat({string16()}));
+  EXPECT_EQ(ASCIIToUTF16("1"), StrCat({arg1}));
+  EXPECT_EQ(ASCIIToUTF16("122"), StrCat({arg1, arg2}));
+  EXPECT_EQ(ASCIIToUTF16("122333"), StrCat({arg1, arg2, arg3}));
+}
+
+TEST(StrAppend, 8Bit) {
+  std::string result;
+
+  result = "foo";
+  StrAppend(&result, {std::string()});
+  EXPECT_EQ("foo", result);
+
+  result = "foo";
+  StrAppend(&result, {"1"});
+  EXPECT_EQ("foo1", result);
+
+  result = "foo";
+  StrAppend(&result, {"1", "22", "333"});
+  EXPECT_EQ("foo122333", result);
+}
+
+TEST(StrAppend, 16Bit) {
+  string16 arg1 = ASCIIToUTF16("1");
+  string16 arg2 = ASCIIToUTF16("22");
+  string16 arg3 = ASCIIToUTF16("333");
+
+  string16 result;
+
+  result = ASCIIToUTF16("foo");
+  StrAppend(&result, {string16()});
+  EXPECT_EQ(ASCIIToUTF16("foo"), result);
+
+  result = ASCIIToUTF16("foo");
+  StrAppend(&result, {arg1});
+  EXPECT_EQ(ASCIIToUTF16("foo1"), result);
+
+  result = ASCIIToUTF16("foo");
+  StrAppend(&result, {arg1, arg2, arg3});
+  EXPECT_EQ(ASCIIToUTF16("foo122333"), result);
+}
+
+}  // namespace gurl_base
diff --git a/base/strings/string16.cc b/base/strings/string16.cc new file mode 100644 index 0000000..6ac8b8b --- /dev/null +++ b/base/strings/string16.cc
@@ -0,0 +1,97 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string16.h"
+
+#if defined(WCHAR_T_IS_UTF16) && !defined(_AIX)
+
+#error This file should not be used on 2-byte wchar_t systems
+// If this winds up being needed on 2-byte wchar_t systems, either the
+// definitions below can be used, or the host system's wide character
+// functions like wmemcmp can be wrapped.
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+#include <string.h>
+
+#include <ostream>
+
+#include "base/strings/string_piece.h"
+
+namespace gurl_base {
+
+// Compares the first |n| code units of |s1| and |s2|; returns -1, 0 or 1.
+int c16memcmp(const char16* s1, const char16* s2, size_t n) {
+  // We cannot call memcmp because that changes the semantics.
+  while (n-- > 0) {
+    if (*s1 != *s2) {
+      // We cannot use (*s1 - *s2) because char16 is unsigned.
+      return ((*s1 < *s2) ? -1 : 1);
+    }
+    ++s1;
+    ++s2;
+  }
+  return 0;
+}
+
+// Returns the number of code units that precede the terminating 0 in |s|.
+size_t c16len(const char16* s) {
+  const char16 *s_orig = s;
+  while (*s) {
+    ++s;
+  }
+  return s - s_orig;
+}
+
+// Returns a pointer to the first |c| within the first |n| code units of |s|,
+// or nullptr if there is none.
+const char16* c16memchr(const char16* s, char16 c, size_t n) {
+  while (n-- > 0) {
+    if (*s == c) {
+      return s;
+    }
+    ++s;
+  }
+  return nullptr;
+}
+
+// Copies |n| code units from |s2| to |s1| with memmove(); returns |s1|.
+char16* c16memmove(char16* s1, const char16* s2, size_t n) {
+  return static_cast<char16*>(memmove(s1, s2, n * sizeof(char16)));
+}
+
+// Copies |n| code units from |s2| to |s1| with memcpy(); returns |s1|.
+char16* c16memcpy(char16* s1, const char16* s2, size_t n) {
+  return static_cast<char16*>(memcpy(s1, s2, n * sizeof(char16)));
+}
+
+// Sets the first |n| code units of |s| to |c|; returns |s|.
+char16* c16memset(char16* s, char16 c, size_t n) {
+  char16 *s_orig = s;
+  while (n-- > 0) {
+    *s = c;
+    ++s;
+  }
+  return s_orig;
+}
+
+namespace string16_internals {
+
+// Streams |str| by way of the operator<< for StringPiece16.
+std::ostream& operator<<(std::ostream& out, const string16& str) {
+  return out << gurl_base::StringPiece16(str);
+}
+
+void PrintTo(const string16& str, std::ostream* out) {
+  *out << str;
+}
+
+}  // namespace string16_internals
+
+}  // namespace gurl_base
+
+template class std::
+    basic_string<gurl_base::char16, gurl_base::string16_internals::string16_char_traits>;
+
+#endif  // WCHAR_T_IS_UTF32
diff --git a/base/strings/string16.h b/base/strings/string16.h new file mode 100644 index 0000000..f17a57f --- /dev/null +++ b/base/strings/string16.h
@@ -0,0 +1,229 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRING16_H_
+#define BASE_STRINGS_STRING16_H_
+
+// WHAT:
+// A version of std::basic_string that provides 2-byte characters even when
+// wchar_t is not implemented as a 2-byte type. You can access this class as
+// string16. We also define char16, which string16 is based upon.
+//
+// WHY:
+// On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2
+// data. Plenty of existing code operates on strings encoded as UTF-16.
+//
+// On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make
+// it 2 bytes by using the GCC flag -fshort-wchar. But then std::wstring fails
+// at run time, because it calls some functions (like wcslen) that come from
+// the system's native C library -- which was built with a 4-byte wchar_t!
+// It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's
+// entirely improper on those systems where the encoding of wchar_t is defined
+// as UTF-32.
+//
+// Here, we define string16, which is similar to std::wstring but replaces all
+// libc functions with custom, 2-byte-char compatible routines. It is capable
+// of carrying UTF-16-encoded data.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <functional>
+#include <string>
+
+#include "polyfills/base/base_export.h"
+#include "build/build_config.h"
+
+#if defined(WCHAR_T_IS_UTF16)
+
+// Define a macro for wrapping construction of char16 arrays and string16s from
+// a literal string. This indirection allows for an easier migration of
+// gurl_base::char16 to char16_t on platforms where WCHAR_T_IS_UTF16, as only a
+// one-character change to the macro will be necessary.
+// This macro does not exist when WCHAR_T_IS_UTF32, as it is currently not
+// possible to create a char array from a literal in this case.
+// TODO(https://crbug.com/911896): Remove this macro once gurl_base::char16 is
+// char16_t on all platforms.
+#define STRING16_LITERAL(x) L##x
+
+namespace gurl_base {
+
+typedef wchar_t char16;
+typedef std::wstring string16;
+
+}  // namespace gurl_base
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+#include <wchar.h>  // for mbstate_t
+
+namespace gurl_base {
+
+typedef uint16_t char16;
+
+// char16 versions of the functions required by string16_char_traits; these
+// are based on the wide character functions of similar names ("w" or "wcs"
+// instead of "c16").
+BASE_EXPORT int c16memcmp(const char16* s1, const char16* s2, size_t n);
+BASE_EXPORT size_t c16len(const char16* s);
+BASE_EXPORT const char16* c16memchr(const char16* s, char16 c, size_t n);
+BASE_EXPORT char16* c16memmove(char16* s1, const char16* s2, size_t n);
+BASE_EXPORT char16* c16memcpy(char16* s1, const char16* s2, size_t n);
+BASE_EXPORT char16* c16memset(char16* s, char16 c, size_t n);
+
+// This namespace contains the implementation of gurl_base::string16 along with
+// things that need to be found via argument-dependent lookup from a
+// gurl_base::string16.
+namespace string16_internals {
+
+struct string16_char_traits {
+  typedef char16 char_type;
+  typedef int int_type;
+
+  // int_type needs to be able to hold each possible value of char_type, and in
+  // addition, the distinct value of eof().
+  static_assert(sizeof(int_type) > sizeof(char_type),
+                "int must be larger than 16 bits wide");
+
+  typedef std::streamoff off_type;
+  typedef mbstate_t state_type;
+  typedef std::fpos<state_type> pos_type;
+
+  static void assign(char_type& c1, const char_type& c2) {
+    c1 = c2;
+  }
+
+  static bool eq(const char_type& c1, const char_type& c2) {
+    return c1 == c2;
+  }
+  static bool lt(const char_type& c1, const char_type& c2) {
+    return c1 < c2;
+  }
+
+  static int compare(const char_type* s1, const char_type* s2, size_t n) {
+    return c16memcmp(s1, s2, n);
+  }
+
+  static size_t length(const char_type* s) {
+    return c16len(s);
+  }
+
+  static const char_type* find(const char_type* s, size_t n,
+                               const char_type& a) {
+    return c16memchr(s, a, n);
+  }
+
+  static char_type* move(char_type* s1, const char_type* s2, size_t n) {
+    return c16memmove(s1, s2, n);
+  }
+
+  static char_type* copy(char_type* s1, const char_type* s2, size_t n) {
+    return c16memcpy(s1, s2, n);
+  }
+
+  static char_type* assign(char_type* s, size_t n, char_type a) {
+    return c16memset(s, a, n);
+  }
+
+  static int_type not_eof(const int_type& c) {
+    return eq_int_type(c, eof()) ? 0 : c;
+  }
+
+  static char_type to_char_type(const int_type& c) {
+    return char_type(c);
+  }
+
+  static int_type to_int_type(const char_type& c) {
+    return int_type(c);
+  }
+
+  static bool eq_int_type(const int_type& c1, const int_type& c2) {
+    return c1 == c2;
+  }
+
+  static int_type eof() {
+    return static_cast<int_type>(EOF);
+  }
+};
+
+}  // namespace string16_internals
+
+typedef std::basic_string<char16,
+                          gurl_base::string16_internals::string16_char_traits>
+    string16;
+
+namespace string16_internals {
+
+BASE_EXPORT extern std::ostream& operator<<(std::ostream& out,
+                                            const string16& str);
+
+// This is required by googletest to print a readable output on test failures.
+BASE_EXPORT extern void PrintTo(const string16& str, std::ostream* out);
+
+}  // namespace string16_internals
+
+}  // namespace gurl_base
+
+// The string class will be explicitly instantiated only once, in string16.cc.
+//
+// std::basic_string<> in GNU libstdc++ contains a static data member,
+// _S_empty_rep_storage, to represent empty strings. When an operation such
+// as assignment or destruction is performed on a string, causing its existing
+// data member to be invalidated, it must not be freed if this static data
+// member is being used. Otherwise, it counts as an attempt to free static
+// (and not allocated) data, which is a memory error.
+//
+// Generally, due to C++ template magic, _S_empty_rep_storage will be marked
+// as a coalesced symbol, meaning that the linker will combine multiple
+// instances into a single one when generating output.
+//
+// If a string class is used by multiple shared libraries, a problem occurs.
+// Each library will get its own copy of _S_empty_rep_storage. When strings
+// are passed across a library boundary for alteration or destruction, memory
+// errors will result. GNU libstdc++ contains a configuration option,
+// --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which
+// disables the static data member optimization, but it's a good optimization
+// and non-STL code is generally at the mercy of the system's STL
+// configuration. Fully-dynamic strings are not the default for GNU
+// libstdc++ itself or for the libstdc++ installations on the systems we care
+// about, such as Mac OS X and relevant flavors of Linux.
+//
+// See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 .
+//
+// To avoid problems, string classes need to be explicitly instantiated only
+// once, in exactly one library. All other string users see it via an "extern"
+// declaration. This is precisely how GNU libstdc++ handles
+// std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring).
+//
+// This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2),
+// in which the linker does not fully coalesce symbols when dead code
+// stripping is enabled. This bug causes the memory errors described above
+// to occur even when a std::basic_string<> does not cross shared library
+// boundaries, such as in statically-linked executables.
+//
+// TODO(mark): File this bug with Apple and update this note with a bug number.
+
+extern template class BASE_EXPORT
+    std::basic_string<gurl_base::char16,
+                      gurl_base::string16_internals::string16_char_traits>;
+
+// Specialize std::hash for gurl_base::string16. Although the style guide forbids
+// this in general, it is necessary for consistency with WCHAR_T_IS_UTF16
+// platforms, where gurl_base::string16 is a type alias for std::wstring.
+namespace std {
+template <>
+struct hash<gurl_base::string16> {
+  std::size_t operator()(const gurl_base::string16& s) const {
+    std::size_t result = 0;
+    for (gurl_base::char16 c : s)
+      result = (result * 131) + c;
+    return result;
+  }
+};
+}  // namespace std
+
+#endif  // WCHAR_T_IS_UTF32
+
+#endif  // BASE_STRINGS_STRING16_H_
diff --git a/base/strings/string16_unittest.cc b/base/strings/string16_unittest.cc new file mode 100644 index 0000000..a9aecef --- /dev/null +++ b/base/strings/string16_unittest.cc
@@ -0,0 +1,75 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <sstream>
+#include <unordered_set>
+
+#include "base/strings/string16.h"
+
+#include "base/strings/utf_string_conversions.h"
+#include "build/build_config.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+#if defined(WCHAR_T_IS_UTF16)
+TEST(String16Test, String16Literal) {
+  static constexpr char16 kHelloWorld[] = STRING16_LITERAL("Hello, World");
+  string16 hello_world = kHelloWorld;
+  EXPECT_EQ(kHelloWorld, hello_world);
+}
+#endif
+
+// We define a custom operator<< for string16 so we can use it with logging.
+// This tests that conversion.
+TEST(String16Test, OutputStream) {
+  // Basic stream test.
+  {
+    std::ostringstream stream;
+    stream << "Empty '" << string16() << "' standard '"
+           << string16(ASCIIToUTF16("Hello, world")) << "'";
+    EXPECT_STREQ("Empty '' standard 'Hello, world'",
+                 stream.str().c_str());
+  }
+
+  // Interesting edge cases.
+  {
+    // These should each get converted to the invalid character: EF BF BD.
+    string16 initial_surrogate;
+    initial_surrogate.push_back(0xd800);
+    string16 final_surrogate;
+    final_surrogate.push_back(0xdc00);
+
+    // Old italic A = U+10300, will get converted to: F0 90 8C 80 'z'.
+    string16 surrogate_pair;
+    surrogate_pair.push_back(0xd800);
+    surrogate_pair.push_back(0xdf00);
+    surrogate_pair.push_back('z');
+
+    // Will get converted to the invalid char + 's': EF BF BD 's'.
+    string16 unterminated_surrogate;
+    unterminated_surrogate.push_back(0xd800);
+    unterminated_surrogate.push_back('s');
+
+    std::ostringstream stream;
+    stream << initial_surrogate << "," << final_surrogate << ","
+           << surrogate_pair << "," << unterminated_surrogate;
+
+    EXPECT_STREQ("\xef\xbf\xbd,\xef\xbf\xbd,\xf0\x90\x8c\x80z,\xef\xbf\xbds",
+                 stream.str().c_str());
+  }
+}
+
+TEST(String16Test, Hash) {
+  string16 str1 = ASCIIToUTF16("hello");
+  string16 str2 = ASCIIToUTF16("world");
+
+  std::unordered_set<string16> set;
+
+  set.insert(str1);
+  EXPECT_EQ(1u, set.count(str1));
+  EXPECT_EQ(0u, set.count(str2));
+}
+
+}  // namespace gurl_base
diff --git a/base/strings/string_number_conversions.cc b/base/strings/string_number_conversions.cc new file mode 100644 index 0000000..2bf6142 --- /dev/null +++ b/base/strings/string_number_conversions.cc
@@ -0,0 +1,505 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_number_conversions.h" + +#include <ctype.h> +#include <errno.h> +#include <stdlib.h> +#include <wctype.h> + +#include <limits> +#include <type_traits> + +#include "polyfills/base/logging.h" +#include "base/numerics/safe_math.h" +#include "base/scoped_clear_last_error.h" +#include "base/strings/utf_string_conversions.h" +#include "base/third_party/dmg_fp/dmg_fp.h" + +namespace gurl_base { + +namespace { + +template <typename STR, typename INT> +struct IntToStringT { + static STR IntToString(INT value) { + // log10(2) ~= 0.3 bytes needed per bit or per byte log10(2**8) ~= 2.4. + // So round up to allocate 3 output characters per byte, plus 1 for '-'. + const size_t kOutputBufSize = + 3 * sizeof(INT) + std::numeric_limits<INT>::is_signed; + + // Create the string in a temporary buffer, write it back to front, and + // then return the substr of what we ended up using. + using CHR = typename STR::value_type; + CHR outbuf[kOutputBufSize]; + + // The ValueOrDie call below can never fail, because UnsignedAbs is valid + // for all valid inputs. 
+ typename std::make_unsigned<INT>::type res = + CheckedNumeric<INT>(value).UnsignedAbs().ValueOrDie(); + + CHR* end = outbuf + kOutputBufSize; + CHR* i = end; + do { + --i; + GURL_DCHECK(i != outbuf); + *i = static_cast<CHR>((res % 10) + '0'); + res /= 10; + } while (res != 0); + if (IsValueNegative(value)) { + --i; + GURL_DCHECK(i != outbuf); + *i = static_cast<CHR>('-'); + } + return STR(i, end); + } +}; + +// Utility to convert a character to a digit in a given base +template<typename CHAR, int BASE, bool BASE_LTE_10> class BaseCharToDigit { +}; + +// Faster specialization for bases <= 10 +template<typename CHAR, int BASE> class BaseCharToDigit<CHAR, BASE, true> { + public: + static bool Convert(CHAR c, uint8_t* digit) { + if (c >= '0' && c < '0' + BASE) { + *digit = static_cast<uint8_t>(c - '0'); + return true; + } + return false; + } +}; + +// Specialization for bases where 10 < base <= 36 +template<typename CHAR, int BASE> class BaseCharToDigit<CHAR, BASE, false> { + public: + static bool Convert(CHAR c, uint8_t* digit) { + if (c >= '0' && c <= '9') { + *digit = c - '0'; + } else if (c >= 'a' && c < 'a' + BASE - 10) { + *digit = c - 'a' + 10; + } else if (c >= 'A' && c < 'A' + BASE - 10) { + *digit = c - 'A' + 10; + } else { + return false; + } + return true; + } +}; + +template <int BASE, typename CHAR> +bool CharToDigit(CHAR c, uint8_t* digit) { + return BaseCharToDigit<CHAR, BASE, BASE <= 10>::Convert(c, digit); +} + +// There is an IsUnicodeWhitespace for wchars defined in string_util.h, but it +// is locale independent, whereas the functions we are replacing were +// locale-dependent. TBD what is desired, but for the moment let's not +// introduce a change in behaviour. 
+template<typename CHAR> class WhitespaceHelper { +}; + +template<> class WhitespaceHelper<char> { + public: + static bool Invoke(char c) { + return 0 != isspace(static_cast<unsigned char>(c)); + } +}; + +template<> class WhitespaceHelper<char16> { + public: + static bool Invoke(char16 c) { + return 0 != iswspace(c); + } +}; + +template<typename CHAR> bool LocalIsWhitespace(CHAR c) { + return WhitespaceHelper<CHAR>::Invoke(c); +} + +// IteratorRangeToNumberTraits should provide: +// - a typedef for iterator_type, the iterator type used as input. +// - a typedef for value_type, the target numeric type. +// - static functions min, max (returning the minimum and maximum permitted +// values) +// - constant kBase, the base in which to interpret the input +template<typename IteratorRangeToNumberTraits> +class IteratorRangeToNumber { + public: + typedef IteratorRangeToNumberTraits traits; + typedef typename traits::iterator_type const_iterator; + typedef typename traits::value_type value_type; + + // Generalized iterator-range-to-number conversion. + // + static bool Invoke(const_iterator begin, + const_iterator end, + value_type* output) { + bool valid = true; + + while (begin != end && LocalIsWhitespace(*begin)) { + valid = false; + ++begin; + } + + if (begin != end && *begin == '-') { + if (!std::numeric_limits<value_type>::is_signed) { + *output = 0; + valid = false; + } else if (!Negative::Invoke(begin + 1, end, output)) { + valid = false; + } + } else { + if (begin != end && *begin == '+') { + ++begin; + } + if (!Positive::Invoke(begin, end, output)) { + valid = false; + } + } + + return valid; + } + + private: + // Sign provides: + // - a static function, CheckBounds, that determines whether the next digit + // causes an overflow/underflow + // - a static function, Increment, that appends the next digit appropriately + // according to the sign of the number being parsed. 
+ template<typename Sign> + class Base { + public: + static bool Invoke(const_iterator begin, const_iterator end, + typename traits::value_type* output) { + *output = 0; + + if (begin == end) { + return false; + } + + // Note: no performance difference was found when using template + // specialization to remove this check in bases other than 16 + if (traits::kBase == 16 && end - begin > 2 && *begin == '0' && + (*(begin + 1) == 'x' || *(begin + 1) == 'X')) { + begin += 2; + } + + for (const_iterator current = begin; current != end; ++current) { + uint8_t new_digit = 0; + + if (!CharToDigit<traits::kBase>(*current, &new_digit)) { + return false; + } + + if (current != begin) { + if (!Sign::CheckBounds(output, new_digit)) { + return false; + } + *output *= traits::kBase; + } + + Sign::Increment(new_digit, output); + } + return true; + } + }; + + class Positive : public Base<Positive> { + public: + static bool CheckBounds(value_type* output, uint8_t new_digit) { + if (*output > static_cast<value_type>(traits::max() / traits::kBase) || + (*output == static_cast<value_type>(traits::max() / traits::kBase) && + new_digit > traits::max() % traits::kBase)) { + *output = traits::max(); + return false; + } + return true; + } + static void Increment(uint8_t increment, value_type* output) { + *output += increment; + } + }; + + class Negative : public Base<Negative> { + public: + static bool CheckBounds(value_type* output, uint8_t new_digit) { + if (*output < traits::min() / traits::kBase || + (*output == traits::min() / traits::kBase && + new_digit > 0 - traits::min() % traits::kBase)) { + *output = traits::min(); + return false; + } + return true; + } + static void Increment(uint8_t increment, value_type* output) { + *output -= increment; + } + }; +}; + +template<typename ITERATOR, typename VALUE, int BASE> +class BaseIteratorRangeToNumberTraits { + public: + typedef ITERATOR iterator_type; + typedef VALUE value_type; + static value_type min() { + return 
std::numeric_limits<value_type>::min(); + } + static value_type max() { + return std::numeric_limits<value_type>::max(); + } + static const int kBase = BASE; +}; + +template<typename ITERATOR> +class BaseHexIteratorRangeToIntTraits + : public BaseIteratorRangeToNumberTraits<ITERATOR, int, 16> { +}; + +template <typename ITERATOR> +class BaseHexIteratorRangeToUIntTraits + : public BaseIteratorRangeToNumberTraits<ITERATOR, uint32_t, 16> {}; + +template <typename ITERATOR> +class BaseHexIteratorRangeToInt64Traits + : public BaseIteratorRangeToNumberTraits<ITERATOR, int64_t, 16> {}; + +template <typename ITERATOR> +class BaseHexIteratorRangeToUInt64Traits + : public BaseIteratorRangeToNumberTraits<ITERATOR, uint64_t, 16> {}; + +typedef BaseHexIteratorRangeToIntTraits<StringPiece::const_iterator> + HexIteratorRangeToIntTraits; + +typedef BaseHexIteratorRangeToUIntTraits<StringPiece::const_iterator> + HexIteratorRangeToUIntTraits; + +typedef BaseHexIteratorRangeToInt64Traits<StringPiece::const_iterator> + HexIteratorRangeToInt64Traits; + +typedef BaseHexIteratorRangeToUInt64Traits<StringPiece::const_iterator> + HexIteratorRangeToUInt64Traits; + +template <typename VALUE, int BASE> +class StringPieceToNumberTraits + : public BaseIteratorRangeToNumberTraits<StringPiece::const_iterator, + VALUE, + BASE> { +}; + +template <typename VALUE> +bool StringToIntImpl(StringPiece input, VALUE* output) { + return IteratorRangeToNumber<StringPieceToNumberTraits<VALUE, 10> >::Invoke( + input.begin(), input.end(), output); +} + +template <typename VALUE, int BASE> +class StringPiece16ToNumberTraits + : public BaseIteratorRangeToNumberTraits<StringPiece16::const_iterator, + VALUE, + BASE> { +}; + +template <typename VALUE> +bool String16ToIntImpl(StringPiece16 input, VALUE* output) { + return IteratorRangeToNumber<StringPiece16ToNumberTraits<VALUE, 10> >::Invoke( + input.begin(), input.end(), output); +} + +} // namespace + +std::string NumberToString(int value) { + return 
IntToStringT<std::string, int>::IntToString(value); +} + +string16 NumberToString16(int value) { + return IntToStringT<string16, int>::IntToString(value); +} + +std::string NumberToString(unsigned value) { + return IntToStringT<std::string, unsigned>::IntToString(value); +} + +string16 NumberToString16(unsigned value) { + return IntToStringT<string16, unsigned>::IntToString(value); +} + +std::string NumberToString(long value) { + return IntToStringT<std::string, long>::IntToString(value); +} + +string16 NumberToString16(long value) { + return IntToStringT<string16, long>::IntToString(value); +} + +std::string NumberToString(unsigned long value) { + return IntToStringT<std::string, unsigned long>::IntToString(value); +} + +string16 NumberToString16(unsigned long value) { + return IntToStringT<string16, unsigned long>::IntToString(value); +} + +std::string NumberToString(long long value) { + return IntToStringT<std::string, long long>::IntToString(value); +} + +string16 NumberToString16(long long value) { + return IntToStringT<string16, long long>::IntToString(value); +} + +std::string NumberToString(unsigned long long value) { + return IntToStringT<std::string, unsigned long long>::IntToString(value); +} + +string16 NumberToString16(unsigned long long value) { + return IntToStringT<string16, unsigned long long>::IntToString(value); +} + +std::string NumberToString(double value) { + // According to g_fmt.cc, it is sufficient to declare a buffer of size 32. + char buffer[32]; + dmg_fp::g_fmt(buffer, value); + return std::string(buffer); +} + +gurl_base::string16 NumberToString16(double value) { + // According to g_fmt.cc, it is sufficient to declare a buffer of size 32. + char buffer[32]; + dmg_fp::g_fmt(buffer, value); + + // The number will be ASCII. This creates the string using the "input + // iterator" variant which promotes from 8-bit to 16-bit via "=". 
+ return gurl_base::string16(&buffer[0], &buffer[strlen(buffer)]); +} + +bool StringToInt(StringPiece input, int* output) { + return StringToIntImpl(input, output); +} + +bool StringToInt(StringPiece16 input, int* output) { + return String16ToIntImpl(input, output); +} + +bool StringToUint(StringPiece input, unsigned* output) { + return StringToIntImpl(input, output); +} + +bool StringToUint(StringPiece16 input, unsigned* output) { + return String16ToIntImpl(input, output); +} + +bool StringToInt64(StringPiece input, int64_t* output) { + return StringToIntImpl(input, output); +} + +bool StringToInt64(StringPiece16 input, int64_t* output) { + return String16ToIntImpl(input, output); +} + +bool StringToUint64(StringPiece input, uint64_t* output) { + return StringToIntImpl(input, output); +} + +bool StringToUint64(StringPiece16 input, uint64_t* output) { + return String16ToIntImpl(input, output); +} + +bool StringToSizeT(StringPiece input, size_t* output) { + return StringToIntImpl(input, output); +} + +bool StringToSizeT(StringPiece16 input, size_t* output) { + return String16ToIntImpl(input, output); +} + +bool StringToDouble(const std::string& input, double* output) { + // Thread-safe? It is on at least Mac, Linux, and Windows. + internal::ScopedClearLastError clear_errno; + + char* endptr = nullptr; + *output = dmg_fp::strtod(input.c_str(), &endptr); + + // Cases to return false: + // - If errno is ERANGE, there was an overflow or underflow. + // - If the input string is empty, there was nothing to parse. + // - If endptr does not point to the end of the string, there are either + // characters remaining in the string after a parsed number, or the string + // does not begin with a parseable number. endptr is compared to the + // expected end given the string's stated length to correctly catch cases + // where the string contains embedded NUL characters. 
+ // - If the first character is a space, there was leading whitespace + return errno == 0 && + !input.empty() && + input.c_str() + input.length() == endptr && + !isspace(input[0]); +} + +// Note: if you need to add String16ToDouble, first ask yourself if it's +// really necessary. If it is, probably the best implementation here is to +// convert to 8-bit and then use the 8-bit version. + +// Note: if you need to add an iterator range version of StringToDouble, first +// ask yourself if it's really necessary. If it is, probably the best +// implementation here is to instantiate a string and use the string version. + +std::string HexEncode(const void* bytes, size_t size) { + static const char kHexChars[] = "0123456789ABCDEF"; + + // Each input byte creates two output hex characters. + std::string ret(size * 2, '\0'); + + for (size_t i = 0; i < size; ++i) { + char b = reinterpret_cast<const char*>(bytes)[i]; + ret[(i * 2)] = kHexChars[(b >> 4) & 0xf]; + ret[(i * 2) + 1] = kHexChars[b & 0xf]; + } + return ret; +} + +std::string HexEncode(gurl_base::span<const uint8_t> bytes) { + return HexEncode(bytes.data(), bytes.size()); +} + +bool HexStringToInt(StringPiece input, int* output) { + return IteratorRangeToNumber<HexIteratorRangeToIntTraits>::Invoke( + input.begin(), input.end(), output); +} + +bool HexStringToUInt(StringPiece input, uint32_t* output) { + return IteratorRangeToNumber<HexIteratorRangeToUIntTraits>::Invoke( + input.begin(), input.end(), output); +} + +bool HexStringToInt64(StringPiece input, int64_t* output) { + return IteratorRangeToNumber<HexIteratorRangeToInt64Traits>::Invoke( + input.begin(), input.end(), output); +} + +bool HexStringToUInt64(StringPiece input, uint64_t* output) { + return IteratorRangeToNumber<HexIteratorRangeToUInt64Traits>::Invoke( + input.begin(), input.end(), output); +} + +bool HexStringToBytes(StringPiece input, std::vector<uint8_t>* output) { + GURL_DCHECK_EQ(output->size(), 0u); + size_t count = input.size(); + if (count == 
0 || (count % 2) != 0) + return false; + for (uintptr_t i = 0; i < count / 2; ++i) { + uint8_t msb = 0; // most significant 4 bits + uint8_t lsb = 0; // least significant 4 bits + if (!CharToDigit<16>(input[i * 2], &msb) || + !CharToDigit<16>(input[i * 2 + 1], &lsb)) { + return false; + } + output->push_back((msb << 4) | lsb); + } + return true; +} + +} // namespace base
diff --git a/base/strings/string_number_conversions.h b/base/strings/string_number_conversions.h new file mode 100644 index 0000000..a3acab8 --- /dev/null +++ b/base/strings/string_number_conversions.h
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_STRINGS_STRING_NUMBER_CONVERSIONS_H_
#define BASE_STRINGS_STRING_NUMBER_CONVERSIONS_H_

#include <stddef.h>
#include <stdint.h>

#include <string>
#include <vector>

#include "polyfills/base/base_export.h"
#include "base/containers/span.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"
#include "build/build_config.h"

// ----------------------------------------------------------------------------
// IMPORTANT MESSAGE FROM YOUR SPONSOR
//
// This file contains no "wstring" variants. New code should use string16. If
// you need to make old code work, use the UTF8 version and convert. Please do
// not add wstring variants.
//
// Please do not add "convenience" functions for converting strings to integers
// that return the value and ignore success/failure. That encourages people to
// write code that doesn't properly handle the error conditions.
//
// DO NOT use these functions in any UI unless it's NOT localized on purpose.
// Instead, use gurl_base::MessageFormatter for a complex message with numbers
// (integer, float, double) embedded or
// gurl_base::Format{Number,Double,Percent} to just format a single
// number/percent. Note that some languages use native digits instead of ASCII
// digits while others use a group separator or decimal point different from
// ',' and '.'. Using these functions in the UI would lead numbers to be
// formatted in a non-native way.
// ----------------------------------------------------------------------------

namespace gurl_base {

// Number -> string conversions ------------------------------------------------

// Ignores locale! see warning above.
BASE_EXPORT std::string NumberToString(int value);
BASE_EXPORT string16 NumberToString16(int value);
BASE_EXPORT std::string NumberToString(unsigned int value);
BASE_EXPORT string16 NumberToString16(unsigned int value);
BASE_EXPORT std::string NumberToString(long value);
BASE_EXPORT string16 NumberToString16(long value);
BASE_EXPORT std::string NumberToString(unsigned long value);
BASE_EXPORT string16 NumberToString16(unsigned long value);
BASE_EXPORT std::string NumberToString(long long value);
BASE_EXPORT string16 NumberToString16(long long value);
BASE_EXPORT std::string NumberToString(unsigned long long value);
BASE_EXPORT string16 NumberToString16(unsigned long long value);
BASE_EXPORT std::string NumberToString(double value);
BASE_EXPORT string16 NumberToString16(double value);

// String -> number conversions ------------------------------------------------

// Perform a best-effort conversion of the input string to a numeric type,
// setting |*output| to the result of the conversion.  Returns true for
// "perfect" conversions; returns false in the following cases:
//  - Overflow. |*output| will be set to the maximum value supported
//    by the data type.
//  - Underflow. |*output| will be set to the minimum value supported
//    by the data type.
//  - Trailing characters in the string after parsing the number.  |*output|
//    will be set to the value of the number that was parsed.
//  - Leading whitespace in the string before parsing the number. |*output| will
//    be set to the value of the number that was parsed.
//  - No characters parseable as a number at the beginning of the string.
//    |*output| will be set to 0.
//  - Empty string.  |*output| will be set to 0.
// WARNING: Will write to |output| even when returning false.
//          Read the comments above carefully.
BASE_EXPORT bool StringToInt(StringPiece input, int* output);
BASE_EXPORT bool StringToInt(StringPiece16 input, int* output);

BASE_EXPORT bool StringToUint(StringPiece input, unsigned* output);
BASE_EXPORT bool StringToUint(StringPiece16 input, unsigned* output);

BASE_EXPORT bool StringToInt64(StringPiece input, int64_t* output);
BASE_EXPORT bool StringToInt64(StringPiece16 input, int64_t* output);

BASE_EXPORT bool StringToUint64(StringPiece input, uint64_t* output);
BASE_EXPORT bool StringToUint64(StringPiece16 input, uint64_t* output);

BASE_EXPORT bool StringToSizeT(StringPiece input, size_t* output);
BASE_EXPORT bool StringToSizeT(StringPiece16 input, size_t* output);

// For floating-point conversions, only conversions of input strings in decimal
// form are defined to work.  Behavior with strings representing floating-point
// numbers in hexadecimal, and strings representing non-finite values (such as
// NaN and inf) is undefined.  Otherwise, these behave the same as the integral
// variants.  This expects the input string to NOT be specific to the locale.
// If your input is locale specific, use ICU to read the number.
// WARNING: Will write to |output| even when returning false.
//          Read the comments here and above StringToInt() carefully.
BASE_EXPORT bool StringToDouble(const std::string& input, double* output);

// Hex encoding ----------------------------------------------------------------

// Returns a hex string representation of a binary buffer. The returned hex
// string will be in upper case. This function does not check if |size| is
// within reasonable limits since it's written with trusted data in mind.  If
// you suspect that the data you want to format might be large, the absolute
// max size for |size| is
//   std::numeric_limits<size_t>::max() / 2
BASE_EXPORT std::string HexEncode(const void* bytes, size_t size);
BASE_EXPORT std::string HexEncode(gurl_base::span<const uint8_t> bytes);

// Best effort conversion, see StringToInt above for restrictions.
// Will only successfully parse hex values that will fit into |output|, i.e.
// -0x80000000 <= |input| <= 0x7FFFFFFF.
BASE_EXPORT bool HexStringToInt(StringPiece input, int* output);

// Best effort conversion, see StringToInt above for restrictions.
// Will only successfully parse hex values that will fit into |output|, i.e.
// 0x00000000 <= |input| <= 0xFFFFFFFF.
// The string is not required to start with 0x.
BASE_EXPORT bool HexStringToUInt(StringPiece input, uint32_t* output);

// Best effort conversion, see StringToInt above for restrictions.
// Will only successfully parse hex values that will fit into |output|, i.e.
// -0x8000000000000000 <= |input| <= 0x7FFFFFFFFFFFFFFF.
BASE_EXPORT bool HexStringToInt64(StringPiece input, int64_t* output);

// Best effort conversion, see StringToInt above for restrictions.
// Will only successfully parse hex values that will fit into |output|, i.e.
// 0x0000000000000000 <= |input| <= 0xFFFFFFFFFFFFFFFF.
// The string is not required to start with 0x.
BASE_EXPORT bool HexStringToUInt64(StringPiece input, uint64_t* output);

// Similar to the previous functions, except that output is a vector of bytes.
// |*output| will contain as many bytes as were successfully parsed prior to the
// error.  There is no overflow, but input.size() must be evenly divisible by 2.
// Leading 0x or +/- are not allowed.
BASE_EXPORT bool HexStringToBytes(StringPiece input,
                                  std::vector<uint8_t>* output);

}  // namespace gurl_base

#endif  // BASE_STRINGS_STRING_NUMBER_CONVERSIONS_H_
diff --git a/base/strings/string_number_conversions_fuzzer.cc b/base/strings/string_number_conversions_fuzzer.cc new file mode 100644 index 0000000..012887a --- /dev/null +++ b/base/strings/string_number_conversions_fuzzer.cc
@@ -0,0 +1,118 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <stddef.h> +#include <stdint.h> + +#include <string> +#include <vector> + +#include "base/strings/string_number_conversions.h" + +template <class NumberType, class StringPieceType, class StringType> +void CheckRoundtripsT(const uint8_t* data, + const size_t size, + StringType (*num_to_string)(NumberType), + bool (*string_to_num)(StringPieceType, NumberType*)) { + // Ensure we can read a NumberType from |data| + if (size < sizeof(NumberType)) + return; + const NumberType v1 = *reinterpret_cast<const NumberType*>(data); + + // Because we started with an arbitrary NumberType value, not an arbitrary + // string, we expect that the function |string_to_num| (e.g. StringToInt) will + // return true, indicating a perfect conversion. + NumberType v2; + GURL_CHECK(string_to_num(num_to_string(v1), &v2)); + + // Given that this was a perfect conversion, we expect the original NumberType + // value to equal the newly parsed one. + GURL_CHECK_EQ(v1, v2); +} + +template <class NumberType> +void CheckRoundtrips(const uint8_t* data, + const size_t size, + bool (*string_to_num)(gurl_base::StringPiece, NumberType*)) { + return CheckRoundtripsT<NumberType, gurl_base::StringPiece, std::string>( + data, size, &gurl_base::NumberToString, string_to_num); +} + +template <class NumberType> +void CheckRoundtrips16(const uint8_t* data, + const size_t size, + bool (*string_to_num)(gurl_base::StringPiece16, + NumberType*)) { + return CheckRoundtripsT<NumberType, gurl_base::StringPiece16, gurl_base::string16>( + data, size, &gurl_base::NumberToString16, string_to_num); +} + +// Entry point for LibFuzzer. 
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + // For each instantiation of NumberToString f and its corresponding StringTo* + // function g, check that f(g(x)) = x holds for fuzzer-determined values of x. + CheckRoundtrips<int>(data, size, &gurl_base::StringToInt); + CheckRoundtrips16<int>(data, size, &gurl_base::StringToInt); + CheckRoundtrips<unsigned int>(data, size, &gurl_base::StringToUint); + CheckRoundtrips16<unsigned int>(data, size, &gurl_base::StringToUint); + CheckRoundtrips<int64_t>(data, size, &gurl_base::StringToInt64); + CheckRoundtrips16<int64_t>(data, size, &gurl_base::StringToInt64); + CheckRoundtrips<uint64_t>(data, size, &gurl_base::StringToUint64); + CheckRoundtrips16<uint64_t>(data, size, &gurl_base::StringToUint64); + CheckRoundtrips<size_t>(data, size, &gurl_base::StringToSizeT); + CheckRoundtrips16<size_t>(data, size, &gurl_base::StringToSizeT); + + gurl_base::StringPiece string_piece_input(reinterpret_cast<const char*>(data), + size); + std::string string_input(reinterpret_cast<const char*>(data), size); + + int out_int; + gurl_base::StringToInt(string_piece_input, &out_int); + unsigned out_uint; + gurl_base::StringToUint(string_piece_input, &out_uint); + int64_t out_int64; + gurl_base::StringToInt64(string_piece_input, &out_int64); + uint64_t out_uint64; + gurl_base::StringToUint64(string_piece_input, &out_uint64); + size_t out_size; + gurl_base::StringToSizeT(string_piece_input, &out_size); + + // Test for StringPiece16 if size is even. 
+ if (size % 2 == 0) { + gurl_base::StringPiece16 string_piece_input16( + reinterpret_cast<const gurl_base::char16*>(data), size / 2); + + gurl_base::StringToInt(string_piece_input16, &out_int); + gurl_base::StringToUint(string_piece_input16, &out_uint); + gurl_base::StringToInt64(string_piece_input16, &out_int64); + gurl_base::StringToUint64(string_piece_input16, &out_uint64); + gurl_base::StringToSizeT(string_piece_input16, &out_size); + } + + double out_double; + gurl_base::StringToDouble(string_input, &out_double); + + gurl_base::HexStringToInt(string_piece_input, &out_int); + gurl_base::HexStringToUInt(string_piece_input, &out_uint); + gurl_base::HexStringToInt64(string_piece_input, &out_int64); + gurl_base::HexStringToUInt64(string_piece_input, &out_uint64); + std::vector<uint8_t> out_bytes; + gurl_base::HexStringToBytes(string_piece_input, &out_bytes); + + gurl_base::HexEncode(data, size); + + // Convert the numbers back to strings. + gurl_base::NumberToString(out_int); + gurl_base::NumberToString16(out_int); + gurl_base::NumberToString(out_uint); + gurl_base::NumberToString16(out_uint); + gurl_base::NumberToString(out_int64); + gurl_base::NumberToString16(out_int64); + gurl_base::NumberToString(out_uint64); + gurl_base::NumberToString16(out_uint64); + gurl_base::NumberToString(out_double); + gurl_base::NumberToString16(out_double); + + return 0; +}
diff --git a/base/strings/string_number_conversions_unittest.cc b/base/strings/string_number_conversions_unittest.cc new file mode 100644 index 0000000..93405e2 --- /dev/null +++ b/base/strings/string_number_conversions_unittest.cc
@@ -0,0 +1,901 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_number_conversions.h" + +#include <errno.h> +#include <limits.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +#include <cmath> +#include <limits> + +#include "base/bit_cast.h" +#include "base/format_macros.h" +#include "base/stl_util.h" +#include "base/strings/stringprintf.h" +#include "base/strings/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace gurl_base { + +namespace { + +template <typename INT> +struct NumberToStringTest { + INT num; + const char* sexpected; + const char* uexpected; +}; + +} // namespace + +TEST(StringNumberConversionsTest, NumberToString) { + static const NumberToStringTest<int> int_tests[] = { + {0, "0", "0"}, + {-1, "-1", "4294967295"}, + {std::numeric_limits<int>::max(), "2147483647", "2147483647"}, + {std::numeric_limits<int>::min(), "-2147483648", "2147483648"}, + }; + static const NumberToStringTest<int64_t> int64_tests[] = { + {0, "0", "0"}, + {-1, "-1", "18446744073709551615"}, + { + std::numeric_limits<int64_t>::max(), "9223372036854775807", + "9223372036854775807", + }, + {std::numeric_limits<int64_t>::min(), "-9223372036854775808", + "9223372036854775808"}, + }; + + for (const auto& test : int_tests) { + EXPECT_EQ(NumberToString(test.num), test.sexpected); + EXPECT_EQ(NumberToString16(test.num), UTF8ToUTF16(test.sexpected)); + EXPECT_EQ(NumberToString(static_cast<unsigned>(test.num)), test.uexpected); + EXPECT_EQ(NumberToString16(static_cast<unsigned>(test.num)), + UTF8ToUTF16(test.uexpected)); + } + for (const auto& test : int64_tests) { + EXPECT_EQ(NumberToString(test.num), test.sexpected); + EXPECT_EQ(NumberToString16(test.num), UTF8ToUTF16(test.sexpected)); + EXPECT_EQ(NumberToString(static_cast<uint64_t>(test.num)), test.uexpected); + 
EXPECT_EQ(NumberToString16(static_cast<uint64_t>(test.num)), + UTF8ToUTF16(test.uexpected)); + } +} + +TEST(StringNumberConversionsTest, Uint64ToString) { + static const struct { + uint64_t input; + std::string output; + } cases[] = { + {0, "0"}, + {42, "42"}, + {INT_MAX, "2147483647"}, + {std::numeric_limits<uint64_t>::max(), "18446744073709551615"}, + }; + + for (const auto& i : cases) + EXPECT_EQ(i.output, NumberToString(i.input)); +} + +TEST(StringNumberConversionsTest, SizeTToString) { + size_t size_t_max = std::numeric_limits<size_t>::max(); + std::string size_t_max_string = StringPrintf("%" PRIuS, size_t_max); + + static const struct { + size_t input; + std::string output; + } cases[] = { + {0, "0"}, + {9, "9"}, + {42, "42"}, + {INT_MAX, "2147483647"}, + {2147483648U, "2147483648"}, +#if SIZE_MAX > 4294967295U + {99999999999U, "99999999999"}, +#endif + {size_t_max, size_t_max_string}, + }; + + for (const auto& i : cases) + EXPECT_EQ(i.output, NumberToString(i.input)); +} + +TEST(StringNumberConversionsTest, StringToInt) { + static const struct { + std::string input; + int output; + bool success; + } cases[] = { + {"0", 0, true}, + {"42", 42, true}, + {"42\x99", 42, false}, + {"\x99" "42\x99", 0, false}, + {"-2147483648", INT_MIN, true}, + {"2147483647", INT_MAX, true}, + {"", 0, false}, + {" 42", 42, false}, + {"42 ", 42, false}, + {"\t\n\v\f\r 42", 42, false}, + {"blah42", 0, false}, + {"42blah", 42, false}, + {"blah42blah", 0, false}, + {"-273.15", -273, false}, + {"+98.6", 98, false}, + {"--123", 0, false}, + {"++123", 0, false}, + {"-+123", 0, false}, + {"+-123", 0, false}, + {"-", 0, false}, + {"-2147483649", INT_MIN, false}, + {"-99999999999", INT_MIN, false}, + {"2147483648", INT_MAX, false}, + {"99999999999", INT_MAX, false}, + }; + + for (const auto& i : cases) { + int output = i.output ^ 1; // Ensure StringToInt wrote something. 
+ EXPECT_EQ(i.success, StringToInt(i.input, &output)); + EXPECT_EQ(i.output, output); + + string16 utf16_input = UTF8ToUTF16(i.input); + output = i.output ^ 1; // Ensure StringToInt wrote something. + EXPECT_EQ(i.success, StringToInt(utf16_input, &output)); + EXPECT_EQ(i.output, output); + } + + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. + const char input[] = "6\06"; + std::string input_string(input, gurl_base::size(input) - 1); + int output; + EXPECT_FALSE(StringToInt(input_string, &output)); + EXPECT_EQ(6, output); + + string16 utf16_input = UTF8ToUTF16(input_string); + output = 0; + EXPECT_FALSE(StringToInt(utf16_input, &output)); + EXPECT_EQ(6, output); + + output = 0; + const char16 negative_wide_input[] = { 0xFF4D, '4', '2', 0}; + EXPECT_FALSE(StringToInt(string16(negative_wide_input), &output)); + EXPECT_EQ(0, output); +} + +TEST(StringNumberConversionsTest, StringToUint) { + static const struct { + std::string input; + unsigned output; + bool success; + } cases[] = { + {"0", 0, true}, + {"42", 42, true}, + {"42\x99", 42, false}, + {"\x99" "42\x99", 0, false}, + {"-2147483648", 0, false}, + {"2147483647", INT_MAX, true}, + {"", 0, false}, + {" 42", 42, false}, + {"42 ", 42, false}, + {"\t\n\v\f\r 42", 42, false}, + {"blah42", 0, false}, + {"42blah", 42, false}, + {"blah42blah", 0, false}, + {"-273.15", 0, false}, + {"+98.6", 98, false}, + {"--123", 0, false}, + {"++123", 0, false}, + {"-+123", 0, false}, + {"+-123", 0, false}, + {"-", 0, false}, + {"-2147483649", 0, false}, + {"-99999999999", 0, false}, + {"4294967295", UINT_MAX, true}, + {"4294967296", UINT_MAX, false}, + {"99999999999", UINT_MAX, false}, + }; + + for (const auto& i : cases) { + unsigned output = i.output ^ 1; // Ensure StringToUint wrote something. 
+ EXPECT_EQ(i.success, StringToUint(i.input, &output)); + EXPECT_EQ(i.output, output); + + string16 utf16_input = UTF8ToUTF16(i.input); + output = i.output ^ 1; // Ensure StringToUint wrote something. + EXPECT_EQ(i.success, StringToUint(utf16_input, &output)); + EXPECT_EQ(i.output, output); + } + + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. + const char input[] = "6\06"; + std::string input_string(input, gurl_base::size(input) - 1); + unsigned output; + EXPECT_FALSE(StringToUint(input_string, &output)); + EXPECT_EQ(6U, output); + + string16 utf16_input = UTF8ToUTF16(input_string); + output = 0; + EXPECT_FALSE(StringToUint(utf16_input, &output)); + EXPECT_EQ(6U, output); + + output = 0; + const char16 negative_wide_input[] = { 0xFF4D, '4', '2', 0}; + EXPECT_FALSE(StringToUint(string16(negative_wide_input), &output)); + EXPECT_EQ(0U, output); +} + +TEST(StringNumberConversionsTest, StringToInt64) { + static const struct { + std::string input; + int64_t output; + bool success; + } cases[] = { + {"0", 0, true}, + {"42", 42, true}, + {"-2147483648", INT_MIN, true}, + {"2147483647", INT_MAX, true}, + {"-2147483649", INT64_C(-2147483649), true}, + {"-99999999999", INT64_C(-99999999999), true}, + {"2147483648", INT64_C(2147483648), true}, + {"99999999999", INT64_C(99999999999), true}, + {"9223372036854775807", std::numeric_limits<int64_t>::max(), true}, + {"-9223372036854775808", std::numeric_limits<int64_t>::min(), true}, + {"09", 9, true}, + {"-09", -9, true}, + {"", 0, false}, + {" 42", 42, false}, + {"42 ", 42, false}, + {"0x42", 0, false}, + {"\t\n\v\f\r 42", 42, false}, + {"blah42", 0, false}, + {"42blah", 42, false}, + {"blah42blah", 0, false}, + {"-273.15", -273, false}, + {"+98.6", 98, false}, + {"--123", 0, false}, + {"++123", 0, false}, + {"-+123", 0, false}, + {"+-123", 0, false}, + {"-", 0, false}, + 
{"-9223372036854775809", std::numeric_limits<int64_t>::min(), false}, + {"-99999999999999999999", std::numeric_limits<int64_t>::min(), false}, + {"9223372036854775808", std::numeric_limits<int64_t>::max(), false}, + {"99999999999999999999", std::numeric_limits<int64_t>::max(), false}, + }; + + for (const auto& i : cases) { + int64_t output = 0; + EXPECT_EQ(i.success, StringToInt64(i.input, &output)); + EXPECT_EQ(i.output, output); + + string16 utf16_input = UTF8ToUTF16(i.input); + output = 0; + EXPECT_EQ(i.success, StringToInt64(utf16_input, &output)); + EXPECT_EQ(i.output, output); + } + + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. + const char input[] = "6\06"; + std::string input_string(input, gurl_base::size(input) - 1); + int64_t output; + EXPECT_FALSE(StringToInt64(input_string, &output)); + EXPECT_EQ(6, output); + + string16 utf16_input = UTF8ToUTF16(input_string); + output = 0; + EXPECT_FALSE(StringToInt64(utf16_input, &output)); + EXPECT_EQ(6, output); +} + +TEST(StringNumberConversionsTest, StringToUint64) { + static const struct { + std::string input; + uint64_t output; + bool success; + } cases[] = { + {"0", 0, true}, + {"42", 42, true}, + {"-2147483648", 0, false}, + {"2147483647", INT_MAX, true}, + {"-2147483649", 0, false}, + {"-99999999999", 0, false}, + {"2147483648", UINT64_C(2147483648), true}, + {"99999999999", UINT64_C(99999999999), true}, + {"9223372036854775807", std::numeric_limits<int64_t>::max(), true}, + {"-9223372036854775808", 0, false}, + {"09", 9, true}, + {"-09", 0, false}, + {"", 0, false}, + {" 42", 42, false}, + {"42 ", 42, false}, + {"0x42", 0, false}, + {"\t\n\v\f\r 42", 42, false}, + {"blah42", 0, false}, + {"42blah", 42, false}, + {"blah42blah", 0, false}, + {"-273.15", 0, false}, + {"+98.6", 98, false}, + {"--123", 0, false}, + {"++123", 0, false}, + {"-+123", 0, false}, + 
{"+-123", 0, false}, + {"-", 0, false}, + {"-9223372036854775809", 0, false}, + {"-99999999999999999999", 0, false}, + {"9223372036854775808", UINT64_C(9223372036854775808), true}, + {"99999999999999999999", std::numeric_limits<uint64_t>::max(), false}, + {"18446744073709551615", std::numeric_limits<uint64_t>::max(), true}, + {"18446744073709551616", std::numeric_limits<uint64_t>::max(), false}, + }; + + for (const auto& i : cases) { + uint64_t output = 0; + EXPECT_EQ(i.success, StringToUint64(i.input, &output)); + EXPECT_EQ(i.output, output); + + string16 utf16_input = UTF8ToUTF16(i.input); + output = 0; + EXPECT_EQ(i.success, StringToUint64(utf16_input, &output)); + EXPECT_EQ(i.output, output); + } + + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. + const char input[] = "6\06"; + std::string input_string(input, gurl_base::size(input) - 1); + uint64_t output; + EXPECT_FALSE(StringToUint64(input_string, &output)); + EXPECT_EQ(6U, output); + + string16 utf16_input = UTF8ToUTF16(input_string); + output = 0; + EXPECT_FALSE(StringToUint64(utf16_input, &output)); + EXPECT_EQ(6U, output); +} + +TEST(StringNumberConversionsTest, StringToSizeT) { + size_t size_t_max = std::numeric_limits<size_t>::max(); + std::string size_t_max_string = StringPrintf("%" PRIuS, size_t_max); + + static const struct { + std::string input; + size_t output; + bool success; + } cases[] = { + {"0", 0, true}, + {"42", 42, true}, + {"-2147483648", 0, false}, + {"2147483647", INT_MAX, true}, + {"-2147483649", 0, false}, + {"-99999999999", 0, false}, + {"2147483648", 2147483648U, true}, +#if SIZE_MAX > 4294967295U + {"99999999999", 99999999999U, true}, +#endif + {"-9223372036854775808", 0, false}, + {"09", 9, true}, + {"-09", 0, false}, + {"", 0, false}, + {" 42", 42, false}, + {"42 ", 42, false}, + {"0x42", 0, false}, + {"\t\n\v\f\r 42", 42, false}, + 
{"blah42", 0, false}, + {"42blah", 42, false}, + {"blah42blah", 0, false}, + {"-273.15", 0, false}, + {"+98.6", 98, false}, + {"--123", 0, false}, + {"++123", 0, false}, + {"-+123", 0, false}, + {"+-123", 0, false}, + {"-", 0, false}, + {"-9223372036854775809", 0, false}, + {"-99999999999999999999", 0, false}, + {"999999999999999999999999", size_t_max, false}, + {size_t_max_string, size_t_max, true}, + }; + + for (const auto& i : cases) { + size_t output = 0; + EXPECT_EQ(i.success, StringToSizeT(i.input, &output)); + EXPECT_EQ(i.output, output); + + string16 utf16_input = UTF8ToUTF16(i.input); + output = 0; + EXPECT_EQ(i.success, StringToSizeT(utf16_input, &output)); + EXPECT_EQ(i.output, output); + } + + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. + const char input[] = "6\06"; + std::string input_string(input, gurl_base::size(input) - 1); + size_t output; + EXPECT_FALSE(StringToSizeT(input_string, &output)); + EXPECT_EQ(6U, output); + + string16 utf16_input = UTF8ToUTF16(input_string); + output = 0; + EXPECT_FALSE(StringToSizeT(utf16_input, &output)); + EXPECT_EQ(6U, output); +} + +TEST(StringNumberConversionsTest, HexStringToInt) { + static const struct { + std::string input; + int64_t output; + bool success; + } cases[] = { + {"0", 0, true}, + {"42", 66, true}, + {"-42", -66, true}, + {"+42", 66, true}, + {"7fffffff", INT_MAX, true}, + {"-80000000", INT_MIN, true}, + {"80000000", INT_MAX, false}, // Overflow test. + {"-80000001", INT_MIN, false}, // Underflow test. + {"0x42", 66, true}, + {"-0x42", -66, true}, + {"+0x42", 66, true}, + {"0x7fffffff", INT_MAX, true}, + {"-0x80000000", INT_MIN, true}, + {"-80000000", INT_MIN, true}, + {"80000000", INT_MAX, false}, // Overflow test. + {"-80000001", INT_MIN, false}, // Underflow test. 
+ {"0x0f", 15, true}, + {"0f", 15, true}, + {" 45", 0x45, false}, + {"\t\n\v\f\r 0x45", 0x45, false}, + {" 45", 0x45, false}, + {"45 ", 0x45, false}, + {"45:", 0x45, false}, + {"efgh", 0xef, false}, + {"0xefgh", 0xef, false}, + {"hgfe", 0, false}, + {"-", 0, false}, + {"", 0, false}, + {"0x", 0, false}, + }; + + for (const auto& i : cases) { + int output = 0; + EXPECT_EQ(i.success, HexStringToInt(i.input, &output)); + EXPECT_EQ(i.output, output); + } + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. + const char input[] = "0xc0ffee\0" "9"; + std::string input_string(input, gurl_base::size(input) - 1); + int output; + EXPECT_FALSE(HexStringToInt(input_string, &output)); + EXPECT_EQ(0xc0ffee, output); +} + +TEST(StringNumberConversionsTest, HexStringToUInt) { + static const struct { + std::string input; + uint32_t output; + bool success; + } cases[] = { + {"0", 0, true}, + {"42", 0x42, true}, + {"-42", 0, false}, + {"+42", 0x42, true}, + {"7fffffff", INT_MAX, true}, + {"-80000000", 0, false}, + {"ffffffff", 0xffffffff, true}, + {"DeadBeef", 0xdeadbeef, true}, + {"0x42", 0x42, true}, + {"-0x42", 0, false}, + {"+0x42", 0x42, true}, + {"0x7fffffff", INT_MAX, true}, + {"-0x80000000", 0, false}, + {"0xffffffff", std::numeric_limits<uint32_t>::max(), true}, + {"0XDeadBeef", 0xdeadbeef, true}, + {"0x7fffffffffffffff", std::numeric_limits<uint32_t>::max(), + false}, // Overflow test. + {"-0x8000000000000000", 0, false}, + {"0x8000000000000000", std::numeric_limits<uint32_t>::max(), + false}, // Overflow test. + {"-0x8000000000000001", 0, false}, + {"0xFFFFFFFFFFFFFFFF", std::numeric_limits<uint32_t>::max(), + false}, // Overflow test. + {"FFFFFFFFFFFFFFFF", std::numeric_limits<uint32_t>::max(), + false}, // Overflow test. 
+ {"0x0000000000000000", 0, true}, + {"0000000000000000", 0, true}, + {"1FFFFFFFFFFFFFFFF", std::numeric_limits<uint32_t>::max(), + false}, // Overflow test. + {"0x0f", 0x0f, true}, + {"0f", 0x0f, true}, + {" 45", 0x45, false}, + {"\t\n\v\f\r 0x45", 0x45, false}, + {" 45", 0x45, false}, + {"45 ", 0x45, false}, + {"45:", 0x45, false}, + {"efgh", 0xef, false}, + {"0xefgh", 0xef, false}, + {"hgfe", 0, false}, + {"-", 0, false}, + {"", 0, false}, + {"0x", 0, false}, + }; + + for (const auto& i : cases) { + uint32_t output = 0; + EXPECT_EQ(i.success, HexStringToUInt(i.input, &output)); + EXPECT_EQ(i.output, output); + } + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. + const char input[] = "0xc0ffee\0" "9"; + std::string input_string(input, gurl_base::size(input) - 1); + uint32_t output; + EXPECT_FALSE(HexStringToUInt(input_string, &output)); + EXPECT_EQ(0xc0ffeeU, output); +} + +TEST(StringNumberConversionsTest, HexStringToInt64) { + static const struct { + std::string input; + int64_t output; + bool success; + } cases[] = { + {"0", 0, true}, + {"42", 66, true}, + {"-42", -66, true}, + {"+42", 66, true}, + {"40acd88557b", INT64_C(4444444448123), true}, + {"7fffffff", INT_MAX, true}, + {"-80000000", INT_MIN, true}, + {"ffffffff", 0xffffffff, true}, + {"DeadBeef", 0xdeadbeef, true}, + {"0x42", 66, true}, + {"-0x42", -66, true}, + {"+0x42", 66, true}, + {"0x40acd88557b", INT64_C(4444444448123), true}, + {"0x7fffffff", INT_MAX, true}, + {"-0x80000000", INT_MIN, true}, + {"0xffffffff", 0xffffffff, true}, + {"0XDeadBeef", 0xdeadbeef, true}, + {"0x7fffffffffffffff", std::numeric_limits<int64_t>::max(), true}, + {"-0x8000000000000000", std::numeric_limits<int64_t>::min(), true}, + {"0x8000000000000000", std::numeric_limits<int64_t>::max(), + false}, // Overflow test. 
+ {"-0x8000000000000001", std::numeric_limits<int64_t>::min(), + false}, // Underflow test. + {"0x0f", 15, true}, + {"0f", 15, true}, + {" 45", 0x45, false}, + {"\t\n\v\f\r 0x45", 0x45, false}, + {" 45", 0x45, false}, + {"45 ", 0x45, false}, + {"45:", 0x45, false}, + {"efgh", 0xef, false}, + {"0xefgh", 0xef, false}, + {"hgfe", 0, false}, + {"-", 0, false}, + {"", 0, false}, + {"0x", 0, false}, + }; + + for (const auto& i : cases) { + int64_t output = 0; + EXPECT_EQ(i.success, HexStringToInt64(i.input, &output)); + EXPECT_EQ(i.output, output); + } + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. + const char input[] = "0xc0ffee\0" "9"; + std::string input_string(input, gurl_base::size(input) - 1); + int64_t output; + EXPECT_FALSE(HexStringToInt64(input_string, &output)); + EXPECT_EQ(0xc0ffee, output); +} + +TEST(StringNumberConversionsTest, HexStringToUInt64) { + static const struct { + std::string input; + uint64_t output; + bool success; + } cases[] = { + {"0", 0, true}, + {"42", 66, true}, + {"-42", 0, false}, + {"+42", 66, true}, + {"40acd88557b", INT64_C(4444444448123), true}, + {"7fffffff", INT_MAX, true}, + {"-80000000", 0, false}, + {"ffffffff", 0xffffffff, true}, + {"DeadBeef", 0xdeadbeef, true}, + {"0x42", 66, true}, + {"-0x42", 0, false}, + {"+0x42", 66, true}, + {"0x40acd88557b", INT64_C(4444444448123), true}, + {"0x7fffffff", INT_MAX, true}, + {"-0x80000000", 0, false}, + {"0xffffffff", 0xffffffff, true}, + {"0XDeadBeef", 0xdeadbeef, true}, + {"0x7fffffffffffffff", std::numeric_limits<int64_t>::max(), true}, + {"-0x8000000000000000", 0, false}, + {"0x8000000000000000", UINT64_C(0x8000000000000000), true}, + {"-0x8000000000000001", 0, false}, + {"0xFFFFFFFFFFFFFFFF", std::numeric_limits<uint64_t>::max(), true}, + {"FFFFFFFFFFFFFFFF", std::numeric_limits<uint64_t>::max(), true}, + {"0x0000000000000000", 0, true}, 
+ {"0000000000000000", 0, true}, + {"1FFFFFFFFFFFFFFFF", std::numeric_limits<uint64_t>::max(), + false}, // Overflow test. + {"0x0f", 15, true}, + {"0f", 15, true}, + {" 45", 0x45, false}, + {"\t\n\v\f\r 0x45", 0x45, false}, + {" 45", 0x45, false}, + {"45 ", 0x45, false}, + {"45:", 0x45, false}, + {"efgh", 0xef, false}, + {"0xefgh", 0xef, false}, + {"hgfe", 0, false}, + {"-", 0, false}, + {"", 0, false}, + {"0x", 0, false}, + }; + + for (const auto& i : cases) { + uint64_t output = 0; + EXPECT_EQ(i.success, HexStringToUInt64(i.input, &output)); + EXPECT_EQ(i.output, output); + } + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. + const char input[] = "0xc0ffee\0" "9"; + std::string input_string(input, gurl_base::size(input) - 1); + uint64_t output; + EXPECT_FALSE(HexStringToUInt64(input_string, &output)); + EXPECT_EQ(0xc0ffeeU, output); +} + +TEST(StringNumberConversionsTest, HexStringToBytes) { + static const struct { + const std::string input; + const char* output; + size_t output_len; + bool success; + } cases[] = { + {"0", "", 0, false}, // odd number of characters fails + {"00", "\0", 1, true}, + {"42", "\x42", 1, true}, + {"-42", "", 0, false}, // any non-hex value fails + {"+42", "", 0, false}, + {"7fffffff", "\x7f\xff\xff\xff", 4, true}, + {"80000000", "\x80\0\0\0", 4, true}, + {"deadbeef", "\xde\xad\xbe\xef", 4, true}, + {"DeadBeef", "\xde\xad\xbe\xef", 4, true}, + {"0x42", "", 0, false}, // leading 0x fails (x is not hex) + {"0f", "\xf", 1, true}, + {"45 ", "\x45", 1, false}, + {"efgh", "\xef", 1, false}, + {"", "", 0, false}, + {"0123456789ABCDEF", "\x01\x23\x45\x67\x89\xAB\xCD\xEF", 8, true}, + {"0123456789ABCDEF012345", + "\x01\x23\x45\x67\x89\xAB\xCD\xEF\x01\x23\x45", 11, true}, + }; + + for (size_t i = 0; i < gurl_base::size(cases); ++i) { + std::vector<uint8_t> output; + std::vector<uint8_t> compare; + 
EXPECT_EQ(cases[i].success, HexStringToBytes(cases[i].input, &output)) << + i << ": " << cases[i].input; + for (size_t j = 0; j < cases[i].output_len; ++j) + compare.push_back(static_cast<uint8_t>(cases[i].output[j])); + ASSERT_EQ(output.size(), compare.size()) << i << ": " << cases[i].input; + EXPECT_TRUE(std::equal(output.begin(), output.end(), compare.begin())) << + i << ": " << cases[i].input; + } +} + +TEST(StringNumberConversionsTest, StringToDouble) { + static const struct { + std::string input; + double output; + bool success; + } cases[] = { + // Test different forms of zero. + {"0", 0.0, true}, + {"+0", 0.0, true}, + {"-0", 0.0, true}, + {"0.0", 0.0, true}, + {"000000000000000000000000000000.0", 0.0, true}, + {"0.000000000000000000000000000", 0.0, true}, + + // Test the answer. + {"42", 42.0, true}, + {"-42", -42.0, true}, + + // Test variances of an ordinary number. + {"123.45", 123.45, true}, + {"-123.45", -123.45, true}, + {"+123.45", 123.45, true}, + + // Test different forms of representation. + {"2.99792458e8", 299792458.0, true}, + {"149597870.691E+3", 149597870691.0, true}, + {"6.", 6.0, true}, + + // Test around the largest/smallest value that a double can represent. 
+ {"9e307", 9e307, true}, + {"1.7976e308", 1.7976e308, true}, + {"1.7977e308", HUGE_VAL, false}, + {"1.797693134862315807e+308", HUGE_VAL, true}, + {"1.797693134862315808e+308", HUGE_VAL, false}, + {"9e308", HUGE_VAL, false}, + {"9e309", HUGE_VAL, false}, + {"9e999", HUGE_VAL, false}, + {"9e1999", HUGE_VAL, false}, + {"9e19999", HUGE_VAL, false}, + {"9e99999999999999999999", HUGE_VAL, false}, + {"-9e307", -9e307, true}, + {"-1.7976e308", -1.7976e308, true}, + {"-1.7977e308", -HUGE_VAL, false}, + {"-1.797693134862315807e+308", -HUGE_VAL, true}, + {"-1.797693134862315808e+308", -HUGE_VAL, false}, + {"-9e308", -HUGE_VAL, false}, + {"-9e309", -HUGE_VAL, false}, + {"-9e999", -HUGE_VAL, false}, + {"-9e1999", -HUGE_VAL, false}, + {"-9e19999", -HUGE_VAL, false}, + {"-9e99999999999999999999", -HUGE_VAL, false}, + + // Test more exponents. + {"1e-2", 0.01, true}, + {"42 ", 42.0, false}, + {" 1e-2", 0.01, false}, + {"1e-2 ", 0.01, false}, + {"-1E-7", -0.0000001, true}, + {"01e02", 100, true}, + {"2.3e15", 2.3e15, true}, + {"100e-309", 100e-309, true}, + + // Test some invalid cases. + {"\t\n\v\f\r -123.45e2", -12345.0, false}, + {"+123 e4", 123.0, false}, + {"123e ", 123.0, false}, + {"123e", 123.0, false}, + {" 2.99", 2.99, false}, + {"1e3.4", 1000.0, false}, + {"nothing", 0.0, false}, + {"-", 0.0, false}, + {"+", 0.0, false}, + {"", 0.0, false}, + + // crbug.org/588726 + {"-0.0010000000000000000000000000000000000000001e-256", + -1.0000000000000001e-259, true}, + }; + + for (size_t i = 0; i < gurl_base::size(cases); ++i) { + double output; + errno = 1; + EXPECT_EQ(cases[i].success, StringToDouble(cases[i].input, &output)); + if (cases[i].success) + EXPECT_EQ(1, errno) << i; // confirm that errno is unchanged. + EXPECT_DOUBLE_EQ(cases[i].output, output); + } + + // One additional test to verify that conversion of numbers in strings with + // embedded NUL characters. The NUL and extra data after it should be + // interpreted as junk after the number. 
+ const char input[] = "3.14\0" "159"; + std::string input_string(input, gurl_base::size(input) - 1); + double output; + EXPECT_FALSE(StringToDouble(input_string, &output)); + EXPECT_DOUBLE_EQ(3.14, output); +} + +TEST(StringNumberConversionsTest, DoubleToString) { + static const struct { + double input; + const char* expected; + } cases[] = { + {0.0, "0"}, + {1.25, "1.25"}, + {1.33518e+012, "1.33518e+12"}, + {1.33489e+012, "1.33489e+12"}, + {1.33505e+012, "1.33505e+12"}, + {1.33545e+009, "1335450000"}, + {1.33503e+009, "1335030000"}, + }; + + for (const auto& i : cases) { + EXPECT_EQ(i.expected, NumberToString(i.input)); + EXPECT_EQ(i.expected, UTF16ToUTF8(NumberToString16(i.input))); + } + + // The following two values were seen in crashes in the wild. + const char input_bytes[8] = {0, 0, 0, 0, '\xee', '\x6d', '\x73', '\x42'}; + double input = 0; + memcpy(&input, input_bytes, gurl_base::size(input_bytes)); + EXPECT_EQ("1335179083776", NumberToString(input)); + const char input_bytes2[8] = + {0, 0, 0, '\xa0', '\xda', '\x6c', '\x73', '\x42'}; + input = 0; + memcpy(&input, input_bytes2, gurl_base::size(input_bytes2)); + EXPECT_EQ("1334890332160", NumberToString(input)); +} + +TEST(StringNumberConversionsTest, HexEncode) { + std::string hex(HexEncode(nullptr, 0)); + EXPECT_EQ(hex.length(), 0U); + unsigned char bytes[] = {0x01, 0xff, 0x02, 0xfe, 0x03, 0x80, 0x81}; + hex = HexEncode(bytes, sizeof(bytes)); + EXPECT_EQ(hex.compare("01FF02FE038081"), 0); +} + +// Test cases of known-bad strtod conversions that motivated the use of dmg_fp. +// See https://bugs.chromium.org/p/chromium/issues/detail?id=593512. 
+TEST(StringNumberConversionsTest, StrtodFailures) { + static const struct { + const char* input; + uint64_t expected; + } cases[] = { + // http://www.exploringbinary.com/incorrectly-rounded-conversions-in-visual-c-plus-plus/ + {"9214843084008499", 0x43405e6cec57761aULL}, + {"0.500000000000000166533453693773481063544750213623046875", + 0x3fe0000000000002ULL}, + {"30078505129381147446200", 0x44997a3c7271b021ULL}, + {"1777820000000000000001", 0x4458180d5bad2e3eULL}, + {"0.500000000000000166547006220929549868969843373633921146392822265625", + 0x3fe0000000000002ULL}, + {"0.50000000000000016656055874808561867439493653364479541778564453125", + 0x3fe0000000000002ULL}, + {"0.3932922657273", 0x3fd92bb352c4623aULL}, + + // http://www.exploringbinary.com/incorrectly-rounded-conversions-in-gcc-and-glibc/ + {"0.500000000000000166533453693773481063544750213623046875", + 0x3fe0000000000002ULL}, + {"3.518437208883201171875e13", 0x42c0000000000002ULL}, + {"62.5364939768271845828", 0x404f44abd5aa7ca4ULL}, + {"8.10109172351e-10", 0x3e0bd5cbaef0fd0cULL}, + {"1.50000000000000011102230246251565404236316680908203125", + 0x3ff8000000000000ULL}, + {"9007199254740991.4999999999999999999999999999999995", + 0x433fffffffffffffULL}, + + // http://www.exploringbinary.com/incorrect-decimal-to-floating-point-conversion-in-sqlite/ + {"1e-23", 0x3b282db34012b251ULL}, + {"8.533e+68", 0x4e3fa69165a8eea2ULL}, + {"4.1006e-184", 0x19dbe0d1c7ea60c9ULL}, + {"9.998e+307", 0x7fe1cc0a350ca87bULL}, + {"9.9538452227e-280", 0x0602117ae45cde43ULL}, + {"6.47660115e-260", 0x0a1fdd9e333badadULL}, + {"7.4e+47", 0x49e033d7eca0adefULL}, + {"5.92e+48", 0x4a1033d7eca0adefULL}, + {"7.35e+66", 0x4dd172b70eababa9ULL}, + {"8.32116e+55", 0x4b8b2628393e02cdULL}, + }; + + for (const auto& test : cases) { + double output; + EXPECT_TRUE(StringToDouble(test.input, &output)); + EXPECT_EQ(bit_cast<uint64_t>(output), test.expected); + } +} + +} // namespace base
diff --git a/base/strings/string_piece.cc b/base/strings/string_piece.cc new file mode 100644 index 0000000..68f3efc --- /dev/null +++ b/base/strings/string_piece.cc
@@ -0,0 +1,450 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// Copied from strings/stringpiece.cc with modifications + +#include "base/strings/string_piece.h" + +#include <limits.h> + +#include <algorithm> +#include <ostream> + +#include "polyfills/base/logging.h" +#include "base/strings/utf_string_conversions.h" + +namespace gurl_base { +namespace { + +// For each character in characters_wanted, sets the index corresponding +// to the ASCII code of that character to 1 in table. This is used by +// the find_.*_of methods below to tell whether or not a character is in +// the lookup table in constant time. +// The argument `table' must be an array that is large enough to hold all +// the possible values of an unsigned char. Thus it should be be declared +// as follows: +// bool table[UCHAR_MAX + 1] +inline void BuildLookupTable(const StringPiece& characters_wanted, + bool* table) { + const size_t length = characters_wanted.length(); + const char* const data = characters_wanted.data(); + for (size_t i = 0; i < length; ++i) { + table[static_cast<unsigned char>(data[i])] = true; + } +} + +} // namespace + +// MSVC doesn't like complex extern templates and DLLs. 
+#if !defined(COMPILER_MSVC)
+template class BasicStringPiece<std::string>;
+template class BasicStringPiece<string16>;
+#endif
+
+// Writes the raw characters of |piece| to |o|; embedded NULs are written too
+// because the length is passed explicitly.
+std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
+  o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
+  return o;
+}
+
+// Streams a 16-bit piece by converting it with UTF16ToUTF8() first.
+std::ostream& operator<<(std::ostream& o, const StringPiece16& piece) {
+  return o << UTF16ToUTF8(piece);
+}
+
+namespace internal {
+
+// Replaces the contents of |*target| with the characters of |self|.
+template<typename STR>
+void CopyToStringT(const BasicStringPiece<STR>& self, STR* target) {
+  // An empty piece may carry a null data() pointer, so avoid handing it to
+  // assign().
+  if (self.empty())
+    target->clear();
+  else
+    target->assign(self.data(), self.size());
+}
+
+void CopyToString(const StringPiece& self, std::string* target) {
+  CopyToStringT(self, target);
+}
+
+void CopyToString(const StringPiece16& self, string16* target) {
+  CopyToStringT(self, target);
+}
+
+// Appends the characters of |self| to |*target|; a no-op for an empty piece.
+template<typename STR>
+void AppendToStringT(const BasicStringPiece<STR>& self, STR* target) {
+  if (!self.empty())
+    target->append(self.data(), self.size());
+}
+
+void AppendToString(const StringPiece& self, std::string* target) {
+  AppendToStringT(self, target);
+}
+
+void AppendToString(const StringPiece16& self, string16* target) {
+  AppendToStringT(self, target);
+}
+
+// Copies at most |n| characters of |self|, starting at |pos|, into |buf| and
+// returns the number of characters copied.  No terminator is written.
+template<typename STR>
+size_t copyT(const BasicStringPiece<STR>& self,
+             typename STR::value_type* buf,
+             size_t n,
+             size_t pos) {
+  // Without this guard size() - pos wraps around for pos > size() and the
+  // memcpy() below would read far past the end of the piece.
+  if (pos >= self.size())
+    return 0;
+
+  const size_t ret = std::min(self.size() - pos, n);
+  // Skip the call entirely for a zero-length copy so that |buf| is never
+  // touched when nothing is requested.
+  if (ret > 0)
+    memcpy(buf, self.data() + pos, ret * sizeof(typename STR::value_type));
+  return ret;
+}
+
+size_t copy(const StringPiece& self, char* buf, size_t n, size_t pos) {
+  return copyT(self, buf, n, pos);
+}
+
+size_t copy(const StringPiece16& self, char16* buf, size_t n, size_t pos) {
+  return copyT(self, buf, n, pos);
+}
+
+// Returns the index of the first occurrence of |s| in |self| that starts at
+// or after |pos|, or npos when there is none.
+template<typename STR>
+size_t findT(const BasicStringPiece<STR>& self,
+             const BasicStringPiece<STR>& s,
+             size_t pos) {
+  if (pos > self.size())
+    return BasicStringPiece<STR>::npos;
+
+  typename BasicStringPiece<STR>::const_iterator result =
+      std::search(self.begin() + pos, self.end(), s.begin(), s.end());
+  const size_t xpos =
+    static_cast<size_t>(result - self.begin());
+  // std::search() returns end() on failure; in that case xpos == size() and
+  // the sum only stays within bounds when |s| is empty.
+  return xpos + s.size() <= self.size() ? xpos : BasicStringPiece<STR>::npos;
+}
+
+size_t find(const StringPiece& self, const StringPiece& s, size_t pos) {
+  return findT(self, s, pos);
+}
+
+size_t find(const StringPiece16& self, const StringPiece16& s, size_t pos) {
+  return findT(self, s, pos);
+}
+
+// Returns the index of the first occurrence of |c| at or after |pos|, or npos.
+template<typename STR>
+size_t findT(const BasicStringPiece<STR>& self,
+             typename STR::value_type c,
+             size_t pos) {
+  if (pos >= self.size())
+    return BasicStringPiece<STR>::npos;
+
+  typename BasicStringPiece<STR>::const_iterator result =
+      std::find(self.begin() + pos, self.end(), c);
+  return result != self.end() ?
+      static_cast<size_t>(result - self.begin()) : BasicStringPiece<STR>::npos;
+}
+
+size_t find(const StringPiece& self, char c, size_t pos) {
+  return findT(self, c, pos);
+}
+
+size_t find(const StringPiece16& self, char16 c, size_t pos) {
+  return findT(self, c, pos);
+}
+
+// Returns the index of the last occurrence of |s| that starts at or before
+// |pos|, or npos when there is none.
+template<typename STR>
+size_t rfindT(const BasicStringPiece<STR>& self,
+              const BasicStringPiece<STR>& s,
+              size_t pos) {
+  if (self.size() < s.size())
+    return BasicStringPiece<STR>::npos;
+
+  if (s.empty())
+    return std::min(self.size(), pos);
+
+  // |last| is one past the final character a match starting at the clamped
+  // position could cover.
+  typename BasicStringPiece<STR>::const_iterator last =
+      self.begin() + std::min(self.size() - s.size(), pos) + s.size();
+  typename BasicStringPiece<STR>::const_iterator result =
+      std::find_end(self.begin(), last, s.begin(), s.end());
+  return result != last ?
+      static_cast<size_t>(result - self.begin()) : BasicStringPiece<STR>::npos;
+}
+
+size_t rfind(const StringPiece& self, const StringPiece& s, size_t pos) {
+  return rfindT(self, s, pos);
+}
+
+size_t rfind(const StringPiece16& self, const StringPiece16& s, size_t pos) {
+  return rfindT(self, s, pos);
+}
+
+// Returns the index of the last occurrence of |c| at or before |pos|, or npos.
+template<typename STR>
+size_t rfindT(const BasicStringPiece<STR>& self,
+              typename STR::value_type c,
+              size_t pos) {
+  if (self.size() == 0)
+    return BasicStringPiece<STR>::npos;
+
+  // The index is unsigned, so the loop tests for zero before decrementing
+  // instead of relying on i >= 0.
+  for (size_t i = std::min(pos, self.size() - 1); ;
+       --i) {
+    if (self.data()[i] == c)
+      return i;
+    if (i == 0)
+      break;
+  }
+  return BasicStringPiece<STR>::npos;
+}
+
+size_t rfind(const StringPiece& self, char c, size_t pos) {
+  return rfindT(self, c, pos);
+}
+
+size_t rfind(const StringPiece16& self, char16 c, size_t pos) {
+  return rfindT(self, c, pos);
+}
+
+// 8-bit version using lookup table.
+size_t find_first_of(const StringPiece& self,
+                     const StringPiece& s,
+                     size_t pos) {
+  if (self.size() == 0 || s.size() == 0)
+    return StringPiece::npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.size() == 1)
+    return find(self, s.data()[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_t i = pos; i < self.size(); ++i) {
+    if (lookup[static_cast<unsigned char>(self.data()[i])]) {
+      return i;
+    }
+  }
+  return StringPiece::npos;
+}
+
+// 16-bit brute force version.
+size_t find_first_of(const StringPiece16& self,
+                     const StringPiece16& s,
+                     size_t pos) {
+  // Forming self.begin() + pos beyond end() is undefined behavior, so reject
+  // out-of-range positions up front (pos == size() cannot match either).
+  if (pos >= self.size())
+    return StringPiece16::npos;
+
+  StringPiece16::const_iterator found =
+      std::find_first_of(self.begin() + pos, self.end(), s.begin(), s.end());
+  if (found == self.end())
+    return StringPiece16::npos;
+  return static_cast<size_t>(found - self.begin());
+}
+
+// 8-bit version using lookup table.
+size_t find_first_not_of(const StringPiece& self,
+                         const StringPiece& s,
+                         size_t pos) {
+  if (self.size() == 0)
+    return StringPiece::npos;
+
+  // With an empty exclusion set every character qualifies, so the answer is
+  // |pos| itself when it is in range.  This matches the 16-bit overload below
+  // and std::string::find_first_not_of(); returning 0 would ignore |pos|.
+  if (s.size() == 0)
+    return pos < self.size() ? pos : StringPiece::npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.size() == 1)
+    return find_first_not_of(self, s.data()[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_t i = pos; i < self.size(); ++i) {
+    if (!lookup[static_cast<unsigned char>(self.data()[i])]) {
+      return i;
+    }
+  }
+  return StringPiece::npos;
+}
+
+// 16-bit brute-force version.
+BASE_EXPORT size_t find_first_not_of(const StringPiece16& self,
+                                     const StringPiece16& s,
+                                     size_t pos) {
+  if (self.size() == 0)
+    return StringPiece16::npos;
+
+  for (size_t self_i = pos; self_i < self.size(); ++self_i) {
+    bool found = false;
+    for (auto c : s) {
+      if (self[self_i] == c) {
+        found = true;
+        break;
+      }
+    }
+    if (!found)
+      return self_i;
+  }
+  return StringPiece16::npos;
+}
+
+// Returns the index of the first character at or after |pos| that differs
+// from |c|, or npos when every remaining character equals |c|.
+template<typename STR>
+size_t find_first_not_ofT(const BasicStringPiece<STR>& self,
+                          typename STR::value_type c,
+                          size_t pos) {
+  if (self.size() == 0)
+    return BasicStringPiece<STR>::npos;
+
+  for (; pos < self.size(); ++pos) {
+    if (self.data()[pos] != c) {
+      return pos;
+    }
+  }
+  return BasicStringPiece<STR>::npos;
+}
+
+size_t find_first_not_of(const StringPiece& self,
+                         char c,
+                         size_t pos) {
+  return find_first_not_ofT(self, c, pos);
+}
+
+size_t find_first_not_of(const StringPiece16& self,
+                         char16 c,
+                         size_t pos) {
+  return find_first_not_ofT(self, c, pos);
+}
+
+// 8-bit version using lookup table.
+size_t find_last_of(const StringPiece& self, const StringPiece& s, size_t pos) {
+  if (self.size() == 0 || s.size() == 0)
+    return StringPiece::npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.size() == 1)
+    return rfind(self, s.data()[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  // Walk backwards from the clamped start; the index is unsigned, so the loop
+  // exits explicitly once index 0 has been examined.
+  for (size_t i = std::min(pos, self.size() - 1); ; --i) {
+    if (lookup[static_cast<unsigned char>(self.data()[i])])
+      return i;
+    if (i == 0)
+      break;
+  }
+  return StringPiece::npos;
+}
+
+// 16-bit brute-force version.
+size_t find_last_of(const StringPiece16& self,
+                    const StringPiece16& s,
+                    size_t pos) {
+  if (self.size() == 0)
+    return StringPiece16::npos;
+
+  for (size_t self_i = std::min(pos, self.size() - 1); ;
+       --self_i) {
+    for (auto c : s) {
+      if (self.data()[self_i] == c)
+        return self_i;
+    }
+    if (self_i == 0)
+      break;
+  }
+  return StringPiece16::npos;
+}
+
+// 8-bit version using lookup table.
+size_t find_last_not_of(const StringPiece& self,
+                        const StringPiece& s,
+                        size_t pos) {
+  if (self.size() == 0)
+    return StringPiece::npos;
+
+  size_t i = std::min(pos, self.size() - 1);
+  // Nothing is excluded, so the clamped start position is already a match.
+  if (s.size() == 0)
+    return i;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.size() == 1)
+    return find_last_not_of(self, s.data()[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (; ; --i) {
+    if (!lookup[static_cast<unsigned char>(self.data()[i])])
+      return i;
+    if (i == 0)
+      break;
+  }
+  return StringPiece::npos;
+}
+
+// 16-bit brute-force version.
+size_t find_last_not_of(const StringPiece16& self, + const StringPiece16& s, + size_t pos) { + if (self.size() == 0) + return StringPiece::npos; + + for (size_t self_i = std::min(pos, self.size() - 1); ; --self_i) { + bool found = false; + for (auto c : s) { + if (self.data()[self_i] == c) { + found = true; + break; + } + } + if (!found) + return self_i; + if (self_i == 0) + break; + } + return StringPiece16::npos; +} + +template<typename STR> +size_t find_last_not_ofT(const BasicStringPiece<STR>& self, + typename STR::value_type c, + size_t pos) { + if (self.size() == 0) + return BasicStringPiece<STR>::npos; + + for (size_t i = std::min(pos, self.size() - 1); ; --i) { + if (self.data()[i] != c) + return i; + if (i == 0) + break; + } + return BasicStringPiece<STR>::npos; +} + +size_t find_last_not_of(const StringPiece& self, + char c, + size_t pos) { + return find_last_not_ofT(self, c, pos); +} + +size_t find_last_not_of(const StringPiece16& self, + char16 c, + size_t pos) { + return find_last_not_ofT(self, c, pos); +} + +template<typename STR> +BasicStringPiece<STR> substrT(const BasicStringPiece<STR>& self, + size_t pos, + size_t n) { + if (pos > self.size()) pos = self.size(); + if (n > self.size() - pos) n = self.size() - pos; + return BasicStringPiece<STR>(self.data() + pos, n); +} + +StringPiece substr(const StringPiece& self, + size_t pos, + size_t n) { + return substrT(self, pos, n); +} + +StringPiece16 substr(const StringPiece16& self, + size_t pos, + size_t n) { + return substrT(self, pos, n); +} + +#if GURL_DCHECK_IS_ON() +void AssertIteratorsInOrder(std::string::const_iterator begin, + std::string::const_iterator end) { + GURL_DCHECK(begin <= end) << "StringPiece iterators swapped or invalid."; +} +void AssertIteratorsInOrder(string16::const_iterator begin, + string16::const_iterator end) { + GURL_DCHECK(begin <= end) << "StringPiece iterators swapped or invalid."; +} +#endif + +} // namespace internal +} // namespace base
diff --git a/base/strings/string_piece.h b/base/strings/string_piece.h new file mode 100644 index 0000000..5359af6 --- /dev/null +++ b/base/strings/string_piece.h
@@ -0,0 +1,548 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// Copied from strings/stringpiece.h with modifications +// +// A string-like object that points to a sized piece of memory. +// +// You can use StringPiece as a function or method parameter. A StringPiece +// parameter can receive a double-quoted string literal argument, a "const +// char*" argument, a string argument, or a StringPiece argument with no data +// copying. Systematic use of StringPiece for arguments reduces data +// copies and strlen() calls. +// +// Prefer passing StringPieces by value: +// void MyFunction(StringPiece arg); +// If circumstances require, you may also pass by const reference: +// void MyFunction(const StringPiece& arg); // not preferred +// Both of these have the same lifetime semantics. Passing by value +// generates slightly smaller code. For more discussion, Googlers can see +// the thread go/stringpiecebyvalue on c-users. + +#ifndef BASE_STRINGS_STRING_PIECE_H_ +#define BASE_STRINGS_STRING_PIECE_H_ + +#include <stddef.h> + +#include <iosfwd> +#include <string> +#include <type_traits> + +#include "polyfills/base/base_export.h" +#include "polyfills/base/logging.h" +#include "base/strings/char_traits.h" +#include "base/strings/string16.h" +#include "base/strings/string_piece_forward.h" + +namespace gurl_base { + +// internal -------------------------------------------------------------------- + +// Many of the StringPiece functions use different implementations for the +// 8-bit and 16-bit versions, and we don't want lots of template expansions in +// this (very common) header that will slow down compilation. +// +// So here we define overloaded functions called by the StringPiece template. +// For those that share an implementation, the two versions will expand to a +// template internal to the .cc file. 
+namespace internal {
+
+// Out-of-line helpers behind the BasicStringPiece member functions.  Each
+// takes the piece as |self| and has one overload per supported piece type.
+
+// Replace the contents of |*target| with the characters of |self|.
+BASE_EXPORT void CopyToString(const StringPiece& self, std::string* target);
+BASE_EXPORT void CopyToString(const StringPiece16& self, string16* target);
+
+// Append the characters of |self| to |*target|.
+BASE_EXPORT void AppendToString(const StringPiece& self, std::string* target);
+BASE_EXPORT void AppendToString(const StringPiece16& self, string16* target);
+
+// Copy up to |n| characters, starting at |pos|, into |buf|; the return value
+// is the number of characters copied.
+BASE_EXPORT size_t copy(const StringPiece& self,
+                        char* buf,
+                        size_t n,
+                        size_t pos);
+BASE_EXPORT size_t copy(const StringPiece16& self,
+                        char16* buf,
+                        size_t n,
+                        size_t pos);
+
+// Forward search for a substring |s| or a character |c| starting at |pos|.
+// Returns the index of the match, or npos.
+BASE_EXPORT size_t find(const StringPiece& self,
+                        const StringPiece& s,
+                        size_t pos);
+BASE_EXPORT size_t find(const StringPiece16& self,
+                        const StringPiece16& s,
+                        size_t pos);
+BASE_EXPORT size_t find(const StringPiece& self,
+                        char c,
+                        size_t pos);
+BASE_EXPORT size_t find(const StringPiece16& self,
+                        char16 c,
+                        size_t pos);
+
+// Reverse search: the last match that starts at or before |pos|, or npos.
+BASE_EXPORT size_t rfind(const StringPiece& self,
+                         const StringPiece& s,
+                         size_t pos);
+BASE_EXPORT size_t rfind(const StringPiece16& self,
+                         const StringPiece16& s,
+                         size_t pos);
+BASE_EXPORT size_t rfind(const StringPiece& self,
+                         char c,
+                         size_t pos);
+BASE_EXPORT size_t rfind(const StringPiece16& self,
+                         char16 c,
+                         size_t pos);
+
+// Index of the first character at or after |pos| that appears in |s|.
+BASE_EXPORT size_t find_first_of(const StringPiece& self,
+                                 const StringPiece& s,
+                                 size_t pos);
+BASE_EXPORT size_t find_first_of(const StringPiece16& self,
+                                 const StringPiece16& s,
+                                 size_t pos);
+
+// Index of the first character at or after |pos| that is not in |s| (or is
+// not equal to |c|).
+BASE_EXPORT size_t find_first_not_of(const StringPiece& self,
+                                     const StringPiece& s,
+                                     size_t pos);
+BASE_EXPORT size_t find_first_not_of(const StringPiece16& self,
+                                     const StringPiece16& s,
+                                     size_t pos);
+BASE_EXPORT size_t find_first_not_of(const StringPiece& self,
+                                     char c,
+                                     size_t pos);
+BASE_EXPORT size_t find_first_not_of(const StringPiece16& self,
+                                     char16 c,
+                                     size_t pos);
+
+// Index of the last character at or before |pos| that appears in |s|.
+BASE_EXPORT size_t find_last_of(const StringPiece& self,
+                                const StringPiece& s,
+                                size_t pos);
+BASE_EXPORT size_t find_last_of(const StringPiece16& self,
+                                const StringPiece16& s,
+                                size_t pos);
+// NOTE(review): no definition of these two single-character overloads appears
+// in string_piece.cc as checked in here; BasicStringPiece::find_last_of(c)
+// forwards to rfind() instead.  Confirm before calling them directly.
+BASE_EXPORT size_t find_last_of(const StringPiece& self,
+                                char c,
+                                size_t pos);
+BASE_EXPORT size_t find_last_of(const StringPiece16& self,
+                                char16 c,
+                                size_t pos);
+
+// Index of the last character at or before |pos| that is not in |s| (or is
+// not equal to |c|).
+BASE_EXPORT size_t find_last_not_of(const StringPiece& self,
+                                    const StringPiece& s,
+                                    size_t pos);
+BASE_EXPORT size_t find_last_not_of(const StringPiece16& self,
+                                    const StringPiece16& s,
+                                    size_t pos);
+BASE_EXPORT size_t find_last_not_of(const StringPiece16& self,
+                                    char16 c,
+                                    size_t pos);
+BASE_EXPORT size_t find_last_not_of(const StringPiece& self,
+                                    char c,
+                                    size_t pos);
+
+// Sub-piece of at most |n| characters starting at |pos|.
+BASE_EXPORT StringPiece substr(const StringPiece& self,
+                               size_t pos,
+                               size_t n);
+BASE_EXPORT StringPiece16 substr(const StringPiece16& self,
+                                 size_t pos,
+                                 size_t n);
+
+#if GURL_DCHECK_IS_ON()
+// Asserts that begin <= end to catch some errors with iterator usage.
+BASE_EXPORT void AssertIteratorsInOrder(std::string::const_iterator begin,
+                                        std::string::const_iterator end);
+BASE_EXPORT void AssertIteratorsInOrder(string16::const_iterator begin,
+                                        string16::const_iterator end);
+#endif
+
+}  // namespace internal
+
+// BasicStringPiece ------------------------------------------------------------
+
+// Defines the types, methods, operators, and data members common to both
+// StringPiece and StringPiece16. Do not refer to this class directly, but
+// rather to BasicStringPiece, StringPiece, or StringPiece16.
+//
+// This is templatized by string class type rather than character type, so
+// BasicStringPiece<std::string> or BasicStringPiece<gurl_base::string16>.
+template <typename STRING_TYPE> class BasicStringPiece {
+ public:
+  // Standard STL container boilerplate.
+  typedef size_t size_type;
+  typedef typename STRING_TYPE::value_type value_type;
+  typedef const value_type* pointer;
+  typedef const value_type& reference;
+  typedef const value_type& const_reference;
+  typedef ptrdiff_t difference_type;
+  typedef const value_type* const_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+  // Returned by the search functions when nothing is found; defined below the
+  // class as size_type(-1).
+  static const size_type npos;
+
+ public:
+  // We provide non-explicit singleton constructors so users can pass
+  // in a "const char*" or a "string" wherever a "StringPiece" is
+  // expected (likewise for char16, string16, StringPiece16).
+  constexpr BasicStringPiece() : ptr_(NULL), length_(0) {}
+  // TODO(dcheng): Construction from nullptr is not allowed for
+  // std::basic_string_view, so remove the special handling for it.
+  // Note: This doesn't just use STRING_TYPE::traits_type::length(), since that
+  // isn't constexpr until C++17.
+  constexpr BasicStringPiece(const value_type* str)
+      : ptr_(str), length_(!str ? 0 : CharTraits<value_type>::length(str)) {}
+  BasicStringPiece(const STRING_TYPE& str)
+      : ptr_(str.data()), length_(str.size()) {}
+  constexpr BasicStringPiece(const value_type* offset, size_type len)
+      : ptr_(offset), length_(len) {}
+  BasicStringPiece(const typename STRING_TYPE::const_iterator& begin,
+                   const typename STRING_TYPE::const_iterator& end) {
+#if GURL_DCHECK_IS_ON()
+    // This assertion is done out-of-line to avoid bringing in logging.h and
+    // instantiating logging macros for every instantiation.
+    internal::AssertIteratorsInOrder(begin, end);
+#endif
+    length_ = static_cast<size_t>(std::distance(begin, end));
+
+    // The length test before assignment is to avoid dereferencing an iterator
+    // that may point to the end() of a string.
+    ptr_ = length_ > 0 ? &*begin : nullptr;
+  }
+
+  // data() may return a pointer to a buffer with embedded NULs, and the
+  // returned buffer may or may not be null terminated.  Therefore it is
+  // typically a mistake to pass data() to a routine that expects a NUL
+  // terminated string.
+  constexpr const value_type* data() const { return ptr_; }
+  constexpr size_type size() const noexcept { return length_; }
+  constexpr size_type length() const noexcept { return length_; }
+  bool empty() const { return length_ == 0; }
+
+  // Resets the piece to the empty state (null pointer, zero length).
+  void clear() {
+    ptr_ = NULL;
+    length_ = 0;
+  }
+  // Re-points the piece at |len| characters starting at |data|.
+  void set(const value_type* data, size_type len) {
+    ptr_ = data;
+    length_ = len;
+  }
+  // Re-points the piece at a NUL-terminated string; a null |str| gives an
+  // empty piece.
+  void set(const value_type* str) {
+    ptr_ = str;
+    length_ = str ? STRING_TYPE::traits_type::length(str) : 0;
+  }
+
+  // Bounds-checked element access; returns the character by value.
+  constexpr value_type operator[](size_type i) const {
+    GURL_CHECK(i < length_);
+    return ptr_[i];
+  }
+
+  // First and last character; both check that the piece is non-empty.
+  value_type front() const {
+    GURL_CHECK_NE(0UL, length_);
+    return ptr_[0];
+  }
+
+  value_type back() const {
+    GURL_CHECK_NE(0UL, length_);
+    return ptr_[length_ - 1];
+  }
+
+  // Drops the first |n| characters; |n| must not exceed the length.
+  constexpr void remove_prefix(size_type n) {
+    GURL_CHECK(n <= length_);
+    ptr_ += n;
+    length_ -= n;
+  }
+
+  // Drops the last |n| characters; |n| must not exceed the length.
+  constexpr void remove_suffix(size_type n) {
+    GURL_CHECK(n <= length_);
+    length_ -= n;
+  }
+
+  // Three-way comparison: compares the common prefix first and, when that is
+  // equal, orders the shorter piece before the longer one.
+  constexpr int compare(BasicStringPiece x) const noexcept {
+    int r = CharTraits<value_type>::compare(
+        ptr_, x.ptr_, (length_ < x.length_ ? length_ : x.length_));
+    if (r == 0) {
+      if (length_ < x.length_) r = -1;
+      else if (length_ > x.length_) r = +1;
+    }
+    return r;
+  }
+
+  // This is the style of conversion preferred by std::string_view in C++17.
+  explicit operator STRING_TYPE() const { return as_string(); }
+
+  STRING_TYPE as_string() const {
+    // std::string doesn't like to take a NULL pointer even with a 0 size.
+    return empty() ? STRING_TYPE() : STRING_TYPE(data(), size());
+  }
+
+  const_iterator begin() const { return ptr_; }
+  const_iterator end() const { return ptr_ + length_; }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(ptr_ + length_);
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(ptr_);
+  }
+
+  // Both simply report the current length.
+  size_type max_size() const { return length_; }
+  size_type capacity() const { return length_; }
+
+  // Sets the value of the given string target type to be the current string.
+  // This saves a temporary over doing |a = b.as_string()|
+  void CopyToString(STRING_TYPE* target) const {
+    internal::CopyToString(*this, target);
+  }
+
+  void AppendToString(STRING_TYPE* target) const {
+    internal::AppendToString(*this, target);
+  }
+
+  size_type copy(value_type* buf, size_type n, size_type pos = 0) const {
+    return internal::copy(*this, buf, n, pos);
+  }
+
+  // Does "this" start with "x"
+  constexpr bool starts_with(BasicStringPiece x) const noexcept {
+    return (
+        (this->length_ >= x.length_) &&
+        (CharTraits<value_type>::compare(this->ptr_, x.ptr_, x.length_) == 0));
+  }
+
+  // Does "this" end with "x"
+  constexpr bool ends_with(BasicStringPiece x) const noexcept {
+    return ((this->length_ >= x.length_) &&
+            (CharTraits<value_type>::compare(
+                 this->ptr_ + (this->length_ - x.length_), x.ptr_, x.length_) ==
+             0));
+  }
+
+  // find: Search for a character or substring at a given offset.
+  size_type find(const BasicStringPiece<STRING_TYPE>& s,
+                 size_type pos = 0) const {
+    return internal::find(*this, s, pos);
+  }
+  size_type find(value_type c, size_type pos = 0) const {
+    return internal::find(*this, c, pos);
+  }
+
+  // rfind: Reverse find.
+  size_type rfind(const BasicStringPiece& s,
+                  size_type pos = BasicStringPiece::npos) const {
+    return internal::rfind(*this, s, pos);
+  }
+  size_type rfind(value_type c, size_type pos = BasicStringPiece::npos) const {
+    return internal::rfind(*this, c, pos);
+  }
+
+  // find_first_of: Find the first occurrence of one of a set of characters.
+  size_type find_first_of(const BasicStringPiece& s,
+                          size_type pos = 0) const {
+    return internal::find_first_of(*this, s, pos);
+  }
+  // For a single character this is the same as find().
+  size_type find_first_of(value_type c, size_type pos = 0) const {
+    return find(c, pos);
+  }
+
+  // find_first_not_of: Find the first occurrence not of a set of characters.
+  size_type find_first_not_of(const BasicStringPiece& s,
+                              size_type pos = 0) const {
+    return internal::find_first_not_of(*this, s, pos);
+  }
+  size_type find_first_not_of(value_type c, size_type pos = 0) const {
+    return internal::find_first_not_of(*this, c, pos);
+  }
+
+  // find_last_of: Find the last occurrence of one of a set of characters.
+  size_type find_last_of(const BasicStringPiece& s,
+                         size_type pos = BasicStringPiece::npos) const {
+    return internal::find_last_of(*this, s, pos);
+  }
+  // For a single character this is the same as rfind().
+  size_type find_last_of(value_type c,
+                         size_type pos = BasicStringPiece::npos) const {
+    return rfind(c, pos);
+  }
+
+  // find_last_not_of: Find the last occurrence not of a set of characters.
+  size_type find_last_not_of(const BasicStringPiece& s,
+                             size_type pos = BasicStringPiece::npos) const {
+    return internal::find_last_not_of(*this, s, pos);
+  }
+  size_type find_last_not_of(value_type c,
+                             size_type pos = BasicStringPiece::npos) const {
+    return internal::find_last_not_of(*this, c, pos);
+  }
+
+  // substr.
+  BasicStringPiece substr(size_type pos,
+                          size_type n = BasicStringPiece::npos) const {
+    return internal::substr(*this, pos, n);
+  }
+
+ protected:
+  // Start of the referenced characters (may be null for an empty piece) and
+  // the number of characters referenced.
+  const value_type* ptr_;
+  size_type length_;
+};
+
+template <typename STRING_TYPE>
+const typename BasicStringPiece<STRING_TYPE>::size_type
+BasicStringPiece<STRING_TYPE>::npos =
+    typename BasicStringPiece<STRING_TYPE>::size_type(-1);
+
+// MSVC doesn't like complex extern templates and DLLs.
+#if !defined(COMPILER_MSVC)
+extern template class BASE_EXPORT BasicStringPiece<std::string>;
+extern template class BASE_EXPORT BasicStringPiece<string16>;
+#endif
+
+// Comparison operators --------------------------------------------------------
+// operator ==
+// The size test comes first so that pieces of different length are rejected
+// without comparing characters.
+template <typename StringT>
+constexpr bool operator==(BasicStringPiece<StringT> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return lhs.size() == rhs.size() && lhs.compare(rhs) == 0;
+}
+
+// Here and below we make use of std::common_type_t to emulate an identity type
+// transformation. This creates a non-deduced context, so that we can compare
+// StringPieces with types that implicitly convert to StringPieces. See
+// https://wg21.link/n3766 for details.
+// Furthermore, we require dummy template parameters for these overloads to work
+// around a name mangling issue on Windows.
+// (piece == convertible) and (convertible == piece) forms of operator==.
+template <typename StringT, int = 1>
+constexpr bool operator==(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return lhs.size() == rhs.size() && lhs.compare(rhs) == 0;
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator==(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return lhs.size() == rhs.size() && lhs.compare(rhs) == 0;
+}
+
+// operator !=
+// Defined as the negation of operator==.
+template <typename StringT>
+constexpr bool operator!=(BasicStringPiece<StringT> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return !(lhs == rhs);
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator!=(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return !(lhs == rhs);
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator!=(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return !(lhs == rhs);
+}
+
+// operator <
+// The only ordering operator that calls compare() directly; >, <= and >= below
+// are all written in terms of it.
+template <typename StringT>
+constexpr bool operator<(BasicStringPiece<StringT> lhs,
+                         BasicStringPiece<StringT> rhs) noexcept {
+  return lhs.compare(rhs) < 0;
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator<(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return lhs.compare(rhs) < 0;
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator<(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                         BasicStringPiece<StringT> rhs) noexcept {
+  return lhs.compare(rhs) < 0;
+}
+
+// operator >
+// a > b is evaluated as b < a.
+template <typename StringT>
+constexpr bool operator>(BasicStringPiece<StringT> lhs,
+                         BasicStringPiece<StringT> rhs) noexcept {
+  return rhs < lhs;
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator>(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return rhs < lhs;
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator>(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                         BasicStringPiece<StringT> rhs) noexcept {
+  return rhs < lhs;
+}
+
+// operator <=
+// a <= b is evaluated as !(b < a).
+template <typename StringT>
+constexpr bool operator<=(BasicStringPiece<StringT> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return !(rhs < lhs);
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator<=(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return !(rhs < lhs);
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator<=(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return !(rhs < lhs);
+}
+
+// operator >=
+// a >= b is evaluated as !(a < b).
+template <typename StringT>
+constexpr bool operator>=(BasicStringPiece<StringT> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return !(lhs < rhs);
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator>=(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return !(lhs < rhs);
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator>=(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return !(lhs < rhs);
+}
+
+// Stream insertion for both piece types; defined in string_piece.cc.
+BASE_EXPORT std::ostream& operator<<(std::ostream& o,
+                                     const StringPiece& piece);
+
+BASE_EXPORT std::ostream& operator<<(std::ostream& o,
+                                     const StringPiece16& piece);
+
+// Hashing ---------------------------------------------------------------------
+
+// We provide appropriate hash functions so StringPiece and StringPiece16 can
+// be used as keys in hash sets and maps.
+
+// This hash function is copied from base/strings/string16.h. We don't use the
+// ones already defined for string and string16 directly because it would
+// require the string constructors to be called, which we don't want.
+ +template <typename StringPieceType> +struct StringPieceHashImpl { + std::size_t operator()(StringPieceType sp) const { + std::size_t result = 0; + for (auto c : sp) + result = (result * 131) + c; + return result; + } +}; + +using StringPieceHash = StringPieceHashImpl<StringPiece>; +using StringPiece16Hash = StringPieceHashImpl<StringPiece16>; +using WStringPieceHash = StringPieceHashImpl<WStringPiece>; + +} // namespace base + +#endif // BASE_STRINGS_STRING_PIECE_H_
diff --git a/base/strings/string_piece_forward.h b/base/strings/string_piece_forward.h new file mode 100644 index 0000000..aa79117 --- /dev/null +++ b/base/strings/string_piece_forward.h
@@ -0,0 +1,24 @@ +// Copyright 2017 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Forward declaration of StringPiece types from base/strings/string_piece.h + +#ifndef BASE_STRINGS_STRING_PIECE_FORWARD_H_ +#define BASE_STRINGS_STRING_PIECE_FORWARD_H_ + +#include <string> + +#include "base/strings/string16.h" + +namespace gurl_base { + +template <typename STRING_TYPE> +class BasicStringPiece; +typedef BasicStringPiece<std::string> StringPiece; +typedef BasicStringPiece<string16> StringPiece16; +typedef BasicStringPiece<std::wstring> WStringPiece; + +} // namespace base + +#endif // BASE_STRINGS_STRING_PIECE_FORWARD_H_
diff --git a/base/strings/string_piece_unittest.cc b/base/strings/string_piece_unittest.cc new file mode 100644 index 0000000..8e245e6 --- /dev/null +++ b/base/strings/string_piece_unittest.cc
@@ -0,0 +1,838 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+
+#include <string>
+
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+// Fixture for tests that run once per supported string type. as_string()
+// turns test input (a C string, or a string of type T) into the string type
+// under test.
+template <typename T>
+class CommonStringPieceTest : public ::testing::Test {
+ public:
+  static const T as_string(const char* input) {
+    return T(input);
+  }
+  static const T& as_string(const T& input) {
+    return input;
+  }
+};
+
+// string16 flavor of the fixture: narrow input is converted with
+// ASCIIToUTF16().
+template <>
+class CommonStringPieceTest<string16> : public ::testing::Test {
+ public:
+  static const string16 as_string(const char* input) {
+    return ASCIIToUTF16(input);
+  }
+  static const string16 as_string(const std::string& input) {
+    return ASCIIToUTF16(input);
+  }
+};
+
+// The string types every TYPED_TEST below is instantiated with.
+typedef ::testing::Types<std::string, string16> SupportedStringTypes;
+
+TYPED_TEST_SUITE(CommonStringPieceTest, SupportedStringTypes);
+
+// Checks ==, !=, <, <=, >= and > on short strings, then == on strings of up to
+// 256 characters that differ in a single position.
+TYPED_TEST(CommonStringPieceTest, CheckComparisonOperators) {
+// CMP_Y(op, x, y) asserts that "x op y" holds in four forms: piece op piece,
+// piece op string, string op piece, and compare() op 0. CMP_N asserts that
+// the same four forms are false.
+#define CMP_Y(op, x, y) \
+  { \
+    TypeParam lhs(TestFixture::as_string(x)); \
+    TypeParam rhs(TestFixture::as_string(y)); \
+    ASSERT_TRUE((BasicStringPiece<TypeParam>((lhs.c_str())) \
+                     op BasicStringPiece<TypeParam>((rhs.c_str())))); \
+    ASSERT_TRUE(BasicStringPiece<TypeParam>(lhs) op rhs); \
+    ASSERT_TRUE(lhs op BasicStringPiece<TypeParam>(rhs)); \
+    ASSERT_TRUE((BasicStringPiece<TypeParam>((lhs.c_str())) \
+                     .compare(BasicStringPiece<TypeParam>((rhs.c_str()))) \
+                         op 0)); \
+  }
+
+#define CMP_N(op, x, y) \
+  { \
+    TypeParam lhs(TestFixture::as_string(x)); \
+    TypeParam rhs(TestFixture::as_string(y)); \
+    ASSERT_FALSE((BasicStringPiece<TypeParam>((lhs.c_str())) \
+                      op BasicStringPiece<TypeParam>((rhs.c_str())))); \
+    ASSERT_FALSE(BasicStringPiece<TypeParam>(lhs) op rhs); \
+    ASSERT_FALSE(lhs op BasicStringPiece<TypeParam>(rhs)); \
+    ASSERT_FALSE((BasicStringPiece<TypeParam>((lhs.c_str())) \
+                      .compare(BasicStringPiece<TypeParam>((rhs.c_str()))) \
+                          op 0)); \
+  }
+
+  CMP_Y(==, "", "")
+  CMP_Y(==, "a", "a")
+  CMP_Y(==, "aa", "aa")
+  CMP_N(==, "a", "")
+  CMP_N(==, "", "a")
+  CMP_N(==, "a", "b")
+  CMP_N(==, "a", "aa")
+  CMP_N(==, "aa", "a")
+
+  CMP_N(!=, "", "")
+  CMP_N(!=, "a", "a")
+  CMP_N(!=, "aa", "aa")
+  CMP_Y(!=, "a", "")
+  CMP_Y(!=, "", "a")
+  CMP_Y(!=, "a", "b")
+  CMP_Y(!=, "a", "aa")
+  CMP_Y(!=, "aa", "a")
+
+  CMP_Y(<, "a", "b")
+  CMP_Y(<, "a", "aa")
+  CMP_Y(<, "aa", "b")
+  CMP_Y(<, "aa", "bb")
+  CMP_N(<, "a", "a")
+  CMP_N(<, "b", "a")
+  CMP_N(<, "aa", "a")
+  CMP_N(<, "b", "aa")
+  CMP_N(<, "bb", "aa")
+
+  CMP_Y(<=, "a", "a")
+  CMP_Y(<=, "a", "b")
+  CMP_Y(<=, "a", "aa")
+  CMP_Y(<=, "aa", "b")
+  CMP_Y(<=, "aa", "bb")
+  CMP_N(<=, "b", "a")
+  CMP_N(<=, "aa", "a")
+  CMP_N(<=, "b", "aa")
+  CMP_N(<=, "bb", "aa")
+
+  CMP_N(>=, "a", "b")
+  CMP_N(>=, "a", "aa")
+  CMP_N(>=, "aa", "b")
+  CMP_N(>=, "aa", "bb")
+  CMP_Y(>=, "a", "a")
+  CMP_Y(>=, "b", "a")
+  CMP_Y(>=, "aa", "a")
+  CMP_Y(>=, "b", "aa")
+  CMP_Y(>=, "bb", "aa")
+
+  CMP_N(>, "a", "a")
+  CMP_N(>, "a", "b")
+  CMP_N(>, "a", "aa")
+  CMP_N(>, "aa", "b")
+  CMP_N(>, "aa", "bb")
+  CMP_Y(>, "b", "a")
+  CMP_Y(>, "aa", "a")
+  CMP_Y(>, "b", "aa")
+  CMP_Y(>, "bb", "aa")
+
+  // Grow x one 'a' at a time; at each length it must equal its copy and differ
+  // from every variant that has a single character replaced.
+  std::string x;
+  for (int i = 0; i < 256; i++) {
+    x += 'a';
+    std::string y = x;
+    CMP_Y(==, x, y);
+    for (int j = 0; j < i; j++) {
+      std::string z = x;
+      z[j] = 'b';  // Differs in position 'j'
+      CMP_N(==, x, z);
+    }
+  }
+
+#undef CMP_Y
+#undef CMP_N
+}
+
+// Exercises the container-style accessors: operator[], data(), forward and
+// reverse iterators, size(), empty(), clear(), max_size() and capacity().
+TYPED_TEST(CommonStringPieceTest, CheckSTL) {
+  TypeParam alphabet(TestFixture::as_string("abcdefghijklmnopqrstuvwxyz"));
+  TypeParam abc(TestFixture::as_string("abc"));
+  TypeParam xyz(TestFixture::as_string("xyz"));
+  TypeParam foobar(TestFixture::as_string("foobar"));
+
+  BasicStringPiece<TypeParam> a(alphabet);
+  BasicStringPiece<TypeParam> b(abc);
+  BasicStringPiece<TypeParam> c(xyz);
+  BasicStringPiece<TypeParam> d(foobar);
+  BasicStringPiece<TypeParam> e;
+  // f views "123", an embedded zero character, then "456" (7 characters).
+  TypeParam temp(TestFixture::as_string("123"));
+  temp += static_cast<typename TypeParam::value_type>(0);
+  temp += TestFixture::as_string("456");
+  BasicStringPiece<TypeParam> f(temp);
+
+  ASSERT_EQ(a[6], static_cast<typename TypeParam::value_type>('g'));
+  ASSERT_EQ(b[0], static_cast<typename TypeParam::value_type>('a'));
+  ASSERT_EQ(c[2], static_cast<typename TypeParam::value_type>('z'));
+  ASSERT_EQ(f[3], static_cast<typename TypeParam::value_type>('\0'));
+  ASSERT_EQ(f[5], static_cast<typename TypeParam::value_type>('5'));
+
+  ASSERT_EQ(*d.data(), static_cast<typename TypeParam::value_type>('f'));
+  ASSERT_EQ(d.data()[5], static_cast<typename TypeParam::value_type>('r'));
+  ASSERT_EQ(e.data(), nullptr);
+
+  ASSERT_EQ(*a.begin(), static_cast<typename TypeParam::value_type>('a'));
+  ASSERT_EQ(*(b.begin() + 2), static_cast<typename TypeParam::value_type>('c'));
+  ASSERT_EQ(*(c.end() - 1), static_cast<typename TypeParam::value_type>('z'));
+
+  ASSERT_EQ(*a.rbegin(), static_cast<typename TypeParam::value_type>('z'));
+  ASSERT_EQ(*(b.rbegin() + 2),
+            static_cast<typename TypeParam::value_type>('a'));
+  ASSERT_EQ(*(c.rend() - 1), static_cast<typename TypeParam::value_type>('x'));
+  ASSERT_EQ(a.rbegin() + 26, a.rend());
+
+  ASSERT_EQ(a.size(), 26U);
+  ASSERT_EQ(b.size(), 3U);
+  ASSERT_EQ(c.size(), 3U);
+  ASSERT_EQ(d.size(), 6U);
+  ASSERT_EQ(e.size(), 0U);
+  ASSERT_EQ(f.size(), 7U);
+
+  ASSERT_TRUE(!d.empty());
+  ASSERT_TRUE(d.begin() != d.end());
+  ASSERT_EQ(d.begin() + 6, d.end());
+
+  ASSERT_TRUE(e.empty());
+  ASSERT_EQ(e.begin(), e.end());
+
+  // After clear() the piece is expected to look default-constructed.
+  d.clear();
+  ASSERT_EQ(d.size(), 0U);
+  ASSERT_TRUE(d.empty());
+  ASSERT_EQ(d.data(), nullptr);
+  ASSERT_EQ(d.begin(), d.end());
+
+  ASSERT_GE(a.max_size(), a.capacity());
+  ASSERT_GE(a.capacity(), a.size());
+}
+
+// Exercises copy(), find(), rfind(), find_first_of(), find_first_not_of(),
+// find_last_of(), find_last_not_of() and substr(), including searches on and
+// for empty pieces and pieces with an embedded '\0'.
+TYPED_TEST(CommonStringPieceTest, CheckFind) {
+  typedef BasicStringPiece<TypeParam> Piece;
+
+  TypeParam alphabet(TestFixture::as_string("abcdefghijklmnopqrstuvwxyz"));
+  TypeParam abc(TestFixture::as_string("abc"));
+  TypeParam xyz(TestFixture::as_string("xyz"));
+  TypeParam foobar(TestFixture::as_string("foobar"));
+
+  BasicStringPiece<TypeParam> a(alphabet);
+  BasicStringPiece<TypeParam> b(abc);
+  BasicStringPiece<TypeParam> c(xyz);
+  BasicStringPiece<TypeParam> d(foobar);
+
+  // d (cleared) and e (default-constructed) are the two empty pieces used in
+  // the "empty string nonsense" sections below.
+  d.clear();
+  Piece e;
+  TypeParam temp(TestFixture::as_string("123"));
+  temp.push_back('\0');
+  temp += TestFixture::as_string("456");
+  Piece f(temp);
+
+  // copy(): buf[3] is only written by the first call, so it keeps a[3]
+  // through the later, shorter copies.
+  typename TypeParam::value_type buf[4] = { '%', '%', '%', '%' };
+  ASSERT_EQ(a.copy(buf, 4), 4U);
+  ASSERT_EQ(buf[0], a[0]);
+  ASSERT_EQ(buf[1], a[1]);
+  ASSERT_EQ(buf[2], a[2]);
+  ASSERT_EQ(buf[3], a[3]);
+  ASSERT_EQ(a.copy(buf, 3, 7), 3U);
+  ASSERT_EQ(buf[0], a[7]);
+  ASSERT_EQ(buf[1], a[8]);
+  ASSERT_EQ(buf[2], a[9]);
+  ASSERT_EQ(buf[3], a[3]);
+  ASSERT_EQ(c.copy(buf, 99), 3U);
+  ASSERT_EQ(buf[0], c[0]);
+  ASSERT_EQ(buf[1], c[1]);
+  ASSERT_EQ(buf[2], c[2]);
+  ASSERT_EQ(buf[3], a[3]);
+
+  ASSERT_EQ(Piece::npos, TypeParam::npos);
+
+  ASSERT_EQ(a.find(b), 0U);
+  ASSERT_EQ(a.find(b, 1), Piece::npos);
+  ASSERT_EQ(a.find(c), 23U);
+  ASSERT_EQ(a.find(c, 9), 23U);
+  ASSERT_EQ(a.find(c, Piece::npos), Piece::npos);
+  ASSERT_EQ(b.find(c), Piece::npos);
+  ASSERT_EQ(b.find(c, Piece::npos), Piece::npos);
+  ASSERT_EQ(a.find(d), 0U);
+  ASSERT_EQ(a.find(e), 0U);
+  ASSERT_EQ(a.find(d, 12), 12U);
+  ASSERT_EQ(a.find(e, 17), 17U);
+  TypeParam not_found(TestFixture::as_string("xx not found bb"));
+  Piece g(not_found);
+  ASSERT_EQ(a.find(g), Piece::npos);
+  // empty string nonsense
+  ASSERT_EQ(d.find(b), Piece::npos);
+  ASSERT_EQ(e.find(b), Piece::npos);
+  ASSERT_EQ(d.find(b, 4), Piece::npos);
+  ASSERT_EQ(e.find(b, 7), Piece::npos);
+
+  // Searching an empty piece for an empty piece must agree with what the
+  // underlying string type reports for the same search.
+  size_t empty_search_pos = TypeParam().find(TypeParam());
+  ASSERT_EQ(d.find(d), empty_search_pos);
+  ASSERT_EQ(d.find(e), empty_search_pos);
+  ASSERT_EQ(e.find(d), empty_search_pos);
+  ASSERT_EQ(e.find(e), empty_search_pos);
+  ASSERT_EQ(d.find(d, 4), std::string().find(std::string(), 4));
+  ASSERT_EQ(d.find(e, 4), std::string().find(std::string(), 4));
+  ASSERT_EQ(e.find(d, 4), std::string().find(std::string(), 4));
+  ASSERT_EQ(e.find(e, 4), std::string().find(std::string(), 4));
+
+  ASSERT_EQ(a.find('a'), 0U);
+  ASSERT_EQ(a.find('c'), 2U);
+  ASSERT_EQ(a.find('z'), 25U);
+  ASSERT_EQ(a.find('$'), Piece::npos);
+  ASSERT_EQ(a.find('\0'), Piece::npos);
+  ASSERT_EQ(f.find('\0'), 3U);
+  ASSERT_EQ(f.find('3'), 2U);
+  ASSERT_EQ(f.find('5'), 5U);
+  ASSERT_EQ(g.find('o'), 4U);
+  ASSERT_EQ(g.find('o', 4), 4U);
+  ASSERT_EQ(g.find('o', 5), 8U);
+  ASSERT_EQ(a.find('b', 5), Piece::npos);
+  // empty string nonsense
+  ASSERT_EQ(d.find('\0'), Piece::npos);
+  ASSERT_EQ(e.find('\0'), Piece::npos);
+  ASSERT_EQ(d.find('\0', 4), Piece::npos);
+  ASSERT_EQ(e.find('\0', 7), Piece::npos);
+  ASSERT_EQ(d.find('x'), Piece::npos);
+  ASSERT_EQ(e.find('x'), Piece::npos);
+  ASSERT_EQ(d.find('x', 4), Piece::npos);
+  ASSERT_EQ(e.find('x', 7), Piece::npos);
+
+  ASSERT_EQ(a.rfind(b), 0U);
+  ASSERT_EQ(a.rfind(b, 1), 0U);
+  ASSERT_EQ(a.rfind(c), 23U);
+  ASSERT_EQ(a.rfind(c, 22U), Piece::npos);
+  ASSERT_EQ(a.rfind(c, 1U), Piece::npos);
+  ASSERT_EQ(a.rfind(c, 0U), Piece::npos);
+  ASSERT_EQ(b.rfind(c), Piece::npos);
+  ASSERT_EQ(b.rfind(c, 0U), Piece::npos);
+  ASSERT_EQ(a.rfind(d), static_cast<size_t>(a.as_string().rfind(TypeParam())));
+  ASSERT_EQ(a.rfind(e), a.as_string().rfind(TypeParam()));
+  ASSERT_EQ(a.rfind(d), static_cast<size_t>(TypeParam(a).rfind(TypeParam())));
+  ASSERT_EQ(a.rfind(e), TypeParam(a).rfind(TypeParam()));
+  ASSERT_EQ(a.rfind(d, 12), 12U);
+  ASSERT_EQ(a.rfind(e, 17), 17U);
+  ASSERT_EQ(a.rfind(g), Piece::npos);
+  ASSERT_EQ(d.rfind(b), Piece::npos);
+  ASSERT_EQ(e.rfind(b), Piece::npos);
+  ASSERT_EQ(d.rfind(b, 4), Piece::npos);
+  ASSERT_EQ(e.rfind(b, 7), Piece::npos);
+  // empty string nonsense
+  ASSERT_EQ(d.rfind(d, 4), std::string().rfind(std::string()));
+  ASSERT_EQ(e.rfind(d, 7), std::string().rfind(std::string()));
+  ASSERT_EQ(d.rfind(e, 4), std::string().rfind(std::string()));
+  ASSERT_EQ(e.rfind(e, 7), std::string().rfind(std::string()));
+  ASSERT_EQ(d.rfind(d), std::string().rfind(std::string()));
+  ASSERT_EQ(e.rfind(d), std::string().rfind(std::string()));
+  ASSERT_EQ(d.rfind(e), std::string().rfind(std::string()));
+  ASSERT_EQ(e.rfind(e), std::string().rfind(std::string()));
+
+  ASSERT_EQ(g.rfind('o'), 8U);
+  ASSERT_EQ(g.rfind('q'), Piece::npos);
+  ASSERT_EQ(g.rfind('o', 8), 8U);
+  ASSERT_EQ(g.rfind('o', 7), 4U);
+  ASSERT_EQ(g.rfind('o', 3), Piece::npos);
+  ASSERT_EQ(f.rfind('\0'), 3U);
+  ASSERT_EQ(f.rfind('\0', 12), 3U);
+  ASSERT_EQ(f.rfind('3'), 2U);
+  ASSERT_EQ(f.rfind('5'), 5U);
+  // empty string nonsense
+  ASSERT_EQ(d.rfind('o'), Piece::npos);
+  ASSERT_EQ(e.rfind('o'), Piece::npos);
+  ASSERT_EQ(d.rfind('o', 4), Piece::npos);
+  ASSERT_EQ(e.rfind('o', 7), Piece::npos);
+
+  TypeParam one_two_three_four(TestFixture::as_string("one,two:three;four"));
+  TypeParam comma_colon(TestFixture::as_string(",:"));
+  ASSERT_EQ(3U, Piece(one_two_three_four).find_first_of(comma_colon));
+  ASSERT_EQ(a.find_first_of(b), 0U);
+  ASSERT_EQ(a.find_first_of(b, 0), 0U);
+  ASSERT_EQ(a.find_first_of(b, 1), 1U);
+  ASSERT_EQ(a.find_first_of(b, 2), 2U);
+  ASSERT_EQ(a.find_first_of(b, 3), Piece::npos);
+  ASSERT_EQ(a.find_first_of(c), 23U);
+  ASSERT_EQ(a.find_first_of(c, 23), 23U);
+  ASSERT_EQ(a.find_first_of(c, 24), 24U);
+  ASSERT_EQ(a.find_first_of(c, 25), 25U);
+  ASSERT_EQ(a.find_first_of(c, 26), Piece::npos);
+  ASSERT_EQ(g.find_first_of(b), 13U);
+  ASSERT_EQ(g.find_first_of(c), 0U);
+  ASSERT_EQ(a.find_first_of(f), Piece::npos);
+  ASSERT_EQ(f.find_first_of(a), Piece::npos);
+  // empty string nonsense
+  ASSERT_EQ(a.find_first_of(d), Piece::npos);
+  ASSERT_EQ(a.find_first_of(e), Piece::npos);
+  ASSERT_EQ(d.find_first_of(b), Piece::npos);
+  ASSERT_EQ(e.find_first_of(b), Piece::npos);
+  ASSERT_EQ(d.find_first_of(d), Piece::npos);
+  ASSERT_EQ(e.find_first_of(d), Piece::npos);
+  ASSERT_EQ(d.find_first_of(e), Piece::npos);
+  ASSERT_EQ(e.find_first_of(e), Piece::npos);
+
+  ASSERT_EQ(a.find_first_not_of(b), 3U);
+  ASSERT_EQ(a.find_first_not_of(c), 0U);
+  ASSERT_EQ(b.find_first_not_of(a), Piece::npos);
+  ASSERT_EQ(c.find_first_not_of(a), Piece::npos);
+  ASSERT_EQ(f.find_first_not_of(a), 0U);
+  ASSERT_EQ(a.find_first_not_of(f), 0U);
+  ASSERT_EQ(a.find_first_not_of(d), 0U);
+  ASSERT_EQ(a.find_first_not_of(e), 0U);
+  // empty string nonsense
+  ASSERT_EQ(d.find_first_not_of(a), Piece::npos);
+  ASSERT_EQ(e.find_first_not_of(a), Piece::npos);
+  ASSERT_EQ(d.find_first_not_of(d), Piece::npos);
+  ASSERT_EQ(e.find_first_not_of(d), Piece::npos);
+  ASSERT_EQ(d.find_first_not_of(e), Piece::npos);
+  ASSERT_EQ(e.find_first_not_of(e), Piece::npos);
+
+  TypeParam equals(TestFixture::as_string("===="));
+  Piece h(equals);
+  ASSERT_EQ(h.find_first_not_of('='), Piece::npos);
+  ASSERT_EQ(h.find_first_not_of('=', 3), Piece::npos);
+  ASSERT_EQ(h.find_first_not_of('\0'), 0U);
+  ASSERT_EQ(g.find_first_not_of('x'), 2U);
+  ASSERT_EQ(f.find_first_not_of('\0'), 0U);
+  ASSERT_EQ(f.find_first_not_of('\0', 3), 4U);
+  ASSERT_EQ(f.find_first_not_of('\0', 2), 2U);
+  // empty string nonsense
+  ASSERT_EQ(d.find_first_not_of('x'), Piece::npos);
+  ASSERT_EQ(e.find_first_not_of('x'), Piece::npos);
+  ASSERT_EQ(d.find_first_not_of('\0'), Piece::npos);
+  ASSERT_EQ(e.find_first_not_of('\0'), Piece::npos);
+
+  // Piece g("xx not found bb");
+  TypeParam fifty_six(TestFixture::as_string("56"));
+  Piece i(fifty_six);
+  ASSERT_EQ(h.find_last_of(a), Piece::npos);
+  ASSERT_EQ(g.find_last_of(a), g.size()-1);
+  ASSERT_EQ(a.find_last_of(b), 2U);
+  ASSERT_EQ(a.find_last_of(c), a.size()-1);
+  ASSERT_EQ(f.find_last_of(i), 6U);
+  ASSERT_EQ(a.find_last_of('a'), 0U);
+  ASSERT_EQ(a.find_last_of('b'), 1U);
+  ASSERT_EQ(a.find_last_of('z'), 25U);
+  ASSERT_EQ(a.find_last_of('a', 5), 0U);
+  ASSERT_EQ(a.find_last_of('b', 5), 1U);
+  ASSERT_EQ(a.find_last_of('b', 0), Piece::npos);
+  ASSERT_EQ(a.find_last_of('z', 25), 25U);
+  ASSERT_EQ(a.find_last_of('z', 24), Piece::npos);
+  ASSERT_EQ(f.find_last_of(i, 5), 5U);
+  ASSERT_EQ(f.find_last_of(i, 6), 6U);
+  ASSERT_EQ(f.find_last_of(a, 4), Piece::npos);
+  // empty string nonsense
+  ASSERT_EQ(f.find_last_of(d), Piece::npos);
+  ASSERT_EQ(f.find_last_of(e), Piece::npos);
+  ASSERT_EQ(f.find_last_of(d, 4), Piece::npos);
+  ASSERT_EQ(f.find_last_of(e, 4), Piece::npos);
+  ASSERT_EQ(d.find_last_of(d), Piece::npos);
+  ASSERT_EQ(d.find_last_of(e), Piece::npos);
+  ASSERT_EQ(e.find_last_of(d), Piece::npos);
+  ASSERT_EQ(e.find_last_of(e), Piece::npos);
+  ASSERT_EQ(d.find_last_of(f), Piece::npos);
+  ASSERT_EQ(e.find_last_of(f), Piece::npos);
+  ASSERT_EQ(d.find_last_of(d, 4), Piece::npos);
+  ASSERT_EQ(d.find_last_of(e, 4), Piece::npos);
+  ASSERT_EQ(e.find_last_of(d, 4), Piece::npos);
+  ASSERT_EQ(e.find_last_of(e, 4), Piece::npos);
+  ASSERT_EQ(d.find_last_of(f, 4), Piece::npos);
+  ASSERT_EQ(e.find_last_of(f, 4), Piece::npos);
+
+  ASSERT_EQ(a.find_last_not_of(b), a.size()-1);
+  ASSERT_EQ(a.find_last_not_of(c), 22U);
+  ASSERT_EQ(b.find_last_not_of(a), Piece::npos);
+  ASSERT_EQ(b.find_last_not_of(b), Piece::npos);
+  ASSERT_EQ(f.find_last_not_of(i), 4U);
+  ASSERT_EQ(a.find_last_not_of(c, 24), 22U);
+  ASSERT_EQ(a.find_last_not_of(b, 3), 3U);
+  ASSERT_EQ(a.find_last_not_of(b, 2), Piece::npos);
+  // empty string nonsense
+  ASSERT_EQ(f.find_last_not_of(d), f.size()-1);
+  ASSERT_EQ(f.find_last_not_of(e), f.size()-1);
+  ASSERT_EQ(f.find_last_not_of(d, 4), 4U);
+  ASSERT_EQ(f.find_last_not_of(e, 4), 4U);
+  ASSERT_EQ(d.find_last_not_of(d), Piece::npos);
+  ASSERT_EQ(d.find_last_not_of(e), Piece::npos);
+  ASSERT_EQ(e.find_last_not_of(d), Piece::npos);
+  ASSERT_EQ(e.find_last_not_of(e), Piece::npos);
+  ASSERT_EQ(d.find_last_not_of(f), Piece::npos);
+  ASSERT_EQ(e.find_last_not_of(f), Piece::npos);
+  ASSERT_EQ(d.find_last_not_of(d, 4), Piece::npos);
+  ASSERT_EQ(d.find_last_not_of(e, 4), Piece::npos);
+  ASSERT_EQ(e.find_last_not_of(d, 4), Piece::npos);
+  ASSERT_EQ(e.find_last_not_of(e, 4), Piece::npos);
+  ASSERT_EQ(d.find_last_not_of(f, 4), Piece::npos);
+  ASSERT_EQ(e.find_last_not_of(f, 4), Piece::npos);
+
+  ASSERT_EQ(h.find_last_not_of('x'), h.size() - 1);
+  ASSERT_EQ(h.find_last_not_of('='), Piece::npos);
+  ASSERT_EQ(b.find_last_not_of('c'), 1U);
+  ASSERT_EQ(h.find_last_not_of('x', 2), 2U);
+  ASSERT_EQ(h.find_last_not_of('=', 2), Piece::npos);
+  ASSERT_EQ(b.find_last_not_of('b', 1), 0U);
+  // empty string nonsense
+  ASSERT_EQ(d.find_last_not_of('x'), Piece::npos);
+  ASSERT_EQ(e.find_last_not_of('x'), Piece::npos);
+  ASSERT_EQ(d.find_last_not_of('\0'), Piece::npos);
+  ASSERT_EQ(e.find_last_not_of('\0'), Piece::npos);
+
+  ASSERT_EQ(a.substr(0, 3), b);
+  ASSERT_EQ(a.substr(23), c);
+  ASSERT_EQ(a.substr(23, 3), c);
+  ASSERT_EQ(a.substr(23, 99), c);
+  ASSERT_EQ(a.substr(0), a);
+  ASSERT_EQ(a.substr(3, 2), TestFixture::as_string("de"));
+  // empty string nonsense
+  ASSERT_EQ(a.substr(99, 2), e);
+  ASSERT_EQ(d.substr(99), e);
+  ASSERT_EQ(d.substr(0, 99), e);
+  ASSERT_EQ(d.substr(99, 99), e);
+}
+
+// Exercises the piece-specific mutators and conversions for every string
+// type: remove_prefix(), remove_suffix(), set(), as_string() and conversion
+// to the string type.
+TYPED_TEST(CommonStringPieceTest, CheckCustom) {
+  TypeParam foobar(TestFixture::as_string("foobar"));
+  BasicStringPiece<TypeParam> a(foobar);
+  TypeParam s1(TestFixture::as_string("123"));
+  s1 += static_cast<typename TypeParam::value_type>('\0');
+  s1 += TestFixture::as_string("456");
+  BasicStringPiece<TypeParam> b(s1);
+  BasicStringPiece<TypeParam> e;
+  TypeParam s2;
+
+  // remove_prefix
+  BasicStringPiece<TypeParam> c(a);
+  c.remove_prefix(3);
+  ASSERT_EQ(c, TestFixture::as_string("bar"));
+  c = a;
+  c.remove_prefix(0);
+  ASSERT_EQ(c, a);
+  c.remove_prefix(c.size());
+  ASSERT_EQ(c, e);
+
+  // remove_suffix
+  c = a;
+  c.remove_suffix(3);
+  ASSERT_EQ(c, TestFixture::as_string("foo"));
+  c = a;
+  c.remove_suffix(0);
+  ASSERT_EQ(c, a);
+  c.remove_suffix(c.size());
+  ASSERT_EQ(c, e);
+
+  // set
+  c.set(foobar.c_str());
+  ASSERT_EQ(c, a);
+  c.set(foobar.c_str(), 6);
+  ASSERT_EQ(c, a);
+  c.set(foobar.c_str(), 0);
+  ASSERT_EQ(c, e);
+  c.set(foobar.c_str(), 7);  // Note, has an embedded NULL
+  ASSERT_NE(c, a);
+
+  // as_string
+  TypeParam s3(a.as_string().c_str(), 7);  // Note, has an embedded NULL
+  ASSERT_EQ(c, s3);
+  TypeParam s4(e.as_string());
+  ASSERT_TRUE(s4.empty());
+
+  // operator STRING_TYPE()
+  TypeParam s5(TypeParam(a).c_str(), 7);  // Note, has an embedded NULL
+  ASSERT_EQ(c, s5);
+  TypeParam s6(e);
+  ASSERT_TRUE(s6.empty());
+}
+
+// StringPiece-only checks: CopyToString(), AppendToString(), starts_with(),
+// ends_with() and set() with a string literal.
+TEST(StringPieceTest, CheckCustom) {
+  StringPiece a("foobar");
+  std::string s1("123");
+  s1 += '\0';
+  s1 += "456";
+  StringPiece b(s1);
+  StringPiece e;
+  std::string s2;
+
+  // CopyToString
+  a.CopyToString(&s2);
+  ASSERT_EQ(s2.size(), 6U);
+  ASSERT_EQ(s2, "foobar");
+  b.CopyToString(&s2);
+  ASSERT_EQ(s2.size(), 7U);
+  ASSERT_EQ(s1, s2);
+  e.CopyToString(&s2);
+  ASSERT_TRUE(s2.empty());
+
+  // AppendToString
+  s2.erase();
+  a.AppendToString(&s2);
+  ASSERT_EQ(s2.size(), 6U);
+  ASSERT_EQ(s2, "foobar");
+  a.AppendToString(&s2);
+  ASSERT_EQ(s2.size(), 12U);
+  ASSERT_EQ(s2, "foobarfoobar");
+
+  // starts_with
+  ASSERT_TRUE(a.starts_with(a));
+  ASSERT_TRUE(a.starts_with("foo"));
+  ASSERT_TRUE(a.starts_with(e));
+  ASSERT_TRUE(b.starts_with(s1));
+  ASSERT_TRUE(b.starts_with(b));
+  ASSERT_TRUE(b.starts_with(e));
+  ASSERT_TRUE(e.starts_with(""));
+  ASSERT_TRUE(!a.starts_with(b));
+  ASSERT_TRUE(!b.starts_with(a));
+  ASSERT_TRUE(!e.starts_with(a));
+
+  // ends with
+  ASSERT_TRUE(a.ends_with(a));
+  ASSERT_TRUE(a.ends_with("bar"));
+  ASSERT_TRUE(a.ends_with(e));
+  ASSERT_TRUE(b.ends_with(s1));
+  ASSERT_TRUE(b.ends_with(b));
+  ASSERT_TRUE(b.ends_with(e));
+  ASSERT_TRUE(e.ends_with(""));
+  ASSERT_TRUE(!a.ends_with(b));
+  ASSERT_TRUE(!b.ends_with(a));
+  ASSERT_TRUE(!e.ends_with(a));
+
+  StringPiece c;
+  c.set("foobar", 6);
+  ASSERT_EQ(c, a);
+  c.set("foobar", 0);
+  ASSERT_EQ(c, e);
+  c.set("foobar", 7);
+  ASSERT_NE(c, a);
+}
+
+// A piece built from nullptr, or set() to nullptr, must be empty with null
+// data(), and must convert to an empty string.
+TYPED_TEST(CommonStringPieceTest, CheckNULL) {
+  // we used to crash here, but now we don't.
+  BasicStringPiece<TypeParam> s(nullptr);
+  ASSERT_EQ(s.data(), nullptr);
+  ASSERT_EQ(s.size(), 0U);
+
+  s.set(nullptr);
+  ASSERT_EQ(s.data(), nullptr);
+  ASSERT_EQ(s.size(), 0U);
+
+  TypeParam str(s);
+  ASSERT_EQ(str.length(), 0U);
+  ASSERT_EQ(str, TypeParam());
+
+  str = s.as_string();
+  ASSERT_EQ(str.length(), 0U);
+  ASSERT_EQ(str, TypeParam());
+}
+
+// Comparison operators and compare() must agree on 26-character strings that
+// differ only at, or just past, the last character.
+TYPED_TEST(CommonStringPieceTest, CheckComparisons2) {
+  TypeParam alphabet(TestFixture::as_string("abcdefghijklmnopqrstuvwxyz"));
+  TypeParam alphabet_z(TestFixture::as_string("abcdefghijklmnopqrstuvwxyzz"));
+  TypeParam alphabet_y(TestFixture::as_string("abcdefghijklmnopqrstuvwxyy"));
+  BasicStringPiece<TypeParam> abc(alphabet);
+
+  // check comparison operations on strings longer than 4 bytes.
+  ASSERT_EQ(abc, BasicStringPiece<TypeParam>(alphabet));
+  ASSERT_EQ(abc.compare(BasicStringPiece<TypeParam>(alphabet)), 0);
+
+  ASSERT_TRUE(abc < BasicStringPiece<TypeParam>(alphabet_z));
+  ASSERT_LT(abc.compare(BasicStringPiece<TypeParam>(alphabet_z)), 0);
+
+  ASSERT_TRUE(abc > BasicStringPiece<TypeParam>(alphabet_y));
+  ASSERT_GT(abc.compare(BasicStringPiece<TypeParam>(alphabet_y)), 0);
+}
+
+// Test operations only supported by std::string version.
+TEST(StringPieceTest, CheckComparisons2) {
+  StringPiece abc("abcdefghijklmnopqrstuvwxyz");
+
+  // starts_with
+  ASSERT_TRUE(abc.starts_with(abc));
+  ASSERT_TRUE(abc.starts_with("abcdefghijklm"));
+  ASSERT_TRUE(!abc.starts_with("abcdefguvwxyz"));
+
+  // ends_with
+  ASSERT_TRUE(abc.ends_with(abc));
+  ASSERT_TRUE(!abc.ends_with("abcdefguvwxyz"));
+  ASSERT_TRUE(abc.ends_with("nopqrstuvwxyz"));
+}
+
+// Comparing a raw character pointer against a string must still compile and
+// give the expected result with the piece comparison operators in scope.
+TYPED_TEST(CommonStringPieceTest, StringCompareNotAmbiguous) {
+  ASSERT_TRUE(TestFixture::as_string("hello").c_str() ==
+              TestFixture::as_string("hello"));
+  ASSERT_TRUE(TestFixture::as_string("hello").c_str() <
+              TestFixture::as_string("world"));
+}
+
+// Equality must work with a piece on one side and a string or raw character
+// pointer on the other.
+TYPED_TEST(CommonStringPieceTest, HeterogenousStringPieceEquals) {
+  TypeParam hello(TestFixture::as_string("hello"));
+
+  ASSERT_EQ(BasicStringPiece<TypeParam>(hello), hello);
+  ASSERT_EQ(hello.c_str(), BasicStringPiece<TypeParam>(hello));
+}
+
+// string16-specific stuff
+TEST(StringPiece16Test, CheckSTL) {
+  // Check some non-ascii characters.
+  string16 fifth(ASCIIToUTF16("123"));
+  fifth.push_back(0x0000);
+  fifth.push_back(0xd8c5);
+  fifth.push_back(0xdffe);
+  StringPiece16 f(fifth);
+
+  ASSERT_EQ(f[3], '\0');
+  ASSERT_EQ(f[5], static_cast<char16>(0xdffe));
+
+  ASSERT_EQ(f.size(), 6U);
+}
+
+
+
+TEST(StringPiece16Test, CheckConversion) {
+  // Make sure that we can convert from UTF8 to UTF16 and back. We use a
+  // character that takes four bytes in UTF-8 (G clef) to test this.
+  ASSERT_EQ(
+      UTF16ToUTF8(
+          StringPiece16(UTF8ToUTF16("\xf0\x9d\x84\x9e")).as_string()),
+      "\xf0\x9d\x84\x9e");
+}
+
+// Every constructor form must yield a piece equal to the expected string:
+// from a string, a C string (with and without a length), nullptr, the
+// default constructor, and an iterator pair.
+TYPED_TEST(CommonStringPieceTest, CheckConstructors) {
+  TypeParam str(TestFixture::as_string("hello world"));
+  TypeParam empty;
+
+  ASSERT_EQ(str, BasicStringPiece<TypeParam>(str));
+  ASSERT_EQ(str, BasicStringPiece<TypeParam>(str.c_str()));
+  ASSERT_TRUE(TestFixture::as_string("hello") ==
+              BasicStringPiece<TypeParam>(str.c_str(), 5));
+  ASSERT_EQ(
+      empty,
+      BasicStringPiece<TypeParam>(
+          str.c_str(),
+          static_cast<typename BasicStringPiece<TypeParam>::size_type>(0)));
+  ASSERT_EQ(empty, BasicStringPiece<TypeParam>(nullptr));
+  ASSERT_TRUE(
+      empty ==
+      BasicStringPiece<TypeParam>(
+          nullptr,
+          static_cast<typename BasicStringPiece<TypeParam>::size_type>(0)));
+  ASSERT_EQ(empty, BasicStringPiece<TypeParam>());
+  ASSERT_EQ(str, BasicStringPiece<TypeParam>(str.begin(), str.end()));
+  ASSERT_EQ(empty, BasicStringPiece<TypeParam>(str.begin(), str.begin()));
+  ASSERT_EQ(empty, BasicStringPiece<TypeParam>(empty));
+  ASSERT_EQ(empty, BasicStringPiece<TypeParam>(empty.begin(), empty.end()));
+}
+
+// The default, C-string and pointer-plus-length constructors must be usable
+// to initialize a constexpr variable. The std::ignore assignments only
+// reference the variable (presumably to avoid unused-variable warnings).
+TEST(StringPieceTest, ConstexprCtor) {
+  {
+    constexpr StringPiece piece;
+    std::ignore = piece;
+  }
+
+  {
+    constexpr StringPiece piece("abc");
+    std::ignore = piece;
+  }
+
+  {
+    constexpr StringPiece piece("abc", 2);
+    std::ignore = piece;
+  }
+}
+
+// On an empty piece, element access and removing a one-character prefix or
+// suffix are expected to terminate the process (where death tests are
+// supported).
+TEST(StringPieceTest, OutOfBoundsDeath) {
+  {
+    constexpr StringPiece piece;
+    ASSERT_DEATH_IF_SUPPORTED(piece[0], "");
+  }
+
+  {
+    constexpr StringPiece piece;
+    ASSERT_DEATH_IF_SUPPORTED(piece.front(), "");
+  }
+
+  {
+    constexpr StringPiece piece;
+    ASSERT_DEATH_IF_SUPPORTED(piece.back(), "");
+  }
+
+  {
+    StringPiece piece;
+    ASSERT_DEATH_IF_SUPPORTED(piece.remove_suffix(1), "");
+  }
+
+  {
+    StringPiece piece;
+    ASSERT_DEATH_IF_SUPPORTED(piece.remove_prefix(1), "");
+  }
+}
+
+// data() must be usable in constant expressions (checked by static_assert).
+TEST(StringPieceTest, ConstexprData) {
+  {
+    constexpr StringPiece piece;
+    static_assert(piece.data() == nullptr, "");
+  }
+
+  {
+    constexpr StringPiece piece("abc");
+    static_assert(piece.data()[0] == 'a', "");
+    static_assert(piece.data()[1] == 'b', "");
+    static_assert(piece.data()[2] == 'c', "");
+  }
+
+  {
+    constexpr StringPiece piece("def", 2);
+    static_assert(piece.data()[0] == 'd', "");
+    static_assert(piece.data()[1] == 'e', "");
+  }
+}
+
+// size() must be usable in constant expressions (checked by static_assert).
+TEST(StringPieceTest, ConstexprSize) {
+  {
+    constexpr StringPiece piece;
+    static_assert(piece.size() == 0, "");
+  }
+
+  {
+    constexpr StringPiece piece("abc");
+    static_assert(piece.size() == 3, "");
+  }
+
+  {
+    constexpr StringPiece piece("def", 2);
+    static_assert(piece.size() == 2, "");
+  }
+}
+
+// compare() must be usable in constant expressions and return exactly 1, 0
+// or -1 for the operands below.
+TEST(StringPieceTest, Compare) {
+  constexpr StringPiece piece = "def";
+
+  static_assert(piece.compare("ab") == 1, "");
+  static_assert(piece.compare("abc") == 1, "");
+  static_assert(piece.compare("abcd") == 1, "");
+  static_assert(piece.compare("de") == 1, "");
+  static_assert(piece.compare("def") == 0, "");
+  static_assert(piece.compare("defg") == -1, "");
+  static_assert(piece.compare("gh") == -1, "");
+  static_assert(piece.compare("ghi") == -1, "");
+  static_assert(piece.compare("ghij") == -1, "");
+}
+
+// starts_with() must be usable in constant expressions.
+TEST(StringPieceTest, StartsWith) {
+  constexpr StringPiece piece("abc");
+
+  static_assert(piece.starts_with(""), "");
+  static_assert(piece.starts_with("a"), "");
+  static_assert(piece.starts_with("ab"), "");
+  static_assert(piece.starts_with("abc"), "");
+
+  static_assert(!piece.starts_with("b"), "");
+  static_assert(!piece.starts_with("bc"), "");
+
+  static_assert(!piece.starts_with("abcd"), "");
+}
+
+// ends_with() must be usable in constant expressions.
+TEST(StringPieceTest, EndsWith) {
+  constexpr StringPiece piece("abc");
+
+  static_assert(piece.ends_with(""), "");
+  static_assert(piece.ends_with("c"), "");
+  static_assert(piece.ends_with("bc"), "");
+  static_assert(piece.ends_with("abc"), "");
+
+  static_assert(!piece.ends_with("a"), "");
+  static_assert(!piece.ends_with("ab"), "");
+
+  static_assert(!piece.ends_with("abcd"), "");
+}
+
+}  // namespace gurl_base
diff --git a/base/strings/string_split.cc b/base/strings/string_split.cc new file mode 100644 index 0000000..ef9c74d --- /dev/null +++ b/base/strings/string_split.cc
@@ -0,0 +1,277 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_split.h"
+
+#include <stddef.h>
+
+#include "polyfills/base/logging.h"
+#include "base/strings/string_util.h"
+#include "base/third_party/icu/icu_utf.h"
+
+namespace gurl_base {
+
+namespace {
+
+// Converts |piece| to the output type a splitter was asked to produce. The
+// primary template handles the case where OutputType is the matching
+// StringPiece type and returns the piece as is; the two specializations
+// return the corresponding non-piece string via as_string().
+template <typename Str, typename OutputType>
+OutputType PieceToOutputType(BasicStringPiece<Str> piece) {
+  return piece;
+}
+
+// StringPiece -> std::string.
+template <>
+std::string PieceToOutputType<std::string, std::string>(StringPiece piece) {
+  return piece.as_string();
+}
+
+// StringPiece16 -> string16.
+template <>
+string16 PieceToOutputType<string16, string16>(StringPiece16 piece) {
+  return piece.as_string();
+}
+
+// Returns the whitespace set matching the string type: kWhitespaceASCII for
+// std::string, kWhitespaceUTF16 for string16.
+template <typename Str>
+BasicStringPiece<Str> WhitespaceForType();
+
+template <>
+StringPiece WhitespaceForType<std::string>() {
+  return kWhitespaceASCII;
+}
+
+template <>
+StringPiece16 WhitespaceForType<string16>() {
+  return kWhitespaceUTF16;
+}
+
+// Optimize the single-character case to call find() on the string instead,
+// since this is the common case and can be made faster. This could have been
+// done with template specialization too, but would have been less clear.
+//
+// There is no corresponding FindFirstNotOf because StringPiece already
+// implements these different versions that do the optimized searching.
+size_t FindFirstOf(StringPiece piece, char c, size_t pos) { + return piece.find(c, pos); +} +size_t FindFirstOf(StringPiece16 piece, char16 c, size_t pos) { + return piece.find(c, pos); +} +size_t FindFirstOf(StringPiece piece, StringPiece one_of, size_t pos) { + return piece.find_first_of(one_of, pos); +} +size_t FindFirstOf(StringPiece16 piece, StringPiece16 one_of, size_t pos) { + return piece.find_first_of(one_of, pos); +} + +// General string splitter template. Can take 8- or 16-bit input, can produce +// the corresponding string or StringPiece output, and can take single- or +// multiple-character delimiters. +// +// DelimiterType is either a character (Str::value_type) or a string piece of +// multiple characters (BasicStringPiece<Str>). StringPiece has a version of +// find for both of these cases, and the single-character version is the most +// common and can be implemented faster, which is why this is a template. +template<typename Str, typename OutputStringType, typename DelimiterType> +static std::vector<OutputStringType> SplitStringT( + BasicStringPiece<Str> str, + DelimiterType delimiter, + WhitespaceHandling whitespace, + SplitResult result_type) { + std::vector<OutputStringType> result; + if (str.empty()) + return result; + + size_t start = 0; + while (start != Str::npos) { + size_t end = FindFirstOf(str, delimiter, start); + + BasicStringPiece<Str> piece; + if (end == Str::npos) { + piece = str.substr(start); + start = Str::npos; + } else { + piece = str.substr(start, end - start); + start = end + 1; + } + + if (whitespace == TRIM_WHITESPACE) + piece = TrimString(piece, WhitespaceForType<Str>(), TRIM_ALL); + + if (result_type == SPLIT_WANT_ALL || !piece.empty()) + result.push_back(PieceToOutputType<Str, OutputStringType>(piece)); + } + return result; +} + +bool AppendStringKeyValue(StringPiece input, + char delimiter, + StringPairs* result) { + // Always append a new item regardless of success (it might be empty). 
The + // below code will copy the strings directly into the result pair. + result->resize(result->size() + 1); + auto& result_pair = result->back(); + + // Find the delimiter. + size_t end_key_pos = input.find_first_of(delimiter); + if (end_key_pos == std::string::npos) { + DVLOG(1) << "cannot find delimiter in: " << input; + return false; // No delimiter. + } + input.substr(0, end_key_pos).CopyToString(&result_pair.first); + + // Find the value string. + StringPiece remains = input.substr(end_key_pos, input.size() - end_key_pos); + size_t begin_value_pos = remains.find_first_not_of(delimiter); + if (begin_value_pos == StringPiece::npos) { + DVLOG(1) << "cannot parse value from input: " << input; + return false; // No value. + } + remains.substr(begin_value_pos, remains.size() - begin_value_pos) + .CopyToString(&result_pair.second); + + return true; +} + +template <typename Str, typename OutputStringType> +void SplitStringUsingSubstrT(BasicStringPiece<Str> input, + BasicStringPiece<Str> delimiter, + WhitespaceHandling whitespace, + SplitResult result_type, + std::vector<OutputStringType>* result) { + using Piece = BasicStringPiece<Str>; + using size_type = typename Piece::size_type; + + result->clear(); + for (size_type begin_index = 0, end_index = 0; end_index != Piece::npos; + begin_index = end_index + delimiter.size()) { + end_index = input.find(delimiter, begin_index); + Piece term = end_index == Piece::npos + ? 
input.substr(begin_index) + : input.substr(begin_index, end_index - begin_index); + + if (whitespace == TRIM_WHITESPACE) + term = TrimString(term, WhitespaceForType<Str>(), TRIM_ALL); + + if (result_type == SPLIT_WANT_ALL || !term.empty()) + result->push_back(PieceToOutputType<Str, OutputStringType>(term)); + } +} + +} // namespace + +std::vector<std::string> SplitString(StringPiece input, + StringPiece separators, + WhitespaceHandling whitespace, + SplitResult result_type) { + if (separators.size() == 1) { + return SplitStringT<std::string, std::string, char>( + input, separators[0], whitespace, result_type); + } + return SplitStringT<std::string, std::string, StringPiece>( + input, separators, whitespace, result_type); +} + +std::vector<string16> SplitString(StringPiece16 input, + StringPiece16 separators, + WhitespaceHandling whitespace, + SplitResult result_type) { + if (separators.size() == 1) { + return SplitStringT<string16, string16, char16>( + input, separators[0], whitespace, result_type); + } + return SplitStringT<string16, string16, StringPiece16>( + input, separators, whitespace, result_type); +} + +std::vector<StringPiece> SplitStringPiece(StringPiece input, + StringPiece separators, + WhitespaceHandling whitespace, + SplitResult result_type) { + if (separators.size() == 1) { + return SplitStringT<std::string, StringPiece, char>( + input, separators[0], whitespace, result_type); + } + return SplitStringT<std::string, StringPiece, StringPiece>( + input, separators, whitespace, result_type); +} + +std::vector<StringPiece16> SplitStringPiece(StringPiece16 input, + StringPiece16 separators, + WhitespaceHandling whitespace, + SplitResult result_type) { + if (separators.size() == 1) { + return SplitStringT<string16, StringPiece16, char16>( + input, separators[0], whitespace, result_type); + } + return SplitStringT<string16, StringPiece16, StringPiece16>( + input, separators, whitespace, result_type); +} + +bool SplitStringIntoKeyValuePairs(StringPiece 
input, + char key_value_delimiter, + char key_value_pair_delimiter, + StringPairs* key_value_pairs) { + return SplitStringIntoKeyValuePairsUsingSubstr( + input, key_value_delimiter, StringPiece(&key_value_pair_delimiter, 1), + key_value_pairs); +} + +bool SplitStringIntoKeyValuePairsUsingSubstr( + StringPiece input, + char key_value_delimiter, + StringPiece key_value_pair_delimiter, + StringPairs* key_value_pairs) { + key_value_pairs->clear(); + + std::vector<StringPiece> pairs = SplitStringPieceUsingSubstr( + input, key_value_pair_delimiter, TRIM_WHITESPACE, SPLIT_WANT_NONEMPTY); + key_value_pairs->reserve(pairs.size()); + + bool success = true; + for (const StringPiece& pair : pairs) { + if (!AppendStringKeyValue(pair, key_value_delimiter, key_value_pairs)) { + // Don't return here, to allow for pairs without associated + // value or key; just record that the split failed. + success = false; + } + } + return success; +} + +std::vector<string16> SplitStringUsingSubstr(StringPiece16 input, + StringPiece16 delimiter, + WhitespaceHandling whitespace, + SplitResult result_type) { + std::vector<string16> result; + SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result); + return result; +} + +std::vector<std::string> SplitStringUsingSubstr(StringPiece input, + StringPiece delimiter, + WhitespaceHandling whitespace, + SplitResult result_type) { + std::vector<std::string> result; + SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result); + return result; +} + +std::vector<StringPiece16> SplitStringPieceUsingSubstr( + StringPiece16 input, + StringPiece16 delimiter, + WhitespaceHandling whitespace, + SplitResult result_type) { + std::vector<StringPiece16> result; + SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result); + return result; +} + +std::vector<StringPiece> SplitStringPieceUsingSubstr( + StringPiece input, + StringPiece delimiter, + WhitespaceHandling whitespace, + SplitResult result_type) { + 
std::vector<StringPiece> result; + SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result); + return result; +} + +} // namespace gurl_base
diff --git a/base/strings/string_split.h b/base/strings/string_split.h new file mode 100644 index 0000000..1894d05 --- /dev/null +++ b/base/strings/string_split.h
@@ -0,0 +1,137 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_STRING_SPLIT_H_ +#define BASE_STRINGS_STRING_SPLIT_H_ + +#include <string> +#include <utility> +#include <vector> + +#include "polyfills/base/base_export.h" +#include "base/strings/string16.h" +#include "base/strings/string_piece.h" + +namespace gurl_base { + +enum WhitespaceHandling { + KEEP_WHITESPACE, + TRIM_WHITESPACE, +}; + +enum SplitResult { + // Strictly return all results. + // + // If the input is ",," and the separator is ',' this will return a + // vector of three empty strings. + SPLIT_WANT_ALL, + + // Only nonempty results will be added to the results. Multiple separators + // will be coalesced. Separators at the beginning and end of the input will + // be ignored. With TRIM_WHITESPACE, whitespace-only results will be dropped. + // + // If the input is ",," and the separator is ',', this will return an empty + // vector. + SPLIT_WANT_NONEMPTY, +}; + +// Split the given string on ANY of the given separators, returning copies of +// the result. +// +// To split on either commas or semicolons, keeping all whitespace: +// +// std::vector<std::string> tokens = gurl_base::SplitString( +// input, ",;", gurl_base::KEEP_WHITESPACE, gurl_base::SPLIT_WANT_ALL); +BASE_EXPORT std::vector<std::string> SplitString( + StringPiece input, + StringPiece separators, + WhitespaceHandling whitespace, + SplitResult result_type); +BASE_EXPORT std::vector<string16> SplitString( + StringPiece16 input, + StringPiece16 separators, + WhitespaceHandling whitespace, + SplitResult result_type); + +// Like SplitString above except it returns a vector of StringPieces which +// reference the original buffer without copying. Although you have to be +// careful to keep the original string unmodified, this provides an efficient +// way to iterate through tokens in a string. 
+// +// To iterate through all whitespace-separated tokens in an input string: +// +// for (const auto& cur : +// gurl_base::SplitStringPiece(input, gurl_base::kWhitespaceASCII, +// gurl_base::KEEP_WHITESPACE, +// gurl_base::SPLIT_WANT_NONEMPTY)) { +// ... +BASE_EXPORT std::vector<StringPiece> SplitStringPiece( + StringPiece input, + StringPiece separators, + WhitespaceHandling whitespace, + SplitResult result_type); +BASE_EXPORT std::vector<StringPiece16> SplitStringPiece( + StringPiece16 input, + StringPiece16 separators, + WhitespaceHandling whitespace, + SplitResult result_type); + +using StringPairs = std::vector<std::pair<std::string, std::string>>; + +// Splits |input| into key value pairs according to the given delimiters and +// removes whitespace leading each key and trailing each value. Returns true +// only if every pair has a non-empty value (keys may be empty). Entries +// without |key_value_delimiter| are stored in |key_value_pairs| as ("",""). +BASE_EXPORT bool SplitStringIntoKeyValuePairs(StringPiece input, + char key_value_delimiter, + char key_value_pair_delimiter, + StringPairs* key_value_pairs); + +// Similar to SplitStringIntoKeyValuePairs, but use a substring +// |key_value_pair_delimiter| instead of a single char. +BASE_EXPORT bool SplitStringIntoKeyValuePairsUsingSubstr( + StringPiece input, + char key_value_delimiter, + StringPiece key_value_pair_delimiter, + StringPairs* key_value_pairs); + +// Similar to SplitString, but use a substring delimiter instead of a list of +// characters that are all possible delimiters. 
+BASE_EXPORT std::vector<string16> SplitStringUsingSubstr( + StringPiece16 input, + StringPiece16 delimiter, + WhitespaceHandling whitespace, + SplitResult result_type); +BASE_EXPORT std::vector<std::string> SplitStringUsingSubstr( + StringPiece input, + StringPiece delimiter, + WhitespaceHandling whitespace, + SplitResult result_type); + +// Like SplitStringUsingSubstr above except it returns a vector of StringPieces +// which reference the original buffer without copying. Although you have to be +// careful to keep the original string unmodified, this provides an efficient +// way to iterate through tokens in a string. +// +// To iterate through all newline-separated tokens in an input string: +// +// for (const auto& cur : +// gurl_base::SplitStringPieceUsingSubstr(input, "\r\n", +// gurl_base::KEEP_WHITESPACE, +// gurl_base::SPLIT_WANT_NONEMPTY)) { +// ... +BASE_EXPORT std::vector<StringPiece16> SplitStringPieceUsingSubstr( + StringPiece16 input, + StringPiece16 delimiter, + WhitespaceHandling whitespace, + SplitResult result_type); +BASE_EXPORT std::vector<StringPiece> SplitStringPieceUsingSubstr( + StringPiece input, + StringPiece delimiter, + WhitespaceHandling whitespace, + SplitResult result_type); + +} // namespace gurl_base + +#endif // BASE_STRINGS_STRING_SPLIT_H_
diff --git a/base/strings/string_split_unittest.cc b/base/strings/string_split_unittest.cc new file mode 100644 index 0000000..993450a --- /dev/null +++ b/base/strings/string_split_unittest.cc
@@ -0,0 +1,448 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_split.h" + +#include <stddef.h> + +#include "base/macros.h" +#include "base/strings/string_util.h" +#include "base/strings/utf_string_conversions.h" +#include "testing/gmock/include/gmock/gmock.h" +#include "testing/gtest/include/gtest/gtest.h" + +using ::testing::ElementsAre; + +namespace gurl_base { + +class SplitStringIntoKeyValuePairsTest : public testing::Test { + protected: + gurl_base::StringPairs kv_pairs; +}; + +using SplitStringIntoKeyValuePairsUsingSubstrTest = + SplitStringIntoKeyValuePairsTest; + +TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest, EmptyString) { + EXPECT_TRUE( + SplitStringIntoKeyValuePairsUsingSubstr(std::string(), + ':', // Key-value delimiter + ",", // Key-value pair delimiter + &kv_pairs)); + EXPECT_TRUE(kv_pairs.empty()); +} + +TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest, MissingKeyValueDelimiter) { + EXPECT_FALSE( + SplitStringIntoKeyValuePairsUsingSubstr("key1,,key2:value2", + ':', // Key-value delimiter + ",,", // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_TRUE(kv_pairs[0].first.empty()); + EXPECT_TRUE(kv_pairs[0].second.empty()); + EXPECT_EQ("key2", kv_pairs[1].first); + EXPECT_EQ("value2", kv_pairs[1].second); +} + +TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest, + MissingKeyValuePairDelimeter) { + EXPECT_TRUE(SplitStringIntoKeyValuePairsUsingSubstr( + "key1:value1,,key3:value3", + ':', // Key-value delimiter + ",,,", // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(1U, kv_pairs.size()); + EXPECT_EQ("key1", kv_pairs[0].first); + EXPECT_EQ("value1,,key3:value3", kv_pairs[0].second); +} + +TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest, UntrimmedWhitespace) { + EXPECT_TRUE( + SplitStringIntoKeyValuePairsUsingSubstr("key1 : value1", + ':', // 
Key-value delimiter + ",", // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(1U, kv_pairs.size()); + EXPECT_EQ("key1 ", kv_pairs[0].first); + EXPECT_EQ(" value1", kv_pairs[0].second); +} + +TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest, OnlySplitAtGivenSeparator) { + std::string a("a ?!@#$%^&*()_+:/{}\\\t\nb"); + EXPECT_TRUE( + SplitStringIntoKeyValuePairsUsingSubstr(a + "X" + a + "XY" + a + "YX" + a, + 'X', // Key-value delimiter + "XY", // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_EQ(a, kv_pairs[0].first); + EXPECT_EQ(a, kv_pairs[0].second); + EXPECT_EQ(a + 'Y', kv_pairs[1].first); + EXPECT_EQ(a, kv_pairs[1].second); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, EmptyString) { + EXPECT_TRUE(SplitStringIntoKeyValuePairs(std::string(), + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + EXPECT_TRUE(kv_pairs.empty()); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, MissingKeyValueDelimiter) { + EXPECT_FALSE(SplitStringIntoKeyValuePairs("key1,key2:value2", + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_TRUE(kv_pairs[0].first.empty()); + EXPECT_TRUE(kv_pairs[0].second.empty()); + EXPECT_EQ("key2", kv_pairs[1].first); + EXPECT_EQ("value2", kv_pairs[1].second); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, EmptyKeyWithKeyValueDelimiter) { + EXPECT_TRUE(SplitStringIntoKeyValuePairs(":value1,key2:value2", + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_TRUE(kv_pairs[0].first.empty()); + EXPECT_EQ("value1", kv_pairs[0].second); + EXPECT_EQ("key2", kv_pairs[1].first); + EXPECT_EQ("value2", kv_pairs[1].second); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, TrailingAndLeadingPairDelimiter) { + EXPECT_TRUE(SplitStringIntoKeyValuePairs(",key1:value1,key2:value2,", + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + 
ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_EQ("key1", kv_pairs[0].first); + EXPECT_EQ("value1", kv_pairs[0].second); + EXPECT_EQ("key2", kv_pairs[1].first); + EXPECT_EQ("value2", kv_pairs[1].second); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, EmptyPair) { + EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1:value1,,key3:value3", + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_EQ("key1", kv_pairs[0].first); + EXPECT_EQ("value1", kv_pairs[0].second); + EXPECT_EQ("key3", kv_pairs[1].first); + EXPECT_EQ("value3", kv_pairs[1].second); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, EmptyValue) { + EXPECT_FALSE(SplitStringIntoKeyValuePairs("key1:,key2:value2", + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_EQ("key1", kv_pairs[0].first); + EXPECT_EQ("", kv_pairs[0].second); + EXPECT_EQ("key2", kv_pairs[1].first); + EXPECT_EQ("value2", kv_pairs[1].second); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, UntrimmedWhitespace) { + EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1 : value1", + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(1U, kv_pairs.size()); + EXPECT_EQ("key1 ", kv_pairs[0].first); + EXPECT_EQ(" value1", kv_pairs[0].second); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, TrimmedWhitespace) { + EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1:value1 , key2:value2", + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_EQ("key1", kv_pairs[0].first); + EXPECT_EQ("value1", kv_pairs[0].second); + EXPECT_EQ("key2", kv_pairs[1].first); + EXPECT_EQ("value2", kv_pairs[1].second); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, MultipleKeyValueDelimiters) { + EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1:::value1,key2:value2", + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + 
ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_EQ("key1", kv_pairs[0].first); + EXPECT_EQ("value1", kv_pairs[0].second); + EXPECT_EQ("key2", kv_pairs[1].first); + EXPECT_EQ("value2", kv_pairs[1].second); +} + +TEST_F(SplitStringIntoKeyValuePairsTest, OnlySplitAtGivenSeparator) { + std::string a("a ?!@#$%^&*()_+:/{}\\\t\nb"); + EXPECT_TRUE(SplitStringIntoKeyValuePairs(a + "X" + a + "Y" + a + "X" + a, + 'X', // Key-value delimiter + 'Y', // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_EQ(a, kv_pairs[0].first); + EXPECT_EQ(a, kv_pairs[0].second); + EXPECT_EQ(a, kv_pairs[1].first); + EXPECT_EQ(a, kv_pairs[1].second); +} + + +TEST_F(SplitStringIntoKeyValuePairsTest, DelimiterInValue) { + EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1:va:ue1,key2:value2", + ':', // Key-value delimiter + ',', // Key-value pair delimiter + &kv_pairs)); + ASSERT_EQ(2U, kv_pairs.size()); + EXPECT_EQ("key1", kv_pairs[0].first); + EXPECT_EQ("va:ue1", kv_pairs[0].second); + EXPECT_EQ("key2", kv_pairs[1].first); + EXPECT_EQ("value2", kv_pairs[1].second); +} + +TEST(SplitStringUsingSubstrTest, EmptyString) { + std::vector<std::string> results = SplitStringUsingSubstr( + std::string(), "DELIMITER", TRIM_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(1u, results.size()); + EXPECT_THAT(results, ElementsAre("")); +} + +TEST(StringUtilTest, SplitString_Basics) { + std::vector<std::string> r; + + r = SplitString(std::string(), ",:;", KEEP_WHITESPACE, SPLIT_WANT_ALL); + EXPECT_TRUE(r.empty()); + + // Empty separator list + r = SplitString("hello, world", "", KEEP_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(1u, r.size()); + EXPECT_EQ("hello, world", r[0]); + + // Should split on any of the separators. 
+ r = SplitString("::,,;;", ",:;", KEEP_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(7u, r.size()); + for (auto str : r) + ASSERT_TRUE(str.empty()); + + r = SplitString("red, green; blue:", ",:;", TRIM_WHITESPACE, + SPLIT_WANT_NONEMPTY); + ASSERT_EQ(3u, r.size()); + EXPECT_EQ("red", r[0]); + EXPECT_EQ("green", r[1]); + EXPECT_EQ("blue", r[2]); + + // Want to split a string along whitespace sequences. + r = SplitString(" red green \tblue\n", " \t\n", TRIM_WHITESPACE, + SPLIT_WANT_NONEMPTY); + ASSERT_EQ(3u, r.size()); + EXPECT_EQ("red", r[0]); + EXPECT_EQ("green", r[1]); + EXPECT_EQ("blue", r[2]); + + // Weird case of splitting on spaces but not trimming. + r = SplitString(" red ", " ", TRIM_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(3u, r.size()); + EXPECT_EQ("", r[0]); // Before the first space. + EXPECT_EQ("red", r[1]); + EXPECT_EQ("", r[2]); // After the last space. +} + +TEST(StringUtilTest, SplitString_WhitespaceAndResultType) { + std::vector<std::string> r; + + // Empty input handling. + r = SplitString(std::string(), ",", KEEP_WHITESPACE, SPLIT_WANT_ALL); + EXPECT_TRUE(r.empty()); + r = SplitString(std::string(), ",", KEEP_WHITESPACE, SPLIT_WANT_NONEMPTY); + EXPECT_TRUE(r.empty()); + + // Input string is space and we're trimming. + r = SplitString(" ", ",", TRIM_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(1u, r.size()); + EXPECT_EQ("", r[0]); + r = SplitString(" ", ",", TRIM_WHITESPACE, SPLIT_WANT_NONEMPTY); + EXPECT_TRUE(r.empty()); + + // Test all 4 combinations of flags on ", ,". 
+ r = SplitString(", ,", ",", KEEP_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(3u, r.size()); + EXPECT_EQ("", r[0]); + EXPECT_EQ(" ", r[1]); + EXPECT_EQ("", r[2]); + r = SplitString(", ,", ",", KEEP_WHITESPACE, SPLIT_WANT_NONEMPTY); + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(" ", r[0]); + r = SplitString(", ,", ",", TRIM_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(3u, r.size()); + EXPECT_EQ("", r[0]); + EXPECT_EQ("", r[1]); + EXPECT_EQ("", r[2]); + r = SplitString(", ,", ",", TRIM_WHITESPACE, SPLIT_WANT_NONEMPTY); + ASSERT_TRUE(r.empty()); +} + +TEST(SplitStringUsingSubstrTest, StringWithNoDelimiter) { + std::vector<std::string> results = SplitStringUsingSubstr( + "alongwordwithnodelimiter", "DELIMITER", TRIM_WHITESPACE, + SPLIT_WANT_ALL); + ASSERT_EQ(1u, results.size()); + EXPECT_THAT(results, ElementsAre("alongwordwithnodelimiter")); +} + +TEST(SplitStringUsingSubstrTest, LeadingDelimitersSkipped) { + std::vector<std::string> results = SplitStringUsingSubstr( + "DELIMITERDELIMITERDELIMITERoneDELIMITERtwoDELIMITERthree", + "DELIMITER", TRIM_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(6u, results.size()); + EXPECT_THAT(results, ElementsAre("", "", "", "one", "two", "three")); +} + +TEST(SplitStringUsingSubstrTest, ConsecutiveDelimitersSkipped) { + std::vector<std::string> results = SplitStringUsingSubstr( + "unoDELIMITERDELIMITERDELIMITERdosDELIMITERtresDELIMITERDELIMITERcuatro", + "DELIMITER", TRIM_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(7u, results.size()); + EXPECT_THAT(results, ElementsAre("uno", "", "", "dos", "tres", "", "cuatro")); +} + +TEST(SplitStringUsingSubstrTest, TrailingDelimitersSkipped) { + std::vector<std::string> results = SplitStringUsingSubstr( + "unDELIMITERdeuxDELIMITERtroisDELIMITERquatreDELIMITERDELIMITERDELIMITER", + "DELIMITER", TRIM_WHITESPACE, SPLIT_WANT_ALL); + ASSERT_EQ(7u, results.size()); + EXPECT_THAT( + results, ElementsAre("un", "deux", "trois", "quatre", "", "", "")); +} + +TEST(SplitStringPieceUsingSubstrTest, StringWithNoDelimiter) { + 
std::vector<gurl_base::StringPiece> results = + SplitStringPieceUsingSubstr("alongwordwithnodelimiter", "DELIMITER", + gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(1u, results.size()); + EXPECT_THAT(results, ElementsAre("alongwordwithnodelimiter")); +} + +TEST(SplitStringPieceUsingSubstrTest, LeadingDelimitersSkipped) { + std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr( + "DELIMITERDELIMITERDELIMITERoneDELIMITERtwoDELIMITERthree", "DELIMITER", + gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(6u, results.size()); + EXPECT_THAT(results, ElementsAre("", "", "", "one", "two", "three")); +} + +TEST(SplitStringPieceUsingSubstrTest, ConsecutiveDelimitersSkipped) { + std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr( + "unoDELIMITERDELIMITERDELIMITERdosDELIMITERtresDELIMITERDELIMITERcuatro", + "DELIMITER", gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(7u, results.size()); + EXPECT_THAT(results, ElementsAre("uno", "", "", "dos", "tres", "", "cuatro")); +} + +TEST(SplitStringPieceUsingSubstrTest, TrailingDelimitersSkipped) { + std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr( + "unDELIMITERdeuxDELIMITERtroisDELIMITERquatreDELIMITERDELIMITERDELIMITER", + "DELIMITER", gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(7u, results.size()); + EXPECT_THAT(results, + ElementsAre("un", "deux", "trois", "quatre", "", "", "")); +} + +TEST(SplitStringPieceUsingSubstrTest, KeepWhitespace) { + std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr( + "un DELIMITERdeux\tDELIMITERtrois\nDELIMITERquatre", "DELIMITER", + gurl_base::KEEP_WHITESPACE, gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(4u, results.size()); + EXPECT_THAT(results, ElementsAre("un ", "deux\t", "trois\n", "quatre")); +} + +TEST(SplitStringPieceUsingSubstrTest, TrimWhitespace) { + std::vector<gurl_base::StringPiece> results = 
SplitStringPieceUsingSubstr( + "un DELIMITERdeux\tDELIMITERtrois\nDELIMITERquatre", "DELIMITER", + gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(4u, results.size()); + EXPECT_THAT(results, ElementsAre("un", "deux", "trois", "quatre")); +} + +TEST(SplitStringPieceUsingSubstrTest, SplitWantAll) { + std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr( + "unDELIMITERdeuxDELIMITERtroisDELIMITERDELIMITER", "DELIMITER", + gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(5u, results.size()); + EXPECT_THAT(results, ElementsAre("un", "deux", "trois", "", "")); +} + +TEST(SplitStringPieceUsingSubstrTest, SplitWantNonEmpty) { + std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr( + "unDELIMITERdeuxDELIMITERtroisDELIMITERDELIMITER", "DELIMITER", + gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_NONEMPTY); + ASSERT_EQ(3u, results.size()); + EXPECT_THAT(results, ElementsAre("un", "deux", "trois")); +} + +TEST(StringSplitTest, StringSplitKeepWhitespace) { + std::vector<std::string> r; + + r = SplitString(" ", "*", gurl_base::KEEP_WHITESPACE, gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(1U, r.size()); + EXPECT_EQ(r[0], " "); + + r = SplitString("\t \ta\t ", "\t", gurl_base::KEEP_WHITESPACE, + gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(4U, r.size()); + EXPECT_EQ(r[0], ""); + EXPECT_EQ(r[1], " "); + EXPECT_EQ(r[2], "a"); + EXPECT_EQ(r[3], " "); + + r = SplitString("\ta\t\nb\tcc", "\n", gurl_base::KEEP_WHITESPACE, + gurl_base::SPLIT_WANT_ALL); + ASSERT_EQ(2U, r.size()); + EXPECT_EQ(r[0], "\ta\t"); + EXPECT_EQ(r[1], "b\tcc"); +} + +TEST(StringSplitTest, SplitStringAlongWhitespace) { + struct TestData { + const char* input; + const size_t expected_result_count; + const char* output1; + const char* output2; + } data[] = { + { "a", 1, "a", "" }, + { " ", 0, "", "" }, + { " a", 1, "a", "" }, + { " ab ", 1, "ab", "" }, + { " ab c", 2, "ab", "c" }, + { " ab c ", 2, "ab", "c" }, + { " ab cd", 2, "ab", "cd" }, + { " 
ab cd ", 2, "ab", "cd" }, + { " \ta\t", 1, "a", "" }, + { " b\ta\t", 2, "b", "a" }, + { " b\tat", 2, "b", "at" }, + { "b\tat", 2, "b", "at" }, + { "b\t at", 2, "b", "at" }, + }; + for (const auto& i : data) { + std::vector<std::string> results = + gurl_base::SplitString(i.input, kWhitespaceASCII, gurl_base::KEEP_WHITESPACE, + gurl_base::SPLIT_WANT_NONEMPTY); + ASSERT_EQ(i.expected_result_count, results.size()); + if (i.expected_result_count > 0) + ASSERT_EQ(i.output1, results[0]); + if (i.expected_result_count > 1) + ASSERT_EQ(i.output2, results[1]); + } +} + +} // namespace base
diff --git a/base/strings/string_tokenizer.h b/base/strings/string_tokenizer.h new file mode 100644 index 0000000..7ee0178 --- /dev/null +++ b/base/strings/string_tokenizer.h
@@ -0,0 +1,303 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_STRING_TOKENIZER_H_ +#define BASE_STRINGS_STRING_TOKENIZER_H_ + +#include <algorithm> +#include <string> + +#include "base/strings/string_piece.h" + +namespace gurl_base { + +// StringTokenizerT is a simple string tokenizer class. It works like an +// iterator that with each step (see the GetNext method) updates members that +// refer to the next token in the input string. The user may optionally +// configure the tokenizer to return delimiters. +// +// EXAMPLE 1: +// +// char input[] = "this is a test"; +// CStringTokenizer t(input, input + strlen(input), " "); +// while (t.GetNext()) { +// printf("%s\n", t.token().c_str()); +// } +// +// Output: +// +// this +// is +// a +// test +// +// +// EXAMPLE 2: +// +// std::string input = "no-cache=\"foo, bar\", private"; +// StringTokenizer t(input, ", "); +// t.set_quote_chars("\""); +// while (t.GetNext()) { +// printf("%s\n", t.token().c_str()); +// } +// +// Output: +// +// no-cache="foo, bar" +// private +// +// +// EXAMPLE 3: +// +// bool next_is_option = false, next_is_value = false; +// std::string input = "text/html; charset=UTF-8; foo=bar"; +// StringTokenizer t(input, "; ="); +// t.set_options(StringTokenizer::RETURN_DELIMS); +// while (t.GetNext()) { +// if (t.token_is_delim()) { +// switch (*t.token_begin()) { +// case ';': +// next_is_option = true; +// break; +// case '=': +// next_is_value = true; +// break; +// } +// } else { +// const char* label; +// if (next_is_option) { +// label = "option-name"; +// next_is_option = false; +// } else if (next_is_value) { +// label = "option-value"; +// next_is_value = false; +// } else { +// label = "mime-type"; +// } +// printf("%s: %s\n", label, t.token().c_str()); +// } +// } +// +// +template <class str, class const_iterator> +class StringTokenizerT { 
+ public: + typedef typename str::value_type char_type; + + // Options that may be passed to set_options() + enum { + // Specifies the delimiters should be returned as tokens + RETURN_DELIMS = 1 << 0, + + // Specifies that empty tokens should be returned. Treats the beginning and + // ending of the string as implicit delimiters, though doesn't return them + // as tokens if RETURN_DELIMS is also used. + RETURN_EMPTY_TOKENS = 1 << 1, + }; + + // The string object must live longer than the tokenizer. In particular, this + // should not be constructed with a temporary. The deleted rvalue constructor + // blocks the most obvious instances of this (e.g. passing a string literal to + // the constructor), but caution must still be exercised. + StringTokenizerT(const str& string, + const str& delims) { + Init(string.begin(), string.end(), delims); + } + + // Don't allow temporary strings to be used with string tokenizer, since + // Init() would otherwise save iterators to a temporary string. + StringTokenizerT(str&&, const str& delims) = delete; + + StringTokenizerT(const_iterator string_begin, + const_iterator string_end, + const str& delims) { + Init(string_begin, string_end, delims); + } + + // Set the options for this tokenizer. By default, this is 0. + void set_options(int options) { options_ = options; } + + // Set the characters to regard as quotes. By default, this is empty. When + // a quote char is encountered, the tokenizer will switch into a mode where + // it ignores delimiters that it finds. It switches out of this mode once it + // finds another instance of the quote char. If a backslash is encountered + // within a quoted string, then the next character is skipped. + void set_quote_chars(const str& quotes) { quotes_ = quotes; } + + // Call this method to advance the tokenizer to the next delimiter. This + // returns false if the tokenizer is complete. This method must be called + // before calling any of the token* methods. 
+ bool GetNext() { + if (quotes_.empty() && options_ == 0) + return QuickGetNext(); + else + return FullGetNext(); + } + + // Start iterating through tokens from the beginning of the string. + void Reset() { + token_end_ = start_pos_; + } + + // Returns true if token is a delimiter. When the tokenizer is constructed + // with the RETURN_DELIMS option, this method can be used to check if the + // returned token is actually a delimiter. Returns true before the first + // time GetNext() has been called, and after GetNext() returns false. + bool token_is_delim() const { return token_is_delim_; } + + // If GetNext() returned true, then these methods may be used to read the + // value of the token. + const_iterator token_begin() const { return token_begin_; } + const_iterator token_end() const { return token_end_; } + str token() const { return str(token_begin_, token_end_); } + BasicStringPiece<str> token_piece() const { + return BasicStringPiece<str>(&*token_begin_, + std::distance(token_begin_, token_end_)); + } + + private: + void Init(const_iterator string_begin, + const_iterator string_end, + const str& delims) { + start_pos_ = string_begin; + token_begin_ = string_begin; + token_end_ = string_begin; + end_ = string_end; + delims_ = delims; + options_ = 0; + token_is_delim_ = true; + } + + // Implementation of GetNext() for when we have no quote characters. We have + // two separate implementations because AdvanceOne() is a hot spot in large + // text files with large tokens. + bool QuickGetNext() { + token_is_delim_ = false; + for (;;) { + token_begin_ = token_end_; + if (token_end_ == end_) { + token_is_delim_ = true; + return false; + } + ++token_end_; + if (delims_.find(*token_begin_) == str::npos) + break; + // else skip over delimiter. + } + while (token_end_ != end_ && delims_.find(*token_end_) == str::npos) + ++token_end_; + return true; + } + + // Implementation of GetNext() for when we have to take quotes into account. 
+ bool FullGetNext() { + AdvanceState state; + + for (;;) { + if (token_is_delim_) { + // Last token was a delimiter. Note: This is also the case at the start. + // + // ... D T T T T D ... + // ^ ^ + // | | + // | |token_end_| : The next character to look at or |end_|. + // | + // |token_begin_| : Points to delimiter or |token_end_|. + // + // The next token is always a non-delimiting token. It could be empty, + // however. + token_is_delim_ = false; + token_begin_ = token_end_; + + // Slurp all non-delimiter characters into the token. + while (token_end_ != end_ && AdvanceOne(&state, *token_end_)) { + ++token_end_; + } + + // If it's non-empty, or empty tokens were requested, return the token. + if (token_begin_ != token_end_ || (options_ & RETURN_EMPTY_TOKENS)) + return true; + } + + GURL_DCHECK(!token_is_delim_); + // Last token was a regular token. + // + // ... T T T D T T ... + // ^ ^ + // | | + // | token_end_ : The next character to look at. Always one + // | char beyond the token boundary. + // | + // token_begin_ : Points to beginning of token. Note: token could + // be empty, in which case + // token_begin_ == token_end_. + // + // The next token is always a delimiter. It could be |end_| however, but + // |end_| is also an implicit delimiter. + token_is_delim_ = true; + token_begin_ = token_end_; + + if (token_end_ == end_) + return false; + + // Look at the delimiter. + ++token_end_; + if (options_ & RETURN_DELIMS) + return true; + } + + return false; + } + + bool IsDelim(char_type c) const { + return delims_.find(c) != str::npos; + } + + bool IsQuote(char_type c) const { + return quotes_.find(c) != str::npos; + } + + struct AdvanceState { + bool in_quote; + bool in_escape; + char_type quote_char; + AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {} + }; + + // Returns true if a delimiter was not hit. 
+ bool AdvanceOne(AdvanceState* state, char_type c) { + if (state->in_quote) { + if (state->in_escape) { + state->in_escape = false; + } else if (c == '\\') { + state->in_escape = true; + } else if (c == state->quote_char) { + state->in_quote = false; + } + } else { + if (IsDelim(c)) + return false; + state->in_quote = IsQuote(state->quote_char = c); + } + return true; + } + + const_iterator start_pos_; + const_iterator token_begin_; + const_iterator token_end_; + const_iterator end_; + str delims_; + str quotes_; + int options_; + bool token_is_delim_; +}; + +typedef StringTokenizerT<std::string, std::string::const_iterator> + StringTokenizer; +typedef StringTokenizerT<string16, string16::const_iterator> String16Tokenizer; +typedef StringTokenizerT<std::string, const char*> CStringTokenizer; + +} // namespace gurl_base + +#endif // BASE_STRINGS_STRING_TOKENIZER_H_
diff --git a/base/strings/string_tokenizer_fuzzer.cc b/base/strings/string_tokenizer_fuzzer.cc new file mode 100644 index 0000000..3aaee7b --- /dev/null +++ b/base/strings/string_tokenizer_fuzzer.cc
@@ -0,0 +1,59 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <stddef.h> +#include <stdint.h> + +#include <string> + +#include "base/strings/string_tokenizer.h" + +void GetAllTokens(gurl_base::StringTokenizer& t) { + while (t.GetNext()) { + (void)t.token(); + } +} + +// Entry point for LibFuzzer. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + uint8_t size_t_bytes = sizeof(size_t); + if (size < size_t_bytes + 1) { + return 0; + } + + // Calculate pattern size based on remaining bytes, otherwise fuzzing is + // inefficient with bailouts in most cases. + size_t pattern_size = + *reinterpret_cast<const size_t*>(data) % (size - size_t_bytes); + + std::string pattern(reinterpret_cast<const char*>(data + size_t_bytes), + pattern_size); + std::string input( + reinterpret_cast<const char*>(data + size_t_bytes + pattern_size), + size - pattern_size - size_t_bytes); + + // Allow quote_chars and options to be set. Otherwise full coverage + // won't be possible since IsQuote, FullGetNext and other functions + // won't be called. + for (bool return_delims : {false, true}) { + for (bool return_empty_strings : {false, true}) { + int options = 0; + if (return_delims) + options |= gurl_base::StringTokenizer::RETURN_DELIMS; + if (return_empty_strings) + options |= gurl_base::StringTokenizer::RETURN_EMPTY_TOKENS; + + gurl_base::StringTokenizer t(input, pattern); + t.set_options(options); + GetAllTokens(t); + + gurl_base::StringTokenizer t_quote(input, pattern); + t_quote.set_quote_chars("\""); + t_quote.set_options(options); + GetAllTokens(t_quote); + } + } + + return 0; +}
diff --git a/base/strings/string_tokenizer_unittest.cc b/base/strings/string_tokenizer_unittest.cc new file mode 100644 index 0000000..1665d5d --- /dev/null +++ b/base/strings/string_tokenizer_unittest.cc
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/string_tokenizer.h"

#include "testing/gtest/include/gtest/gtest.h"

using std::string;

namespace gurl_base {

namespace {

// Splits on a single space with default options and checks both the tokens
// and the token_is_delim() state before, during and after iteration.
TEST(StringTokenizerTest, Simple) {
  string input = "this is a test";
  StringTokenizer t(input, " ");
  // The start of string, before returning any tokens, is considered a
  // delimiter.
  EXPECT_TRUE(t.token_is_delim());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("this", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("is", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("a", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("test", t.token());

  EXPECT_FALSE(t.GetNext());
  // The end of string, after the last token, is considered a delimiter.
  EXPECT_TRUE(t.token_is_delim());
}

// Runs the same sequence as Simple twice, calling Reset() in between, to
// check that Reset() restores the initial state.
TEST(StringTokenizerTest, Reset) {
  string input = "this is a test";
  StringTokenizer t(input, " ");

  for (int i = 0; i < 2; ++i) {
    EXPECT_TRUE(t.token_is_delim());

    EXPECT_TRUE(t.GetNext());
    EXPECT_FALSE(t.token_is_delim());
    EXPECT_EQ("this", t.token());

    EXPECT_TRUE(t.GetNext());
    EXPECT_FALSE(t.token_is_delim());
    EXPECT_EQ("is", t.token());

    EXPECT_TRUE(t.GetNext());
    EXPECT_FALSE(t.token_is_delim());
    EXPECT_EQ("a", t.token());

    EXPECT_TRUE(t.GetNext());
    EXPECT_FALSE(t.token_is_delim());
    EXPECT_EQ("test", t.token());

    EXPECT_FALSE(t.GetNext());
    EXPECT_TRUE(t.token_is_delim());

    t.Reset();
  }
}

// With RETURN_DELIMS, each space is reported as its own token between words.
TEST(StringTokenizerTest, RetDelims) {
  string input = "this is a test";
  StringTokenizer t(input, " ");
  t.set_options(StringTokenizer::RETURN_DELIMS);
  EXPECT_TRUE(t.token_is_delim());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("this", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
  EXPECT_EQ(" ", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("is", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
  EXPECT_EQ(" ", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("a", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
  EXPECT_EQ(" ", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("test", t.token());

  EXPECT_FALSE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
}

// With RETURN_EMPTY_TOKENS, adjacent delimiters yield empty tokens; the comma
// inside the quoted section does not split.
TEST(StringTokenizerTest, RetEmptyTokens) {
  string input = "foo='a, b',,bar,,baz,quux";
  StringTokenizer t(input, ",");
  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
  t.set_quote_chars("'");

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("foo='a, b'", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("bar", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("baz", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("quux", t.token());

  EXPECT_FALSE(t.GetNext());
}

// A leading delimiter produces an empty first token.
TEST(StringTokenizerTest, RetEmptyTokens_AtStart) {
  string input = ",bar";
  StringTokenizer t(input, ",");
  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
  t.set_quote_chars("'");

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("bar", t.token());

  EXPECT_FALSE(t.GetNext());
}

// A trailing delimiter produces an empty last token.
TEST(StringTokenizerTest, RetEmptyTokens_AtEnd) {
  string input = "bar,";
  StringTokenizer t(input, ",");
  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
  t.set_quote_chars("'");

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("bar", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("", t.token());

  EXPECT_FALSE(t.GetNext());
}

// A lone delimiter produces an empty token on each side of it.
TEST(StringTokenizerTest, RetEmptyTokens_Both) {
  string input = ",";
  StringTokenizer t(input, ",");
  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
  t.set_quote_chars("'");

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("", t.token());

  EXPECT_FALSE(t.GetNext());
}

// Empty input still produces exactly one empty token.
TEST(StringTokenizerTest, RetEmptyTokens_Empty) {
  string input = "";
  StringTokenizer t(input, ",");
  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("", t.token());

  EXPECT_FALSE(t.GetNext());
}

// Both options together: delimiters and empty tokens are interleaved.
TEST(StringTokenizerTest, RetDelimsAndEmptyTokens) {
  string input = "foo='a, b',,bar,,baz,quux";
  StringTokenizer t(input, ",");
  t.set_options(StringTokenizer::RETURN_DELIMS |
                StringTokenizer::RETURN_EMPTY_TOKENS);
  t.set_quote_chars("'");

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("foo='a, b'", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ(",", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ(",", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("bar", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ(",", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ(",", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("baz", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ(",", t.token());

  ASSERT_TRUE(t.GetNext());
  EXPECT_EQ("quux", t.token());

  EXPECT_FALSE(t.GetNext());
}

// Any character of the delimiter set splits; runs of delimiters collapse.
TEST(StringTokenizerTest, ManyDelims) {
  string input = "this: is, a-test";
  StringTokenizer t(input, ": ,-");

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("this", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("is", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("a", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("test", t.token());

  EXPECT_FALSE(t.GetNext());
}

// Header-like input with RETURN_DELIMS: every delimiter character is returned
// individually, in order, between the regular tokens.
TEST(StringTokenizerTest, ParseHeader) {
  string input = "Content-Type: text/html ; charset=UTF-8";
  StringTokenizer t(input, ": ;=");
  t.set_options(StringTokenizer::RETURN_DELIMS);
  EXPECT_TRUE(t.token_is_delim());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("Content-Type", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
  EXPECT_EQ(":", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
  EXPECT_EQ(" ", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("text/html", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
  EXPECT_EQ(" ", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
  EXPECT_EQ(";", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
  EXPECT_EQ(" ", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("charset", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
  EXPECT_EQ("=", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_FALSE(t.token_is_delim());
  EXPECT_EQ("UTF-8", t.token());

  EXPECT_FALSE(t.GetNext());
  EXPECT_TRUE(t.token_is_delim());
}

// A quoted section containing the delimiter stays one token, quotes included.
TEST(StringTokenizerTest, ParseQuotedString) {
  string input = "foo bar 'hello world' baz";
  StringTokenizer t(input, " ");
  t.set_quote_chars("'");

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("foo", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("bar", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("'hello world'", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("baz", t.token());

  EXPECT_FALSE(t.GetNext());
}

// An unterminated quote extends the token to the end of the input.
TEST(StringTokenizerTest, ParseQuotedString_Malformed) {
  string input = "bar 'hello wo";
  StringTokenizer t(input, " ");
  t.set_quote_chars("'");

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("bar", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("'hello wo", t.token());

  EXPECT_FALSE(t.GetNext());
}

// With several quote characters, only the character that opened a quoted
// section closes it; the other quote character is ordinary text inside it.
TEST(StringTokenizerTest, ParseQuotedString_Multiple) {
  string input = "bar 'hel\"lo\" wo' baz\"";
  StringTokenizer t(input, " ");
  t.set_quote_chars("'\"");

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("bar", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("'hel\"lo\" wo'", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("baz\"", t.token());

  EXPECT_FALSE(t.GetNext());
}

// A backslash-escaped quote inside a quoted section does not end it.
TEST(StringTokenizerTest, ParseQuotedString_EscapedQuotes) {
  string input = "foo 'don\\'t do that'";
  StringTokenizer t(input, " ");
  t.set_quote_chars("'");

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("foo", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("'don\\'t do that'", t.token());

  EXPECT_FALSE(t.GetNext());
}

// A quote may open in the middle of a token; delimiters inside it are kept.
TEST(StringTokenizerTest, ParseQuotedString_EscapedQuotes2) {
  string input = "foo='a, b', bar";
  StringTokenizer t(input, ", ");
  t.set_quote_chars("'");

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("foo='a, b'", t.token());

  EXPECT_TRUE(t.GetNext());
  EXPECT_EQ("bar", t.token());

  EXPECT_FALSE(t.GetNext());
}

}  // namespace

}  // namespace gurl_base
diff --git a/base/strings/string_util.cc b/base/strings/string_util.cc new file mode 100644 index 0000000..2b2591d --- /dev/null +++ b/base/strings/string_util.cc
@@ -0,0 +1,1119 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_util.h" + +#include <ctype.h> +#include <errno.h> +#include <math.h> +#include <stdarg.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <wchar.h> +#include <wctype.h> + +#include <algorithm> +#include <limits> +#include <vector> + +#include "polyfills/base/logging.h" +#include "base/no_destructor.h" +#include "base/stl_util.h" +#include "base/strings/utf_string_conversion_utils.h" +#include "base/strings/utf_string_conversions.h" +#include "base/third_party/icu/icu_utf.h" +#include "build/build_config.h" + +namespace gurl_base { + +namespace { + +// Used by ReplaceStringPlaceholders to track the position in the string of +// replaced parameters. +struct ReplacementOffset { + ReplacementOffset(uintptr_t parameter, size_t offset) + : parameter(parameter), + offset(offset) {} + + // Index of the parameter. + uintptr_t parameter; + + // Starting position in the string. + size_t offset; +}; + +static bool CompareParameter(const ReplacementOffset& elem1, + const ReplacementOffset& elem2) { + return elem1.parameter < elem2.parameter; +} + +// Overloaded function to append one string onto the end of another. Having a +// separate overload for |source| as both string and StringPiece allows for more +// efficient usage from functions templated to work with either type (avoiding a +// redundant call to the BasicStringPiece constructor in both cases). 
+template <typename string_type> +inline void AppendToString(string_type* target, const string_type& source) { + target->append(source); +} + +template <typename string_type> +inline void AppendToString(string_type* target, + const BasicStringPiece<string_type>& source) { + source.AppendToString(target); +} + +// Assuming that a pointer is the size of a "machine word", then +// uintptr_t is an integer type that is also a machine word. +using MachineWord = uintptr_t; + +inline bool IsMachineWordAligned(const void* pointer) { + return !(reinterpret_cast<MachineWord>(pointer) & (sizeof(MachineWord) - 1)); +} + +template <typename CharacterType> +struct NonASCIIMask; +template <> +struct NonASCIIMask<char> { + static constexpr MachineWord value() { + return static_cast<MachineWord>(0x8080808080808080ULL); + } +}; +template <> +struct NonASCIIMask<char16> { + static constexpr MachineWord value() { + return static_cast<MachineWord>(0xFF80FF80FF80FF80ULL); + } +}; +#if defined(WCHAR_T_IS_UTF32) +template <> +struct NonASCIIMask<wchar_t> { + static constexpr MachineWord value() { + return static_cast<MachineWord>(0xFFFFFF80FFFFFF80ULL); + } +}; +#endif // WCHAR_T_IS_UTF32 + +} // namespace + +bool IsWprintfFormatPortable(const wchar_t* format) { + for (const wchar_t* position = format; *position != '\0'; ++position) { + if (*position == '%') { + bool in_specification = true; + bool modifier_l = false; + while (in_specification) { + // Eat up characters until reaching a known specifier. + if (*++position == '\0') { + // The format string ended in the middle of a specification. Call + // it portable because no unportable specifications were found. The + // string is equally broken on all platforms. + return true; + } + + if (*position == 'l') { + // 'l' is the only thing that can save the 's' and 'c' specifiers. 
+ modifier_l = true; + } else if (((*position == 's' || *position == 'c') && !modifier_l) || + *position == 'S' || *position == 'C' || *position == 'F' || + *position == 'D' || *position == 'O' || *position == 'U') { + // Not portable. + return false; + } + + if (wcschr(L"diouxXeEfgGaAcspn%", *position)) { + // Portable, keep scanning the rest of the format string. + in_specification = false; + } + } + } + } + + return true; +} + +namespace { + +template<typename StringType> +StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) { + StringType ret; + ret.reserve(str.size()); + for (size_t i = 0; i < str.size(); i++) + ret.push_back(ToLowerASCII(str[i])); + return ret; +} + +template<typename StringType> +StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) { + StringType ret; + ret.reserve(str.size()); + for (size_t i = 0; i < str.size(); i++) + ret.push_back(ToUpperASCII(str[i])); + return ret; +} + +} // namespace + +std::string ToLowerASCII(StringPiece str) { + return ToLowerASCIIImpl<std::string>(str); +} + +string16 ToLowerASCII(StringPiece16 str) { + return ToLowerASCIIImpl<string16>(str); +} + +std::string ToUpperASCII(StringPiece str) { + return ToUpperASCIIImpl<std::string>(str); +} + +string16 ToUpperASCII(StringPiece16 str) { + return ToUpperASCIIImpl<string16>(str); +} + +template<class StringType> +int CompareCaseInsensitiveASCIIT(BasicStringPiece<StringType> a, + BasicStringPiece<StringType> b) { + // Find the first characters that aren't equal and compare them. If the end + // of one of the strings is found before a nonequal character, the lengths + // of the strings are compared. + size_t i = 0; + while (i < a.length() && i < b.length()) { + typename StringType::value_type lower_a = ToLowerASCII(a[i]); + typename StringType::value_type lower_b = ToLowerASCII(b[i]); + if (lower_a < lower_b) + return -1; + if (lower_a > lower_b) + return 1; + i++; + } + + // End of one string hit before finding a different character. 
Expect the + // common case to be "strings equal" at this point so check that first. + if (a.length() == b.length()) + return 0; + + if (a.length() < b.length()) + return -1; + return 1; +} + +int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b) { + return CompareCaseInsensitiveASCIIT<std::string>(a, b); +} + +int CompareCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b) { + return CompareCaseInsensitiveASCIIT<string16>(a, b); +} + +bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b) { + if (a.length() != b.length()) + return false; + return CompareCaseInsensitiveASCIIT<std::string>(a, b) == 0; +} + +bool EqualsCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b) { + if (a.length() != b.length()) + return false; + return CompareCaseInsensitiveASCIIT<string16>(a, b) == 0; +} + +const std::string& EmptyString() { + static const gurl_base::NoDestructor<std::string> s; + return *s; +} + +const string16& EmptyString16() { + static const gurl_base::NoDestructor<string16> s16; + return *s16; +} + +template <class StringType> +bool ReplaceCharsT(const StringType& input, + BasicStringPiece<StringType> find_any_of_these, + BasicStringPiece<StringType> replace_with, + StringType* output); + +bool ReplaceChars(const string16& input, + StringPiece16 replace_chars, + const string16& replace_with, + string16* output) { + return ReplaceCharsT(input, replace_chars, StringPiece16(replace_with), + output); +} + +bool ReplaceChars(const std::string& input, + StringPiece replace_chars, + const std::string& replace_with, + std::string* output) { + return ReplaceCharsT(input, replace_chars, StringPiece(replace_with), output); +} + +bool RemoveChars(const string16& input, + StringPiece16 remove_chars, + string16* output) { + return ReplaceCharsT(input, remove_chars, StringPiece16(), output); +} + +bool RemoveChars(const std::string& input, + StringPiece remove_chars, + std::string* output) { + return ReplaceCharsT(input, remove_chars, StringPiece(), output); +} + 
+template<typename Str> +TrimPositions TrimStringT(const Str& input, + BasicStringPiece<Str> trim_chars, + TrimPositions positions, + Str* output) { + // Find the edges of leading/trailing whitespace as desired. Need to use + // a StringPiece version of input to be able to call find* on it with the + // StringPiece version of trim_chars (normally the trim_chars will be a + // constant so avoid making a copy). + BasicStringPiece<Str> input_piece(input); + const size_t last_char = input.length() - 1; + const size_t first_good_char = (positions & TRIM_LEADING) ? + input_piece.find_first_not_of(trim_chars) : 0; + const size_t last_good_char = (positions & TRIM_TRAILING) ? + input_piece.find_last_not_of(trim_chars) : last_char; + + // When the string was all trimmed, report that we stripped off characters + // from whichever position the caller was interested in. For empty input, we + // stripped no characters, but we still need to clear |output|. + if (input.empty() || + (first_good_char == Str::npos) || (last_good_char == Str::npos)) { + bool input_was_empty = input.empty(); // in case output == &input + output->clear(); + return input_was_empty ? TRIM_NONE : positions; + } + + // Trim. + *output = + input.substr(first_good_char, last_good_char - first_good_char + 1); + + // Return where we trimmed from. + return static_cast<TrimPositions>( + ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) | + ((last_good_char == last_char) ? 
TRIM_NONE : TRIM_TRAILING)); +} + +bool TrimString(const string16& input, + StringPiece16 trim_chars, + string16* output) { + return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; +} + +bool TrimString(const std::string& input, + StringPiece trim_chars, + std::string* output) { + return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; +} + +template<typename Str> +BasicStringPiece<Str> TrimStringPieceT(BasicStringPiece<Str> input, + BasicStringPiece<Str> trim_chars, + TrimPositions positions) { + size_t begin = (positions & TRIM_LEADING) ? + input.find_first_not_of(trim_chars) : 0; + size_t end = (positions & TRIM_TRAILING) ? + input.find_last_not_of(trim_chars) + 1 : input.size(); + return input.substr(begin, end - begin); +} + +StringPiece16 TrimString(StringPiece16 input, + StringPiece16 trim_chars, + TrimPositions positions) { + return TrimStringPieceT(input, trim_chars, positions); +} + +StringPiece TrimString(StringPiece input, + StringPiece trim_chars, + TrimPositions positions) { + return TrimStringPieceT(input, trim_chars, positions); +} + +void TruncateUTF8ToByteSize(const std::string& input, + const size_t byte_size, + std::string* output) { + GURL_DCHECK(output); + if (byte_size > input.length()) { + *output = input; + return; + } + GURL_DCHECK_LE(byte_size, + static_cast<uint32_t>(std::numeric_limits<int32_t>::max())); + // Note: This cast is necessary because CBU8_NEXT uses int32_ts. + int32_t truncation_length = static_cast<int32_t>(byte_size); + int32_t char_index = truncation_length - 1; + const char* data = input.data(); + + // Using CBU8, we will move backwards from the truncation point + // to the beginning of the string looking for a valid UTF8 + // character. Once a full UTF8 character is found, we will + // truncate the string to the end of that character. 
+ while (char_index >= 0) { + int32_t prev = char_index; + base_icu::UChar32 code_point = 0; + CBU8_NEXT(data, char_index, truncation_length, code_point); + if (!IsValidCharacter(code_point) || + !IsValidCodepoint(code_point)) { + char_index = prev - 1; + } else { + break; + } + } + + if (char_index >= 0 ) + *output = input.substr(0, char_index); + else + output->clear(); +} + +TrimPositions TrimWhitespace(const string16& input, + TrimPositions positions, + string16* output) { + return TrimStringT(input, StringPiece16(kWhitespaceUTF16), positions, output); +} + +StringPiece16 TrimWhitespace(StringPiece16 input, + TrimPositions positions) { + return TrimStringPieceT(input, StringPiece16(kWhitespaceUTF16), positions); +} + +TrimPositions TrimWhitespaceASCII(const std::string& input, + TrimPositions positions, + std::string* output) { + return TrimStringT(input, StringPiece(kWhitespaceASCII), positions, output); +} + +StringPiece TrimWhitespaceASCII(StringPiece input, TrimPositions positions) { + return TrimStringPieceT(input, StringPiece(kWhitespaceASCII), positions); +} + +template<typename STR> +STR CollapseWhitespaceT(const STR& text, + bool trim_sequences_with_line_breaks) { + STR result; + result.resize(text.size()); + + // Set flags to pretend we're already in a trimmed whitespace sequence, so we + // will trim any leading whitespace. + bool in_whitespace = true; + bool already_trimmed = true; + + int chars_written = 0; + for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) { + if (IsUnicodeWhitespace(*i)) { + if (!in_whitespace) { + // Reduce all whitespace sequences to a single space. + in_whitespace = true; + result[chars_written++] = L' '; + } + if (trim_sequences_with_line_breaks && !already_trimmed && + ((*i == '\n') || (*i == '\r'))) { + // Whitespace sequences containing CR or LF are eliminated entirely. + already_trimmed = true; + --chars_written; + } + } else { + // Non-whitespace chracters are copied straight across. 
+ in_whitespace = false; + already_trimmed = false; + result[chars_written++] = *i; + } + } + + if (in_whitespace && !already_trimmed) { + // Any trailing whitespace is eliminated. + --chars_written; + } + + result.resize(chars_written); + return result; +} + +string16 CollapseWhitespace(const string16& text, + bool trim_sequences_with_line_breaks) { + return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); +} + +std::string CollapseWhitespaceASCII(const std::string& text, + bool trim_sequences_with_line_breaks) { + return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); +} + +bool ContainsOnlyChars(StringPiece input, StringPiece characters) { + return input.find_first_not_of(characters) == StringPiece::npos; +} + +bool ContainsOnlyChars(StringPiece16 input, StringPiece16 characters) { + return input.find_first_not_of(characters) == StringPiece16::npos; +} + +template <class Char> +inline bool DoIsStringASCII(const Char* characters, size_t length) { + if (!length) + return true; + constexpr MachineWord non_ascii_bit_mask = NonASCIIMask<Char>::value(); + MachineWord all_char_bits = 0; + const Char* end = characters + length; + + // Prologue: align the input. + while (!IsMachineWordAligned(characters) && characters < end) + all_char_bits |= *characters++; + if (all_char_bits & non_ascii_bit_mask) + return false; + + // Compare the values of CPU word size. + constexpr size_t chars_per_word = sizeof(MachineWord) / sizeof(Char); + constexpr int batch_count = 16; + while (characters <= end - batch_count * chars_per_word) { + all_char_bits = 0; + for (int i = 0; i < batch_count; ++i) { + all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters)); + characters += chars_per_word; + } + if (all_char_bits & non_ascii_bit_mask) + return false; + } + + // Process the remaining words. 
+ all_char_bits = 0; + while (characters <= end - chars_per_word) { + all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters)); + characters += chars_per_word; + } + + // Process the remaining bytes. + while (characters < end) + all_char_bits |= *characters++; + + return !(all_char_bits & non_ascii_bit_mask); +} + +bool IsStringASCII(StringPiece str) { + return DoIsStringASCII(str.data(), str.length()); +} + +bool IsStringASCII(StringPiece16 str) { + return DoIsStringASCII(str.data(), str.length()); +} + +#if defined(WCHAR_T_IS_UTF32) +bool IsStringASCII(WStringPiece str) { + return DoIsStringASCII(str.data(), str.length()); +} +#endif + +bool IsStringUTF8(StringPiece str) { + const char *src = str.data(); + int32_t src_len = static_cast<int32_t>(str.length()); + int32_t char_index = 0; + + while (char_index < src_len) { + int32_t code_point; + CBU8_NEXT(src, char_index, src_len, code_point); + if (!IsValidCharacter(code_point)) + return false; + } + return true; +} + +// Implementation note: Normally this function will be called with a hardcoded +// constant for the lowercase_ascii parameter. Constructing a StringPiece from +// a C constant requires running strlen, so the result will be two passes +// through the buffers, one to file the length of lowercase_ascii, and one to +// compare each letter. +// +// This function could have taken a const char* to avoid this and only do one +// pass through the string. But the strlen is faster than the case-insensitive +// compares and lets us early-exit in the case that the strings are different +// lengths (will often be the case for non-matches). So whether one approach or +// the other will be faster depends on the case. +// +// The hardcoded strings are typically very short so it doesn't matter, and the +// string piece gives additional flexibility for the caller (doesn't have to be +// null terminated) so we choose the StringPiece route. 
+template<typename Str> +static inline bool DoLowerCaseEqualsASCII(BasicStringPiece<Str> str, + StringPiece lowercase_ascii) { + if (str.size() != lowercase_ascii.size()) + return false; + for (size_t i = 0; i < str.size(); i++) { + if (ToLowerASCII(str[i]) != lowercase_ascii[i]) + return false; + } + return true; +} + +bool LowerCaseEqualsASCII(StringPiece str, StringPiece lowercase_ascii) { + return DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii); +} + +bool LowerCaseEqualsASCII(StringPiece16 str, StringPiece lowercase_ascii) { + return DoLowerCaseEqualsASCII<string16>(str, lowercase_ascii); +} + +bool EqualsASCII(StringPiece16 str, StringPiece ascii) { + if (str.length() != ascii.length()) + return false; + return std::equal(ascii.begin(), ascii.end(), str.begin()); +} + +template<typename Str> +bool StartsWithT(BasicStringPiece<Str> str, + BasicStringPiece<Str> search_for, + CompareCase case_sensitivity) { + if (search_for.size() > str.size()) + return false; + + BasicStringPiece<Str> source = str.substr(0, search_for.size()); + + switch (case_sensitivity) { + case CompareCase::SENSITIVE: + return source == search_for; + + case CompareCase::INSENSITIVE_ASCII: + return std::equal( + search_for.begin(), search_for.end(), + source.begin(), + CaseInsensitiveCompareASCII<typename Str::value_type>()); + + default: + GURL_NOTREACHED(); + return false; + } +} + +bool StartsWith(StringPiece str, + StringPiece search_for, + CompareCase case_sensitivity) { + return StartsWithT<std::string>(str, search_for, case_sensitivity); +} + +bool StartsWith(StringPiece16 str, + StringPiece16 search_for, + CompareCase case_sensitivity) { + return StartsWithT<string16>(str, search_for, case_sensitivity); +} + +template <typename Str> +bool EndsWithT(BasicStringPiece<Str> str, + BasicStringPiece<Str> search_for, + CompareCase case_sensitivity) { + if (search_for.size() > str.size()) + return false; + + BasicStringPiece<Str> source = str.substr(str.size() - search_for.size(), 
+ search_for.size()); + + switch (case_sensitivity) { + case CompareCase::SENSITIVE: + return source == search_for; + + case CompareCase::INSENSITIVE_ASCII: + return std::equal( + source.begin(), source.end(), + search_for.begin(), + CaseInsensitiveCompareASCII<typename Str::value_type>()); + + default: + GURL_NOTREACHED(); + return false; + } +} + +bool EndsWith(StringPiece str, + StringPiece search_for, + CompareCase case_sensitivity) { + return EndsWithT<std::string>(str, search_for, case_sensitivity); +} + +bool EndsWith(StringPiece16 str, + StringPiece16 search_for, + CompareCase case_sensitivity) { + return EndsWithT<string16>(str, search_for, case_sensitivity); +} + +char HexDigitToInt(wchar_t c) { + GURL_DCHECK(IsHexDigit(c)); + if (c >= '0' && c <= '9') + return static_cast<char>(c - '0'); + if (c >= 'A' && c <= 'F') + return static_cast<char>(c - 'A' + 10); + if (c >= 'a' && c <= 'f') + return static_cast<char>(c - 'a' + 10); + return 0; +} + +bool IsUnicodeWhitespace(wchar_t c) { + // kWhitespaceWide is a NULL-terminated string + for (const wchar_t* cur = kWhitespaceWide; *cur; ++cur) { + if (*cur == c) + return true; + } + return false; +} + +static const char* const kByteStringsUnlocalized[] = { + " B", + " kB", + " MB", + " GB", + " TB", + " PB" +}; + +string16 FormatBytesUnlocalized(int64_t bytes) { + double unit_amount = static_cast<double>(bytes); + size_t dimension = 0; + const int kKilo = 1024; + while (unit_amount >= kKilo && + dimension < gurl_base::size(kByteStringsUnlocalized) - 1) { + unit_amount /= kKilo; + dimension++; + } + + char buf[64]; + if (bytes != 0 && dimension > 0 && unit_amount < 100) { + gurl_base::snprintf(buf, gurl_base::size(buf), "%.1lf%s", unit_amount, + kByteStringsUnlocalized[dimension]); + } else { + gurl_base::snprintf(buf, gurl_base::size(buf), "%.0lf%s", unit_amount, + kByteStringsUnlocalized[dimension]); + } + + return ASCIIToUTF16(buf); +} + +// A Matcher for DoReplaceMatchesAfterOffset() that matches substrings. 
+template <class StringType> +struct SubstringMatcher { + BasicStringPiece<StringType> find_this; + + size_t Find(const StringType& input, size_t pos) { + return input.find(find_this.data(), pos, find_this.length()); + } + size_t MatchSize() { return find_this.length(); } +}; + +// A Matcher for DoReplaceMatchesAfterOffset() that matches single characters. +template <class StringType> +struct CharacterMatcher { + BasicStringPiece<StringType> find_any_of_these; + + size_t Find(const StringType& input, size_t pos) { + return input.find_first_of(find_any_of_these.data(), pos, + find_any_of_these.length()); + } + constexpr size_t MatchSize() { return 1; } +}; + +enum class ReplaceType { REPLACE_ALL, REPLACE_FIRST }; + +// Runs in O(n) time in the length of |str|, and transforms the string without +// reallocating when possible. Returns |true| if any matches were found. +// +// This is parameterized on a |Matcher| traits type, so that it can be the +// implementation for both ReplaceChars() and ReplaceSubstringsAfterOffset(). +template <class StringType, class Matcher> +bool DoReplaceMatchesAfterOffset(StringType* str, + size_t initial_offset, + Matcher matcher, + BasicStringPiece<StringType> replace_with, + ReplaceType replace_type) { + using CharTraits = typename StringType::traits_type; + + const size_t find_length = matcher.MatchSize(); + if (!find_length) + return false; + + // If the find string doesn't appear, there's nothing to do. + size_t first_match = matcher.Find(*str, initial_offset); + if (first_match == StringType::npos) + return false; + + // If we're only replacing one instance, there's no need to do anything + // complicated. 
+ const size_t replace_length = replace_with.length(); + if (replace_type == ReplaceType::REPLACE_FIRST) { + str->replace(first_match, find_length, replace_with.data(), replace_length); + return true; + } + + // If the find and replace strings are the same length, we can simply use + // replace() on each instance, and finish the entire operation in O(n) time. + if (find_length == replace_length) { + auto* buffer = &((*str)[0]); + for (size_t offset = first_match; offset != StringType::npos; + offset = matcher.Find(*str, offset + replace_length)) { + CharTraits::copy(buffer + offset, replace_with.data(), replace_length); + } + return true; + } + + // Since the find and replace strings aren't the same length, a loop like the + // one above would be O(n^2) in the worst case, as replace() will shift the + // entire remaining string each time. We need to be more clever to keep things + // O(n). + // + // When the string is being shortened, it's possible to just shift the matches + // down in one pass while finding, and truncate the length at the end of the + // search. + // + // If the string is being lengthened, more work is required. The strategy used + // here is to make two find() passes through the string. The first pass counts + // the number of matches to determine the new size. The second pass will + // either construct the new string into a new buffer (if the existing buffer + // lacked capacity), or else -- if there is room -- create a region of scratch + // space after |first_match| by shifting the tail of the string to a higher + // index, and doing in-place moves from the tail to lower indices thereafter. + size_t str_length = str->length(); + size_t expansion = 0; + if (replace_length > find_length) { + // This operation lengthens the string; determine the new length by counting + // matches. 
+ const size_t expansion_per_match = (replace_length - find_length); + size_t num_matches = 0; + for (size_t match = first_match; match != StringType::npos; + match = matcher.Find(*str, match + find_length)) { + expansion += expansion_per_match; + ++num_matches; + } + const size_t final_length = str_length + expansion; + + if (str->capacity() < final_length) { + // If we'd have to allocate a new buffer to grow the string, build the + // result directly into the new allocation via append(). + StringType src(str->get_allocator()); + str->swap(src); + str->reserve(final_length); + + size_t pos = 0; + for (size_t match = first_match;; match = matcher.Find(src, pos)) { + str->append(src, pos, match - pos); + str->append(replace_with.data(), replace_length); + pos = match + find_length; + + // A mid-loop test/break enables skipping the final Find() call; the + // number of matches is known, so don't search past the last one. + if (!--num_matches) + break; + } + + // Handle substring after the final match. + str->append(src, pos, str_length - pos); + return true; + } + + // Prepare for the copy/move loop below -- expand the string to its final + // size by shifting the data after the first match to the end of the resized + // string. + size_t shift_src = first_match + find_length; + size_t shift_dst = shift_src + expansion; + + // Big |expansion| factors (relative to |str_length|) require padding up to + // |shift_dst|. + if (shift_dst > str_length) + str->resize(shift_dst); + + str->replace(shift_dst, str_length - shift_src, *str, shift_src, + str_length - shift_src); + str_length = final_length; + } + + // We can alternate replacement and move operations. This won't overwrite the + // unsearched region of the string so long as |write_offset| <= |read_offset|; + // that condition is always satisfied because: + // + // (a) If the string is being shortened, |expansion| is zero and + // |write_offset| grows slower than |read_offset|. 
+ // + // (b) If the string is being lengthened, |write_offset| grows faster than + // |read_offset|, but |expansion| is big enough so that |write_offset| + // will only catch up to |read_offset| at the point of the last match. + auto* buffer = &((*str)[0]); + size_t write_offset = first_match; + size_t read_offset = first_match + expansion; + do { + if (replace_length) { + CharTraits::copy(buffer + write_offset, replace_with.data(), + replace_length); + write_offset += replace_length; + } + read_offset += find_length; + + // min() clamps StringType::npos (the largest unsigned value) to str_length. + size_t match = std::min(matcher.Find(*str, read_offset), str_length); + + size_t length = match - read_offset; + if (length) { + CharTraits::move(buffer + write_offset, buffer + read_offset, length); + write_offset += length; + read_offset += length; + } + } while (read_offset < str_length); + + // If we're shortening the string, truncate it now. + str->resize(write_offset); + return true; +} + +template <class StringType> +bool ReplaceCharsT(const StringType& input, + BasicStringPiece<StringType> find_any_of_these, + BasicStringPiece<StringType> replace_with, + StringType* output) { + // Commonly, this is called with output and input being the same string; in + // that case, this assignment is inexpensive. 
+ *output = input; + + return DoReplaceMatchesAfterOffset( + output, 0, CharacterMatcher<StringType>{find_any_of_these}, replace_with, + ReplaceType::REPLACE_ALL); +} + +void ReplaceFirstSubstringAfterOffset(string16* str, + size_t start_offset, + StringPiece16 find_this, + StringPiece16 replace_with) { + DoReplaceMatchesAfterOffset(str, start_offset, + SubstringMatcher<string16>{find_this}, + replace_with, ReplaceType::REPLACE_FIRST); +} + +void ReplaceFirstSubstringAfterOffset(std::string* str, + size_t start_offset, + StringPiece find_this, + StringPiece replace_with) { + DoReplaceMatchesAfterOffset(str, start_offset, + SubstringMatcher<std::string>{find_this}, + replace_with, ReplaceType::REPLACE_FIRST); +} + +void ReplaceSubstringsAfterOffset(string16* str, + size_t start_offset, + StringPiece16 find_this, + StringPiece16 replace_with) { + DoReplaceMatchesAfterOffset(str, start_offset, + SubstringMatcher<string16>{find_this}, + replace_with, ReplaceType::REPLACE_ALL); +} + +void ReplaceSubstringsAfterOffset(std::string* str, + size_t start_offset, + StringPiece find_this, + StringPiece replace_with) { + DoReplaceMatchesAfterOffset(str, start_offset, + SubstringMatcher<std::string>{find_this}, + replace_with, ReplaceType::REPLACE_ALL); +} + +template <class string_type> +inline typename string_type::value_type* WriteIntoT(string_type* str, + size_t length_with_null) { + GURL_DCHECK_GT(length_with_null, 1u); + str->reserve(length_with_null); + str->resize(length_with_null - 1); + return &((*str)[0]); +} + +char* WriteInto(std::string* str, size_t length_with_null) { + return WriteIntoT(str, length_with_null); +} + +char16* WriteInto(string16* str, size_t length_with_null) { + return WriteIntoT(str, length_with_null); +} + +#if defined(_MSC_VER) && !defined(__clang__) +// Work around VC++ code-gen bug. https://crbug.com/804884 +#pragma optimize("", off) +#endif + +// Generic version for all JoinString overloads. 
|list_type| must be a sequence +// (std::vector or std::initializer_list) of strings/StringPieces (std::string, +// string16, StringPiece or StringPiece16). |string_type| is either std::string +// or string16. +template <typename list_type, typename string_type> +static string_type JoinStringT(const list_type& parts, + BasicStringPiece<string_type> sep) { + if (parts.size() == 0) + return string_type(); + + // Pre-allocate the eventual size of the string. Start with the size of all of + // the separators (note that this *assumes* parts.size() > 0). + size_t total_size = (parts.size() - 1) * sep.size(); + for (const auto& part : parts) + total_size += part.size(); + string_type result; + result.reserve(total_size); + + auto iter = parts.begin(); + GURL_DCHECK(iter != parts.end()); + AppendToString(&result, *iter); + ++iter; + + for (; iter != parts.end(); ++iter) { + sep.AppendToString(&result); + // Using the overloaded AppendToString allows this template function to work + // on both strings and StringPieces without creating an intermediate + // StringPiece object. + AppendToString(&result, *iter); + } + + // Sanity-check that we pre-allocated correctly. + GURL_DCHECK_EQ(total_size, result.size()); + + return result; +} + +std::string JoinString(const std::vector<std::string>& parts, + StringPiece separator) { + return JoinStringT(parts, separator); +} + +string16 JoinString(const std::vector<string16>& parts, + StringPiece16 separator) { + return JoinStringT(parts, separator); +} + +#if defined(_MSC_VER) && !defined(__clang__) +// Work around VC++ code-gen bug. 
https://crbug.com/804884 +#pragma optimize("", on) +#endif + +std::string JoinString(const std::vector<StringPiece>& parts, + StringPiece separator) { + return JoinStringT(parts, separator); +} + +string16 JoinString(const std::vector<StringPiece16>& parts, + StringPiece16 separator) { + return JoinStringT(parts, separator); +} + +std::string JoinString(std::initializer_list<StringPiece> parts, + StringPiece separator) { + return JoinStringT(parts, separator); +} + +string16 JoinString(std::initializer_list<StringPiece16> parts, + StringPiece16 separator) { + return JoinStringT(parts, separator); +} + +template<class FormatStringType, class OutStringType> +OutStringType DoReplaceStringPlaceholders( + const FormatStringType& format_string, + const std::vector<OutStringType>& subst, + std::vector<size_t>* offsets) { + size_t substitutions = subst.size(); + GURL_DCHECK_LT(substitutions, 10U); + + size_t sub_length = 0; + for (const auto& cur : subst) + sub_length += cur.length(); + + OutStringType formatted; + formatted.reserve(format_string.length() + sub_length); + + std::vector<ReplacementOffset> r_offsets; + for (auto i = format_string.begin(); i != format_string.end(); ++i) { + if ('$' == *i) { + if (i + 1 != format_string.end()) { + ++i; + if ('$' == *i) { + while (i != format_string.end() && '$' == *i) { + formatted.push_back('$'); + ++i; + } + --i; + } else { + if (*i < '1' || *i > '9') { + GURL_DLOG(ERROR) << "Invalid placeholder: $" << *i; + continue; + } + uintptr_t index = *i - '1'; + if (offsets) { + ReplacementOffset r_offset(index, + static_cast<int>(formatted.size())); + r_offsets.insert( + std::upper_bound(r_offsets.begin(), r_offsets.end(), r_offset, + &CompareParameter), + r_offset); + } + if (index < substitutions) + formatted.append(subst.at(index)); + } + } + } else { + formatted.push_back(*i); + } + } + if (offsets) { + for (const auto& cur : r_offsets) + offsets->push_back(cur.offset); + } + return formatted; +} + +string16 
ReplaceStringPlaceholders(const string16& format_string, + const std::vector<string16>& subst, + std::vector<size_t>* offsets) { + return DoReplaceStringPlaceholders(format_string, subst, offsets); +} + +std::string ReplaceStringPlaceholders(StringPiece format_string, + const std::vector<std::string>& subst, + std::vector<size_t>* offsets) { + return DoReplaceStringPlaceholders(format_string, subst, offsets); +} + +string16 ReplaceStringPlaceholders(const string16& format_string, + const string16& a, + size_t* offset) { + std::vector<size_t> offsets; + std::vector<string16> subst; + subst.push_back(a); + string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets); + + GURL_DCHECK_EQ(1U, offsets.size()); + if (offset && !offsets.empty()) // DCHECK is debug-only; never index an empty vector. + *offset = offsets[0]; + return result; +} + +// The following code is compatible with the OpenBSD lcpy interface. See: +// http://www.gratisoft.us/todd/papers/strlcpy.html +// ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c + +namespace { + +template <typename CHAR> +size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { + for (size_t i = 0; i < dst_size; ++i) { + if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. + return i; + } + + // We were left off at dst_size. We over copied 1 byte. Null terminate. + if (dst_size != 0) + dst[dst_size - 1] = 0; + + // Count the rest of the |src|, and return its length in characters. + while (src[dst_size]) ++dst_size; + return dst_size; +} + +} // namespace + +size_t strlcpy(char* dst, const char* src, size_t dst_size) { + return lcpyT<char>(dst, src, dst_size); +} +size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { + return lcpyT<wchar_t>(dst, src, dst_size); +} + +} // namespace gurl_base
diff --git a/base/strings/string_util.h b/base/strings/string_util.h new file mode 100644 index 0000000..5a8cb02 --- /dev/null +++ b/base/strings/string_util.h
@@ -0,0 +1,530 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// This file defines utility functions for working with strings. + +#ifndef BASE_STRINGS_STRING_UTIL_H_ +#define BASE_STRINGS_STRING_UTIL_H_ + +#include <ctype.h> +#include <stdarg.h> // va_list +#include <stddef.h> +#include <stdint.h> + +#include <initializer_list> +#include <string> +#include <vector> + +#include "polyfills/base/base_export.h" +#include "base/compiler_specific.h" +#include "base/stl_util.h" +#include "base/strings/string16.h" +#include "base/strings/string_piece.h" // For implicit conversions. +#include "build/build_config.h" + +namespace gurl_base { + +// C standard-library functions that aren't cross-platform are provided as +// "gurl_base::...", and their prototypes are listed below. These functions are +// then implemented as inline calls to the platform-specific equivalents in the +// platform-specific headers. + +// Wrapper for vsnprintf that always null-terminates and always returns the +// number of characters that would be in an untruncated formatted +// string, even when truncation occurs. +int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments) + PRINTF_FORMAT(3, 0); + +// Some of these implementations need to be inlined. + +// We separate the declaration from the implementation of this inline +// function just so the PRINTF_FORMAT works. +inline int snprintf(char* buffer, size_t size, const char* format, ...) + PRINTF_FORMAT(3, 4); +inline int snprintf(char* buffer, size_t size, const char* format, ...) { + va_list arguments; + va_start(arguments, format); + int result = vsnprintf(buffer, size, format, arguments); + va_end(arguments); + return result; +} + +// BSD-style safe and consistent string copy functions. +// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|. 
+// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as +// long as |dst_size| is not 0. Returns the length of |src| in characters. +// If the return value is >= dst_size, then the output was truncated. +// NOTE: All sizes are in number of characters, NOT in bytes. +BASE_EXPORT size_t strlcpy(char* dst, const char* src, size_t dst_size); +BASE_EXPORT size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size); + +// Scan a wprintf format string to determine whether it's portable across a +// variety of systems. This function only checks that the conversion +// specifiers used by the format string are supported and have the same meaning +// on a variety of systems. It doesn't check for other errors that might occur +// within a format string. +// +// Nonportable conversion specifiers for wprintf are: +// - 's' and 'c' without an 'l' length modifier. %s and %c operate on char +// data on all systems except Windows, which treat them as wchar_t data. +// Use %ls and %lc for wchar_t data instead. +// - 'S' and 'C', which operate on wchar_t data on all systems except Windows, +// which treat them as char data. Use %ls and %lc for wchar_t data +// instead. +// - 'F', which is not identified by Windows wprintf documentation. +// - 'D', 'O', and 'U', which are deprecated and not available on all systems. +// Use %ld, %lo, and %lu instead. +// +// Note that there is no portable conversion specifier for char data when +// working with wprintf. +// +// This function is intended to be called from gurl_base::vswprintf. +BASE_EXPORT bool IsWprintfFormatPortable(const wchar_t* format); + +// ASCII-specific tolower. The standard library's tolower is locale sensitive, +// so we don't want to use it here. +inline char ToLowerASCII(char c) { + return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; +} +inline char16 ToLowerASCII(char16 c) { + return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; +} + +// ASCII-specific toupper. 
The standard library's toupper is locale sensitive, +// so we don't want to use it here. +inline char ToUpperASCII(char c) { + return (c >= 'a' && c <= 'z') ? (c + ('A' - 'a')) : c; +} +inline char16 ToUpperASCII(char16 c) { + return (c >= 'a' && c <= 'z') ? (c + ('A' - 'a')) : c; +} + +// Converts the given string to its ASCII-lowercase equivalent. +BASE_EXPORT std::string ToLowerASCII(StringPiece str); +BASE_EXPORT string16 ToLowerASCII(StringPiece16 str); + +// Converts the given string to its ASCII-uppercase equivalent. +BASE_EXPORT std::string ToUpperASCII(StringPiece str); +BASE_EXPORT string16 ToUpperASCII(StringPiece16 str); + +// Functor for case-insensitive ASCII comparisons for STL algorithms like +// std::search. +// +// Note that a full Unicode version of this functor is not possible to write +// because case mappings might change the number of characters, depend on +// context (combining accents), and require handling UTF-16. If you need +// proper Unicode support, use gurl_base::i18n::ToLower/FoldCase and then just +// use a normal operator== on the result. +template<typename Char> struct CaseInsensitiveCompareASCII { + public: + bool operator()(Char x, Char y) const { + return ToLowerASCII(x) == ToLowerASCII(y); + } +}; + +// Like strcasecmp for case-insensitive ASCII characters only. Returns: +// -1 (a < b) +// 0 (a == b) +// 1 (a > b) +// (unlike strcasecmp which can return values greater or less than 1/-1). For +// full Unicode support, use gurl_base::i18n::ToLower or gurl_base::i18n::FoldCase +// and then just call the normal string operators on the result. +BASE_EXPORT int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b); +BASE_EXPORT int CompareCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b); + +// Equality for ASCII case-insensitive comparisons. For full Unicode support, +// use gurl_base::i18n::ToLower or gurl_base::i18n::FoldCase and then compare with either +// == or !=. 
+BASE_EXPORT bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b); +BASE_EXPORT bool EqualsCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b); + +// These threadsafe functions return references to globally unique empty +// strings. +// +// It is likely faster to construct a new empty string object (just a few +// instructions to set the length to 0) than to get the empty string instance +// returned by these functions (which requires threadsafe static access). +// +// Therefore, DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT +// CONSTRUCTORS. There is only one case where you should use these: functions +// which need to return a string by reference (e.g. as a class member +// accessor), and don't have an empty string to use (e.g. in an error case). +// These should not be used as initializers, function arguments, or return +// values for functions which return by value or outparam. +BASE_EXPORT const std::string& EmptyString(); +BASE_EXPORT const string16& EmptyString16(); + +// Contains the set of characters representing whitespace in the corresponding +// encoding. Null-terminated. The ASCII versions are the whitespaces as defined +// by HTML5, and don't include control characters. +BASE_EXPORT extern const wchar_t kWhitespaceWide[]; // Includes Unicode. +BASE_EXPORT extern const char16 kWhitespaceUTF16[]; // Includes Unicode. +BASE_EXPORT extern const char kWhitespaceASCII[]; +BASE_EXPORT extern const char16 kWhitespaceASCIIAs16[]; // No unicode. + +// Null-terminated string representing the UTF-8 byte order mark. +BASE_EXPORT extern const char kUtf8ByteOrderMark[]; + +// Removes characters in |remove_chars| from anywhere in |input|. Returns true +// if any characters were removed. |remove_chars| must be null-terminated. +// NOTE: Safe to use the same variable for both |input| and |output|. 
+BASE_EXPORT bool RemoveChars(const string16& input, + StringPiece16 remove_chars, + string16* output); +BASE_EXPORT bool RemoveChars(const std::string& input, + StringPiece remove_chars, + std::string* output); + +// Replaces characters in |replace_chars| from anywhere in |input| with +// |replace_with|. Each character in |replace_chars| will be replaced with +// the |replace_with| string. Returns true if any characters were replaced. +// |replace_chars| must be null-terminated. +// NOTE: Safe to use the same variable for both |input| and |output|. +BASE_EXPORT bool ReplaceChars(const string16& input, + StringPiece16 replace_chars, + const string16& replace_with, + string16* output); +BASE_EXPORT bool ReplaceChars(const std::string& input, + StringPiece replace_chars, + const std::string& replace_with, + std::string* output); + +enum TrimPositions { + TRIM_NONE = 0, + TRIM_LEADING = 1 << 0, + TRIM_TRAILING = 1 << 1, + TRIM_ALL = TRIM_LEADING | TRIM_TRAILING, +}; + +// Removes characters in |trim_chars| from the beginning and end of |input|. +// The 8-bit version only works on 8-bit characters, not UTF-8. Returns true if +// any characters were removed. +// +// It is safe to use the same variable for both |input| and |output| (this is +// the normal usage to trim in-place). +BASE_EXPORT bool TrimString(const string16& input, + StringPiece16 trim_chars, + string16* output); +BASE_EXPORT bool TrimString(const std::string& input, + StringPiece trim_chars, + std::string* output); + +// StringPiece versions of the above. The returned pieces refer to the original +// buffer. +BASE_EXPORT StringPiece16 TrimString(StringPiece16 input, + StringPiece16 trim_chars, + TrimPositions positions); +BASE_EXPORT StringPiece TrimString(StringPiece input, + StringPiece trim_chars, + TrimPositions positions); + +// Truncates a string to the nearest UTF-8 character that will leave +// the string less than or equal to the specified byte size. 
+BASE_EXPORT void TruncateUTF8ToByteSize(const std::string& input, + const size_t byte_size, + std::string* output); + +#if defined(WCHAR_T_IS_UTF16) +// Utility functions to access the underlying string buffer as a wide char +// pointer. +// +// Note: These functions violate strict aliasing when char16 and wchar_t are +// unrelated types. We thus pass -fno-strict-aliasing to the compiler on +// non-Windows platforms [1], and rely on it being off in Clang's CL mode [2]. +// +// [1] https://crrev.com/b9a0976622/build/config/compiler/BUILD.gn#244 +// [2] +// https://github.com/llvm/llvm-project/blob/1e28a66/clang/lib/Driver/ToolChains/Clang.cpp#L3949 +inline wchar_t* as_writable_wcstr(char16* str) { + return reinterpret_cast<wchar_t*>(str); +} + +inline wchar_t* as_writable_wcstr(string16& str) { + return reinterpret_cast<wchar_t*>(data(str)); +} + +inline const wchar_t* as_wcstr(const char16* str) { + return reinterpret_cast<const wchar_t*>(str); +} + +inline const wchar_t* as_wcstr(StringPiece16 str) { + return reinterpret_cast<const wchar_t*>(str.data()); +} + +// Utility functions to access the underlying string buffer as a char16 pointer. +inline char16* as_writable_u16cstr(wchar_t* str) { + return reinterpret_cast<char16*>(str); +} + +inline char16* as_writable_u16cstr(std::wstring& str) { + return reinterpret_cast<char16*>(data(str)); +} + +inline const char16* as_u16cstr(const wchar_t* str) { + return reinterpret_cast<const char16*>(str); +} + +inline const char16* as_u16cstr(WStringPiece str) { + return reinterpret_cast<const char16*>(str.data()); +} +#endif // defined(WCHAR_T_IS_UTF16) + +// Trims any whitespace from either end of the input string. +// +// The StringPiece versions return a substring referencing the input buffer. +// The ASCII versions look only for ASCII whitespace. +// +// The std::string versions return where whitespace was found. +// NOTE: Safe to use the same variable for both input and output. 
+BASE_EXPORT TrimPositions TrimWhitespace(const string16& input, + TrimPositions positions, + string16* output); +BASE_EXPORT StringPiece16 TrimWhitespace(StringPiece16 input, + TrimPositions positions); +BASE_EXPORT TrimPositions TrimWhitespaceASCII(const std::string& input, + TrimPositions positions, + std::string* output); +BASE_EXPORT StringPiece TrimWhitespaceASCII(StringPiece input, + TrimPositions positions); + +// Searches for CR or LF characters. Removes all contiguous whitespace +// strings that contain them. This is useful when trying to deal with text +// copied from terminals. +// Returns |text|, with the following three transformations: +// (1) Leading and trailing whitespace is trimmed. +// (2) If |trim_sequences_with_line_breaks| is true, any other whitespace +// sequences containing a CR or LF are trimmed. +// (3) All other whitespace sequences are converted to single spaces. +BASE_EXPORT string16 CollapseWhitespace( + const string16& text, + bool trim_sequences_with_line_breaks); +BASE_EXPORT std::string CollapseWhitespaceASCII( + const std::string& text, + bool trim_sequences_with_line_breaks); + +// Returns true if |input| is empty or contains only characters found in +// |characters|. +BASE_EXPORT bool ContainsOnlyChars(StringPiece input, StringPiece characters); +BASE_EXPORT bool ContainsOnlyChars(StringPiece16 input, + StringPiece16 characters); + +// Returns true if the specified string matches the criteria. How can a wide +// string be 8-bit or UTF8? It contains only characters that are < 256 (in the +// first case) or characters that use only 8-bits and whose 8-bit +// representation looks like a UTF-8 string (the second case). +// +// Note that IsStringUTF8 checks not only if the input is structurally +// valid but also if it doesn't contain any non-character codepoint +// (e.g. U+FFFE). It's done on purpose because all the existing callers want +// to have the maximum 'discriminating' power from other encodings. 
If +// there's a use case for just checking the structural validity, we have to +// add a new function for that. +// +// IsStringASCII assumes the input is likely all ASCII, and does not leave early +// if it is not the case. +BASE_EXPORT bool IsStringUTF8(StringPiece str); +BASE_EXPORT bool IsStringASCII(StringPiece str); +BASE_EXPORT bool IsStringASCII(StringPiece16 str); +#if defined(WCHAR_T_IS_UTF32) +BASE_EXPORT bool IsStringASCII(WStringPiece str); +#endif + +// Compare the lower-case form of the given string against the given +// previously-lower-cased ASCII string (typically a constant). +BASE_EXPORT bool LowerCaseEqualsASCII(StringPiece str, + StringPiece lowercase_ascii); +BASE_EXPORT bool LowerCaseEqualsASCII(StringPiece16 str, + StringPiece lowercase_ascii); + +// Performs a case-sensitive string compare of the given 16-bit string against +// the given 8-bit ASCII string (typically a constant). The behavior is +// undefined if the |ascii| string is not ASCII. +BASE_EXPORT bool EqualsASCII(StringPiece16 str, StringPiece ascii); + +// Indicates case sensitivity of comparisons. Only ASCII case insensitivity +// is supported. Full Unicode case-insensitive conversions would need to go in +// base/i18n so it can use ICU. +// +// If you need to do Unicode-aware case-insensitive StartsWith/EndsWith, it's +// best to call gurl_base::i18n::ToLower() or gurl_base::i18n::FoldCase() (see +// base/i18n/case_conversion.h for usage advice) on the arguments, and then use +// the results in a case-sensitive comparison. 
+enum class CompareCase { + SENSITIVE, + INSENSITIVE_ASCII, +}; + +BASE_EXPORT bool StartsWith(StringPiece str, + StringPiece search_for, + CompareCase case_sensitivity); +BASE_EXPORT bool StartsWith(StringPiece16 str, + StringPiece16 search_for, + CompareCase case_sensitivity); +BASE_EXPORT bool EndsWith(StringPiece str, + StringPiece search_for, + CompareCase case_sensitivity); +BASE_EXPORT bool EndsWith(StringPiece16 str, + StringPiece16 search_for, + CompareCase case_sensitivity); + +// Determines the type of ASCII character, independent of locale (the C +// library versions will change based on locale). +template <typename Char> +inline bool IsAsciiWhitespace(Char c) { + return c == ' ' || c == '\r' || c == '\n' || c == '\t' || c == '\f'; +} +template <typename Char> +inline bool IsAsciiAlpha(Char c) { + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); +} +template <typename Char> +inline bool IsAsciiUpper(Char c) { + return c >= 'A' && c <= 'Z'; +} +template <typename Char> +inline bool IsAsciiLower(Char c) { + return c >= 'a' && c <= 'z'; +} +template <typename Char> +inline bool IsAsciiDigit(Char c) { + return c >= '0' && c <= '9'; +} +template <typename Char> +inline bool IsAsciiPrintable(Char c) { + return c >= ' ' && c <= '~'; +} + +template <typename Char> +inline bool IsHexDigit(Char c) { + return (c >= '0' && c <= '9') || + (c >= 'A' && c <= 'F') || + (c >= 'a' && c <= 'f'); +} + +// Returns the integer corresponding to the given hex character. For example: +// '4' -> 4 +// 'a' -> 10 +// 'B' -> 11 +// Assumes the input is a valid hex character. DCHECKs in debug builds if not. +BASE_EXPORT char HexDigitToInt(wchar_t c); + +// Returns true if it's a Unicode whitespace character. +BASE_EXPORT bool IsUnicodeWhitespace(wchar_t c); + +// Return a byte string in human-readable format with a unit suffix. Not +// appropriate for use in any UI; use of FormatBytes and friends in ui/base is +// highly recommended instead. 
TODO(avi): Figure out how to get callers to use +// FormatBytes instead; remove this. +BASE_EXPORT string16 FormatBytesUnlocalized(int64_t bytes); + +// Starting at |start_offset| (usually 0), replace the first instance of +// |find_this| with |replace_with|. +BASE_EXPORT void ReplaceFirstSubstringAfterOffset( + gurl_base::string16* str, + size_t start_offset, + StringPiece16 find_this, + StringPiece16 replace_with); +BASE_EXPORT void ReplaceFirstSubstringAfterOffset( + std::string* str, + size_t start_offset, + StringPiece find_this, + StringPiece replace_with); + +// Starting at |start_offset| (usually 0), look through |str| and replace all +// instances of |find_this| with |replace_with|. +// +// This does entire substrings; use std::replace in <algorithm> for single +// characters, for example: +// std::replace(str.begin(), str.end(), 'a', 'b'); +BASE_EXPORT void ReplaceSubstringsAfterOffset( + string16* str, + size_t start_offset, + StringPiece16 find_this, + StringPiece16 replace_with); +BASE_EXPORT void ReplaceSubstringsAfterOffset( + std::string* str, + size_t start_offset, + StringPiece find_this, + StringPiece replace_with); + +// Reserves enough memory in |str| to accommodate |length_with_null| characters, +// sets the size of |str| to |length_with_null - 1| characters, and returns a +// pointer to the underlying contiguous array of characters. This is typically +// used when calling a function that writes results into a character array, but +// the caller wants the data to be managed by a string-like object. It is +// convenient in that it can be used inline in the call, and fast in that it +// avoids copying the results of the call from a char* into a string. +// +// |length_with_null| must be at least 2, since otherwise the underlying string +// would have size 0, and trying to access &((*str)[0]) in that case can result +// in a number of problems. 
+// +// Internally, this takes linear time because the resize() call 0-fills the +// underlying array for potentially all +// (|length_with_null - 1| * sizeof(string_type::value_type)) bytes. Ideally we +// could avoid this aspect of the resize() call, as we expect the caller to +// immediately write over this memory, but there is no other way to set the size +// of the string, and not doing that will mean people who access |str| rather +// than str.c_str() will get back a string of whatever size |str| had on entry +// to this function (probably 0). +BASE_EXPORT char* WriteInto(std::string* str, size_t length_with_null); +BASE_EXPORT char16* WriteInto(string16* str, size_t length_with_null); + +// Does the opposite of SplitString()/SplitStringPiece(). Joins a vector or list +// of strings into a single string, inserting |separator| (which may be empty) +// in between all elements. +// +// If possible, callers should build a vector of StringPieces and use the +// StringPiece variant, so that they do not create unnecessary copies of +// strings. For example, instead of using SplitString, modifying the vector, +// then using JoinString, use SplitStringPiece followed by JoinString so that no +// copies of those strings are created until the final join operation. +// +// Use StrCat (in base/strings/strcat.h) if you don't need a separator. +BASE_EXPORT std::string JoinString(const std::vector<std::string>& parts, + StringPiece separator); +BASE_EXPORT string16 JoinString(const std::vector<string16>& parts, + StringPiece16 separator); +BASE_EXPORT std::string JoinString(const std::vector<StringPiece>& parts, + StringPiece separator); +BASE_EXPORT string16 JoinString(const std::vector<StringPiece16>& parts, + StringPiece16 separator); +// Explicit initializer_list overloads are required to break ambiguity when used +// with a literal initializer list (otherwise the compiler would not be able to +// decide between the string and StringPiece overloads). 
+BASE_EXPORT std::string JoinString(std::initializer_list<StringPiece> parts, + StringPiece separator); +BASE_EXPORT string16 JoinString(std::initializer_list<StringPiece16> parts, + StringPiece16 separator); + +// Replace $1-$2-$3..$9 in the format string with values from |subst|. +// Additionally, any number of consecutive '$' characters is replaced by that +// number less one. E.g. $$->$, $$$->$$, etc. The offsets parameter here can be +// NULL. This only allows you to use up to nine replacements. +BASE_EXPORT string16 ReplaceStringPlaceholders( + const string16& format_string, + const std::vector<string16>& subst, + std::vector<size_t>* offsets); + +BASE_EXPORT std::string ReplaceStringPlaceholders( + StringPiece format_string, + const std::vector<std::string>& subst, + std::vector<size_t>* offsets); + +// Single-string shortcut for ReplaceStringPlaceholders. |offset| may be NULL. +BASE_EXPORT string16 ReplaceStringPlaceholders(const string16& format_string, + const string16& a, + size_t* offset); + +} // namespace gurl_base + +#if defined(OS_WIN) +#include "base/strings/string_util_win.h" +#elif defined(OS_POSIX) || defined(OS_FUCHSIA) +#include "base/strings/string_util_posix.h" +#else +#error Define string operations appropriately for your platform +#endif + +#endif // BASE_STRINGS_STRING_UTIL_H_
diff --git a/base/strings/string_util_constants.cc b/base/strings/string_util_constants.cc new file mode 100644 index 0000000..3ca29b7 --- /dev/null +++ b/base/strings/string_util_constants.cc
@@ -0,0 +1,67 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_util.h" + +namespace gurl_base { + +#define WHITESPACE_UNICODE \ + 0x0009, /* CHARACTER TABULATION */ \ + 0x000A, /* LINE FEED (LF) */ \ + 0x000B, /* LINE TABULATION */ \ + 0x000C, /* FORM FEED (FF) */ \ + 0x000D, /* CARRIAGE RETURN (CR) */ \ + 0x0020, /* SPACE */ \ + 0x0085, /* NEXT LINE (NEL) */ \ + 0x00A0, /* NO-BREAK SPACE */ \ + 0x1680, /* OGHAM SPACE MARK */ \ + 0x2000, /* EN QUAD */ \ + 0x2001, /* EM QUAD */ \ + 0x2002, /* EN SPACE */ \ + 0x2003, /* EM SPACE */ \ + 0x2004, /* THREE-PER-EM SPACE */ \ + 0x2005, /* FOUR-PER-EM SPACE */ \ + 0x2006, /* SIX-PER-EM SPACE */ \ + 0x2007, /* FIGURE SPACE */ \ + 0x2008, /* PUNCTUATION SPACE */ \ + 0x2009, /* THIN SPACE */ \ + 0x200A, /* HAIR SPACE */ \ + 0x2028, /* LINE SEPARATOR */ \ + 0x2029, /* PARAGRAPH SEPARATOR */ \ + 0x202F, /* NARROW NO-BREAK SPACE */ \ + 0x205F, /* MEDIUM MATHEMATICAL SPACE */ \ + 0x3000, /* IDEOGRAPHIC SPACE */ \ + 0 + +const wchar_t kWhitespaceWide[] = { + WHITESPACE_UNICODE +}; + +const char16 kWhitespaceUTF16[] = { + WHITESPACE_UNICODE +}; + +const char kWhitespaceASCII[] = { + 0x09, // CHARACTER TABULATION + 0x0A, // LINE FEED (LF) + 0x0B, // LINE TABULATION + 0x0C, // FORM FEED (FF) + 0x0D, // CARRIAGE RETURN (CR) + 0x20, // SPACE + 0 +}; + +const char16 kWhitespaceASCIIAs16[] = { + 0x09, // CHARACTER TABULATION + 0x0A, // LINE FEED (LF) + 0x0B, // LINE TABULATION + 0x0C, // FORM FEED (FF) + 0x0D, // CARRIAGE RETURN (CR) + 0x20, // SPACE + 0 +}; + +const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF"; + +} // namespace gurl_base
diff --git a/base/strings/string_util_perftest.cc b/base/strings/string_util_perftest.cc new file mode 100644 index 0000000..033df0e --- /dev/null +++ b/base/strings/string_util_perftest.cc
@@ -0,0 +1,53 @@ +// Copyright 2018 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_util.h" + +#include <cinttypes> + +#include "base/time/time.h" +#include "build/build_config.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace gurl_base { + +// Builds a |str_length|-character string of 'A's (of the given String type), +// overwrites index |non_ascii_pos| with '\xAF' if it lies inside the string, +// then times 10,000,000 IsStringASCII() calls and prints the elapsed time. +// The elapsed milliseconds are a signed 64-bit value, hence PRId64. +template <typename String> +void MeasureIsStringASCII(size_t str_length, size_t non_ascii_pos) { + String str(str_length, 'A'); + if (non_ascii_pos < str_length) + str[non_ascii_pos] = '\xAF'; + + TimeTicks t0 = TimeTicks::Now(); + for (size_t i = 0; i < 10000000; ++i) + IsStringASCII(str); + TimeDelta time = TimeTicks::Now() - t0; + printf( + "char-size:\t%zu\tlength:\t%zu\tnon-ascii-pos:\t%zu\ttime-ms:\t%" PRId64 + "\n", + sizeof(typename String::value_type), str_length, non_ascii_pos, + time.InMilliseconds()); +} + +// Sweeps string lengths 4, 8, ..., 1024, placing the non-ASCII character at +// index 2, str_length / 2 + 2 and str_length + 2; an index at or past the end +// leaves the string pure ASCII. Disabled by default via the DISABLED_ prefix. +TEST(StringUtilTest, DISABLED_IsStringASCIIPerf) { + for (size_t str_length = 4; str_length <= 1024; str_length *= 2) { + for (size_t non_ascii_loc = 0; non_ascii_loc < 3; ++non_ascii_loc) { + size_t non_ascii_pos = str_length * non_ascii_loc / 2 + 2; + MeasureIsStringASCII<std::string>(str_length, non_ascii_pos); + MeasureIsStringASCII<string16>(str_length, non_ascii_pos); +#if defined(WCHAR_T_IS_UTF32) + MeasureIsStringASCII<std::basic_string<wchar_t>>(str_length, + non_ascii_pos); +#endif + } + } +} + +} // namespace gurl_base
diff --git a/base/strings/string_util_posix.h b/base/strings/string_util_posix.h new file mode 100644 index 0000000..e1ba7c3 --- /dev/null +++ b/base/strings/string_util_posix.h
@@ -0,0 +1,37 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_STRING_UTIL_POSIX_H_ +#define BASE_STRINGS_STRING_UTIL_POSIX_H_ + +#include <stdarg.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <wchar.h> + +#include "polyfills/base/logging.h" + +namespace gurl_base { + +// Chromium code style is to not use malloc'd strings; this is only for use +// for interaction with APIs that require it. +inline char* strdup(const char* str) { + return ::strdup(str); +} + +inline int vsnprintf(char* buffer, size_t size, + const char* format, va_list arguments) { + return ::vsnprintf(buffer, size, format, arguments); +} + +inline int vswprintf(wchar_t* buffer, size_t size, + const wchar_t* format, va_list arguments) { + GURL_DCHECK(IsWprintfFormatPortable(format)); + return ::vswprintf(buffer, size, format, arguments); +} + +} // namespace gurl_base + +#endif // BASE_STRINGS_STRING_UTIL_POSIX_H_
diff --git a/base/strings/string_util_unittest.cc b/base/strings/string_util_unittest.cc new file mode 100644 index 0000000..51b4ee1 --- /dev/null +++ b/base/strings/string_util_unittest.cc
@@ -0,0 +1,1430 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_util.h" + +#include <math.h> +#include <stdarg.h> +#include <stddef.h> +#include <stdint.h> + +#include <algorithm> +#include <type_traits> + +#include "base/stl_util.h" +#include "base/strings/string16.h" +#include "base/strings/utf_string_conversions.h" +#include "build/build_config.h" +#include "testing/gmock/include/gmock/gmock.h" +#include "testing/gtest/include/gtest/gtest.h" + +using ::testing::ElementsAre; + +namespace gurl_base { + +static const struct trim_case { + const wchar_t* input; + const TrimPositions positions; + const wchar_t* output; + const TrimPositions return_value; +} trim_cases[] = { + {L" Google Video ", TRIM_LEADING, L"Google Video ", TRIM_LEADING}, + {L" Google Video ", TRIM_TRAILING, L" Google Video", TRIM_TRAILING}, + {L" Google Video ", TRIM_ALL, L"Google Video", TRIM_ALL}, + {L"Google Video", TRIM_ALL, L"Google Video", TRIM_NONE}, + {L"", TRIM_ALL, L"", TRIM_NONE}, + {L" ", TRIM_LEADING, L"", TRIM_LEADING}, + {L" ", TRIM_TRAILING, L"", TRIM_TRAILING}, + {L" ", TRIM_ALL, L"", TRIM_ALL}, + {L"\t\rTest String\n", TRIM_ALL, L"Test String", TRIM_ALL}, + {L"\x2002Test String\x00A0\x3000", TRIM_ALL, L"Test String", TRIM_ALL}, +}; + +static const struct trim_case_ascii { + const char* input; + const TrimPositions positions; + const char* output; + const TrimPositions return_value; +} trim_cases_ascii[] = { + {" Google Video ", TRIM_LEADING, "Google Video ", TRIM_LEADING}, + {" Google Video ", TRIM_TRAILING, " Google Video", TRIM_TRAILING}, + {" Google Video ", TRIM_ALL, "Google Video", TRIM_ALL}, + {"Google Video", TRIM_ALL, "Google Video", TRIM_NONE}, + {"", TRIM_ALL, "", TRIM_NONE}, + {" ", TRIM_LEADING, "", TRIM_LEADING}, + {" ", TRIM_TRAILING, "", TRIM_TRAILING}, + {" ", TRIM_ALL, "", TRIM_ALL}, + {"\t\rTest String\n", 
TRIM_ALL, "Test String", TRIM_ALL}, +}; + +namespace { + +// Helper used to test TruncateUTF8ToByteSize. +bool Truncated(const std::string& input, + const size_t byte_size, + std::string* output) { + size_t prev = input.length(); + TruncateUTF8ToByteSize(input, byte_size, output); + return prev != output->length(); +} + +} // namespace + +TEST(StringUtilTest, TruncateUTF8ToByteSize) { + std::string output; + + // Empty strings and invalid byte_size arguments + EXPECT_FALSE(Truncated(std::string(), 0, &output)); + EXPECT_EQ(output, ""); + EXPECT_TRUE(Truncated("\xe1\x80\xbf", 0, &output)); + EXPECT_EQ(output, ""); + EXPECT_FALSE(Truncated("\xe1\x80\xbf", static_cast<size_t>(-1), &output)); + EXPECT_FALSE(Truncated("\xe1\x80\xbf", 4, &output)); + + // Testing the truncation of valid UTF8 correctly + EXPECT_TRUE(Truncated("abc", 2, &output)); + EXPECT_EQ(output, "ab"); + EXPECT_TRUE(Truncated("\xc2\x81\xc2\x81", 2, &output)); + EXPECT_EQ(output.compare("\xc2\x81"), 0); + EXPECT_TRUE(Truncated("\xc2\x81\xc2\x81", 3, &output)); + EXPECT_EQ(output.compare("\xc2\x81"), 0); + EXPECT_FALSE(Truncated("\xc2\x81\xc2\x81", 4, &output)); + EXPECT_EQ(output.compare("\xc2\x81\xc2\x81"), 0); + + { + const char array[] = "\x00\x00\xc2\x81\xc2\x81"; + const std::string array_string(array, gurl_base::size(array)); + EXPECT_TRUE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\x00\x00\xc2\x81", 4)), 0); + } + + { + const char array[] = "\x00\xc2\x81\xc2\x81"; + const std::string array_string(array, gurl_base::size(array)); + EXPECT_TRUE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\x00\xc2\x81", 3)), 0); + } + + // Testing invalid UTF8 + EXPECT_TRUE(Truncated("\xed\xa0\x80\xed\xbf\xbf", 6, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xed\xa0\x8f", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xed\xbf\xbf", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + + // 
Testing invalid UTF8 mixed with valid UTF8 + EXPECT_FALSE(Truncated("\xe1\x80\xbf", 3, &output)); + EXPECT_EQ(output.compare("\xe1\x80\xbf"), 0); + EXPECT_FALSE(Truncated("\xf1\x80\xa0\xbf", 4, &output)); + EXPECT_EQ(output.compare("\xf1\x80\xa0\xbf"), 0); + EXPECT_FALSE(Truncated("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf", + 10, &output)); + EXPECT_EQ(output.compare("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"), 0); + EXPECT_TRUE(Truncated("a\xc2\x81\xe1\x80\xbf\xf1""a""\x80\xa0", + 10, &output)); + EXPECT_EQ(output.compare("a\xc2\x81\xe1\x80\xbf\xf1""a"), 0); + EXPECT_FALSE(Truncated("\xef\xbb\xbf" "abc", 6, &output)); + EXPECT_EQ(output.compare("\xef\xbb\xbf" "abc"), 0); + + // Overlong sequences + EXPECT_TRUE(Truncated("\xc0\x80", 2, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xc1\x80\xc1\x81", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xe0\x80\x80", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xe0\x82\x80", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xe0\x9f\xbf", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x80\x80\x8D", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x80\x82\x91", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x80\xa0\x80", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x8f\xbb\xbf", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf8\x80\x80\x80\xbf", 5, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xfc\x80\x80\x80\xa0\xa5", 6, &output)); + EXPECT_EQ(output.compare(""), 0); + + // Beyond U+10FFFF (the upper limit of Unicode codespace) + EXPECT_TRUE(Truncated("\xf4\x90\x80\x80", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf8\xa0\xbf\x80\xbf", 5, &output)); + EXPECT_EQ(output.compare(""), 0); + 
EXPECT_TRUE(Truncated("\xfc\x9c\xbf\x80\xbf\x80", 6, &output)); + EXPECT_EQ(output.compare(""), 0); + + // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE) + EXPECT_TRUE(Truncated("\xfe\xff", 2, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xff\xfe", 2, &output)); + EXPECT_EQ(output.compare(""), 0); + + { + const char array[] = "\x00\x00\xfe\xff"; + const std::string array_string(array, gurl_base::size(array)); + EXPECT_TRUE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\x00\x00", 2)), 0); + } + + // Variants on the previous test + { + const char array[] = "\xff\xfe\x00\x00"; + const std::string array_string(array, 4); + EXPECT_FALSE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\xff\xfe\x00\x00", 4)), 0); + } + { + const char array[] = "\xff\x00\x00\xfe"; + const std::string array_string(array, gurl_base::size(array)); + EXPECT_TRUE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\xff\x00\x00", 3)), 0); + } + + // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF> + EXPECT_TRUE(Truncated("\xef\xbf\xbe", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x8f\xbf\xbe", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf3\xbf\xbf\xbf", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xef\xb7\x90", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xef\xb7\xaf", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + + // Strings in legacy encodings that are valid in UTF-8, but + // are invalid as UTF-8 in real data. 
+ EXPECT_TRUE(Truncated("caf\xe9", 4, &output)); + EXPECT_EQ(output.compare("caf"), 0); + EXPECT_TRUE(Truncated("\xb0\xa1\xb0\xa2", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_FALSE(Truncated("\xa7\x41\xa6\x6e", 4, &output)); + EXPECT_EQ(output.compare("\xa7\x41\xa6\x6e"), 0); + EXPECT_TRUE(Truncated("\xa7\x41\xa6\x6e\xd9\xee\xe4\xee", 7, + &output)); + EXPECT_EQ(output.compare("\xa7\x41\xa6\x6e"), 0); + + // Testing using the same string as input and output. + EXPECT_FALSE(Truncated(output, 4, &output)); + EXPECT_EQ(output.compare("\xa7\x41\xa6\x6e"), 0); + EXPECT_TRUE(Truncated(output, 3, &output)); + EXPECT_EQ(output.compare("\xa7\x41"), 0); + + // "abc" with U+201[CD] in windows-125[0-8] + EXPECT_TRUE(Truncated("\x93" "abc\x94", 5, &output)); + EXPECT_EQ(output.compare("\x93" "abc"), 0); + + // U+0639 U+064E U+0644 U+064E in ISO-8859-6 + EXPECT_TRUE(Truncated("\xd9\xee\xe4\xee", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + + // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 + EXPECT_TRUE(Truncated("\xe3\xe5\xe9\xdC", 4, &output)); + EXPECT_EQ(output.compare(""), 0); +} + +#if defined(WCHAR_T_IS_UTF16) +TEST(StringUtilTest, as_wcstr) { + char16 rw_buffer[10] = {}; + static_assert( + std::is_same<wchar_t*, decltype(as_writable_wcstr(rw_buffer))>::value, + ""); + EXPECT_EQ(static_cast<void*>(rw_buffer), as_writable_wcstr(rw_buffer)); + + string16 rw_str(10, '\0'); + static_assert( + std::is_same<wchar_t*, decltype(as_writable_wcstr(rw_str))>::value, ""); + EXPECT_EQ(static_cast<const void*>(rw_str.data()), as_writable_wcstr(rw_str)); + + const char16 ro_buffer[10] = {}; + static_assert( + std::is_same<const wchar_t*, decltype(as_wcstr(ro_buffer))>::value, ""); + EXPECT_EQ(static_cast<const void*>(ro_buffer), as_wcstr(ro_buffer)); + + const string16 ro_str(10, '\0'); + static_assert(std::is_same<const wchar_t*, decltype(as_wcstr(ro_str))>::value, + ""); + EXPECT_EQ(static_cast<const void*>(ro_str.data()), as_wcstr(ro_str)); + + StringPiece16 piece 
= ro_buffer; + static_assert(std::is_same<const wchar_t*, decltype(as_wcstr(piece))>::value, + ""); + EXPECT_EQ(static_cast<const void*>(piece.data()), as_wcstr(piece)); +} + +TEST(StringUtilTest, as_u16cstr) { + wchar_t rw_buffer[10] = {}; + static_assert( + std::is_same<char16*, decltype(as_writable_u16cstr(rw_buffer))>::value, + ""); + EXPECT_EQ(static_cast<void*>(rw_buffer), as_writable_u16cstr(rw_buffer)); + + std::wstring rw_str(10, '\0'); + static_assert( + std::is_same<char16*, decltype(as_writable_u16cstr(rw_str))>::value, ""); + EXPECT_EQ(static_cast<const void*>(rw_str.data()), + as_writable_u16cstr(rw_str)); + + const wchar_t ro_buffer[10] = {}; + static_assert( + std::is_same<const char16*, decltype(as_u16cstr(ro_buffer))>::value, ""); + EXPECT_EQ(static_cast<const void*>(ro_buffer), as_u16cstr(ro_buffer)); + + const std::wstring ro_str(10, '\0'); + static_assert( + std::is_same<const char16*, decltype(as_u16cstr(ro_str))>::value, ""); + EXPECT_EQ(static_cast<const void*>(ro_str.data()), as_u16cstr(ro_str)); + + WStringPiece piece = ro_buffer; + static_assert(std::is_same<const char16*, decltype(as_u16cstr(piece))>::value, + ""); + EXPECT_EQ(static_cast<const void*>(piece.data()), as_u16cstr(piece)); +} +#endif // defined(WCHAR_T_IS_UTF16) + +TEST(StringUtilTest, TrimWhitespace) { + string16 output; // Allow contents to carry over to next testcase + for (const auto& value : trim_cases) { + EXPECT_EQ(value.return_value, + TrimWhitespace(WideToUTF16(value.input), value.positions, + &output)); + EXPECT_EQ(WideToUTF16(value.output), output); + } + + // Test that TrimWhitespace() can take the same string for input and output + output = ASCIIToUTF16(" This is a test \r\n"); + EXPECT_EQ(TRIM_ALL, TrimWhitespace(output, TRIM_ALL, &output)); + EXPECT_EQ(ASCIIToUTF16("This is a test"), output); + + // Once more, but with a string of whitespace + output = ASCIIToUTF16(" \r\n"); + EXPECT_EQ(TRIM_ALL, TrimWhitespace(output, TRIM_ALL, &output)); + 
EXPECT_EQ(string16(), output); + + std::string output_ascii; + for (const auto& value : trim_cases_ascii) { + EXPECT_EQ(value.return_value, + TrimWhitespaceASCII(value.input, value.positions, &output_ascii)); + EXPECT_EQ(value.output, output_ascii); + } +} + +static const struct collapse_case { + const wchar_t* input; + const bool trim; + const wchar_t* output; +} collapse_cases[] = { + {L" Google Video ", false, L"Google Video"}, + {L"Google Video", false, L"Google Video"}, + {L"", false, L""}, + {L" ", false, L""}, + {L"\t\rTest String\n", false, L"Test String"}, + {L"\x2002Test String\x00A0\x3000", false, L"Test String"}, + {L" Test \n \t String ", false, L"Test String"}, + {L"\x2002Test\x1680 \x2028 \tString\x00A0\x3000", false, L"Test String"}, + {L" Test String", false, L"Test String"}, + {L"Test String ", false, L"Test String"}, + {L"Test String", false, L"Test String"}, + {L"", true, L""}, + {L"\n", true, L""}, + {L" \r ", true, L""}, + {L"\nFoo", true, L"Foo"}, + {L"\r Foo ", true, L"Foo"}, + {L" Foo bar ", true, L"Foo bar"}, + {L" \tFoo bar \n", true, L"Foo bar"}, + {L" a \r b\n c \r\n d \t\re \t f \n ", true, L"abcde f"}, +}; + +TEST(StringUtilTest, CollapseWhitespace) { + for (const auto& value : collapse_cases) { + EXPECT_EQ(WideToUTF16(value.output), + CollapseWhitespace(WideToUTF16(value.input), value.trim)); + } +} + +static const struct collapse_case_ascii { + const char* input; + const bool trim; + const char* output; +} collapse_cases_ascii[] = { + {" Google Video ", false, "Google Video"}, + {"Google Video", false, "Google Video"}, + {"", false, ""}, + {" ", false, ""}, + {"\t\rTest String\n", false, "Test String"}, + {" Test \n \t String ", false, "Test String"}, + {" Test String", false, "Test String"}, + {"Test String ", false, "Test String"}, + {"Test String", false, "Test String"}, + {"", true, ""}, + {"\n", true, ""}, + {" \r ", true, ""}, + {"\nFoo", true, "Foo"}, + {"\r Foo ", true, "Foo"}, + {" Foo bar ", true, "Foo bar"}, + {" \tFoo 
bar \n", true, "Foo bar"}, + {" a \r b\n c \r\n d \t\re \t f \n ", true, "abcde f"}, +}; + +TEST(StringUtilTest, CollapseWhitespaceASCII) { + for (const auto& value : collapse_cases_ascii) { + EXPECT_EQ(value.output, CollapseWhitespaceASCII(value.input, value.trim)); + } +} + +TEST(StringUtilTest, IsStringUTF8) { + EXPECT_TRUE(IsStringUTF8("abc")); + EXPECT_TRUE(IsStringUTF8("\xc2\x81")); + EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf")); + EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf")); + EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf")); + EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM + + // surrogate code points + EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf")); + EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f")); + EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf")); + + // overlong sequences + EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000 + EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "AB" + EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000 + EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080 + EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff + EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D + EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091 + EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800 + EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM) + EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F + EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+00A5 + + // Beyond U+10FFFF (the upper limit of Unicode codespace) + EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000 + EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes + EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes + + // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE) + EXPECT_FALSE(IsStringUTF8("\xfe\xff")); + EXPECT_FALSE(IsStringUTF8("\xff\xfe")); + EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4))); + EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00")); + + // 
Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF> + EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE + EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // overlong U+FFFE + EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+FFFFF + EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0 + EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF + // Strings in legacy encodings. We can certainly make up strings + // in a legacy encoding that are valid in UTF-8, but in real data, + // most of them are invalid as UTF-8. + EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1 + EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC01 in EUC-KR + EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5 + // "abc" with U+201[CD] in windows-125[0-8] + EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94")); + // U+0639 U+064E U+0644 U+064E in ISO-8859-6 + EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee")); + // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 + EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); + + // Check that we support Embedded Nulls. The first uses the canonical UTF-8 + // representation, and the second uses a 2-byte sequence. The second version + // is invalid UTF-8 since UTF-8 states that the shortest encoding for a + // given codepoint must be used. 
+ static const char kEmbeddedNull[] = "embedded\0null"; + EXPECT_TRUE(IsStringUTF8( + std::string(kEmbeddedNull, sizeof(kEmbeddedNull)))); + EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000")); +} + +TEST(StringUtilTest, IsStringASCII) { + static char char_ascii[] = + "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF"; + static char16 char16_ascii[] = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'A', + 'B', 'C', 'D', 'E', 'F', '0', '1', '2', '3', '4', '5', '6', + '7', '8', '9', '0', 'A', 'B', 'C', 'D', 'E', 'F', 0 }; + static std::wstring wchar_ascii( + L"0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF"); + + // Test a variety of the fragment start positions and lengths in order to make + // sure that bit masking in IsStringASCII works correctly. + // Also, test that a non-ASCII character will be detected regardless of its + // position inside the string. + { + const size_t string_length = gurl_base::size(char_ascii) - 1; + for (size_t offset = 0; offset < 8; ++offset) { + for (size_t len = 0, max_len = string_length - offset; len < max_len; + ++len) { + EXPECT_TRUE(IsStringASCII(StringPiece(char_ascii + offset, len))); + for (size_t char_pos = offset; char_pos < len; ++char_pos) { + char_ascii[char_pos] |= '\x80'; + EXPECT_FALSE(IsStringASCII(StringPiece(char_ascii + offset, len))); + char_ascii[char_pos] &= ~'\x80'; + } + } + } + } + + { + const size_t string_length = gurl_base::size(char16_ascii) - 1; + for (size_t offset = 0; offset < 4; ++offset) { + for (size_t len = 0, max_len = string_length - offset; len < max_len; + ++len) { + EXPECT_TRUE(IsStringASCII(StringPiece16(char16_ascii + offset, len))); + for (size_t char_pos = offset; char_pos < len; ++char_pos) { + char16_ascii[char_pos] |= 0x80; + EXPECT_FALSE( + IsStringASCII(StringPiece16(char16_ascii + offset, len))); + char16_ascii[char_pos] &= ~0x80; + // Also test when the upper half is non-zero. 
+ char16_ascii[char_pos] |= 0x100; + EXPECT_FALSE( + IsStringASCII(StringPiece16(char16_ascii + offset, len))); + char16_ascii[char_pos] &= ~0x100; + } + } + } + } + +#if defined(WCHAR_T_IS_UTF32) + { + const size_t string_length = wchar_ascii.length(); + for (size_t len = 0; len < string_length; ++len) { + EXPECT_TRUE(IsStringASCII(wchar_ascii.substr(0, len))); + for (size_t char_pos = 0; char_pos < len; ++char_pos) { + wchar_ascii[char_pos] |= 0x80; + EXPECT_FALSE(IsStringASCII(wchar_ascii.substr(0, len))); + wchar_ascii[char_pos] &= ~0x80; + wchar_ascii[char_pos] |= 0x100; + EXPECT_FALSE(IsStringASCII(wchar_ascii.substr(0, len))); + wchar_ascii[char_pos] &= ~0x100; + wchar_ascii[char_pos] |= 0x10000; + EXPECT_FALSE(IsStringASCII(wchar_ascii.substr(0, len))); + wchar_ascii[char_pos] &= ~0x10000; + } + } + } +#endif // WCHAR_T_IS_UTF32 +} + +TEST(StringUtilTest, ConvertASCII) { + static const char* const char_cases[] = { + "Google Video", + "Hello, world\n", + "0123ABCDwxyz \a\b\t\r\n!+,.~" + }; + + static const wchar_t* const wchar_cases[] = { + L"Google Video", + L"Hello, world\n", + L"0123ABCDwxyz \a\b\t\r\n!+,.~" + }; + + for (size_t i = 0; i < gurl_base::size(char_cases); ++i) { + EXPECT_TRUE(IsStringASCII(char_cases[i])); + string16 utf16 = ASCIIToUTF16(char_cases[i]); + EXPECT_EQ(WideToUTF16(wchar_cases[i]), utf16); + + std::string ascii = UTF16ToASCII(WideToUTF16(wchar_cases[i])); + EXPECT_EQ(char_cases[i], ascii); + } + + EXPECT_FALSE(IsStringASCII("Google \x80Video")); + + // Convert empty strings. + string16 empty16; + std::string empty; + EXPECT_EQ(empty, UTF16ToASCII(empty16)); + EXPECT_EQ(empty16, ASCIIToUTF16(empty)); + + // Convert strings with an embedded NUL character. 
+ const char chars_with_nul[] = "test\0string"; + const int length_with_nul = gurl_base::size(chars_with_nul) - 1; + std::string string_with_nul(chars_with_nul, length_with_nul); + string16 string16_with_nul = ASCIIToUTF16(string_with_nul); + EXPECT_EQ(static_cast<string16::size_type>(length_with_nul), + string16_with_nul.length()); + std::string narrow_with_nul = UTF16ToASCII(string16_with_nul); + EXPECT_EQ(static_cast<std::string::size_type>(length_with_nul), + narrow_with_nul.length()); + EXPECT_EQ(0, string_with_nul.compare(narrow_with_nul)); +} + +TEST(StringUtilTest, ToLowerASCII) { + EXPECT_EQ('c', ToLowerASCII('C')); + EXPECT_EQ('c', ToLowerASCII('c')); + EXPECT_EQ('2', ToLowerASCII('2')); + + EXPECT_EQ(static_cast<char16>('c'), ToLowerASCII(static_cast<char16>('C'))); + EXPECT_EQ(static_cast<char16>('c'), ToLowerASCII(static_cast<char16>('c'))); + EXPECT_EQ(static_cast<char16>('2'), ToLowerASCII(static_cast<char16>('2'))); + + EXPECT_EQ("cc2", ToLowerASCII("Cc2")); + EXPECT_EQ(ASCIIToUTF16("cc2"), ToLowerASCII(ASCIIToUTF16("Cc2"))); +} + +TEST(StringUtilTest, ToUpperASCII) { + EXPECT_EQ('C', ToUpperASCII('C')); + EXPECT_EQ('C', ToUpperASCII('c')); + EXPECT_EQ('2', ToUpperASCII('2')); + + EXPECT_EQ(static_cast<char16>('C'), ToUpperASCII(static_cast<char16>('C'))); + EXPECT_EQ(static_cast<char16>('C'), ToUpperASCII(static_cast<char16>('c'))); + EXPECT_EQ(static_cast<char16>('2'), ToUpperASCII(static_cast<char16>('2'))); + + EXPECT_EQ("CC2", ToUpperASCII("Cc2")); + EXPECT_EQ(ASCIIToUTF16("CC2"), ToUpperASCII(ASCIIToUTF16("Cc2"))); +} + +TEST(StringUtilTest, LowerCaseEqualsASCII) { + static const struct { + const char* src_a; + const char* dst; + } lowercase_cases[] = { + { "FoO", "foo" }, + { "foo", "foo" }, + { "FOO", "foo" }, + }; + + for (const auto& i : lowercase_cases) { + EXPECT_TRUE(LowerCaseEqualsASCII(ASCIIToUTF16(i.src_a), i.dst)); + EXPECT_TRUE(LowerCaseEqualsASCII(i.src_a, i.dst)); + } +} + +TEST(StringUtilTest, FormatBytesUnlocalized) { + static 
const struct { + int64_t bytes; + const char* expected; + } cases[] = { + // Expected behavior: we show one post-decimal digit when we have + // under two pre-decimal digits, except in cases where it makes no + // sense (zero or bytes). + // Since we switch units once we cross the 1000 mark, this keeps + // the display of file sizes or bytes consistently around three + // digits. + {0, "0 B"}, + {512, "512 B"}, + {1024*1024, "1.0 MB"}, + {1024*1024*1024, "1.0 GB"}, + {10LL*1024*1024*1024, "10.0 GB"}, + {99LL*1024*1024*1024, "99.0 GB"}, + {105LL*1024*1024*1024, "105 GB"}, + {105LL*1024*1024*1024 + 500LL*1024*1024, "105 GB"}, + {~(1LL << 63), "8192 PB"}, + + {99*1024 + 103, "99.1 kB"}, + {1024*1024 + 103, "1.0 MB"}, + {1024*1024 + 205 * 1024, "1.2 MB"}, + {1024*1024*1024 + (927 * 1024*1024), "1.9 GB"}, + {10LL*1024*1024*1024, "10.0 GB"}, + {100LL*1024*1024*1024, "100 GB"}, + }; + + for (const auto& i : cases) { + EXPECT_EQ(ASCIIToUTF16(i.expected), FormatBytesUnlocalized(i.bytes)); + } +} +TEST(StringUtilTest, ReplaceSubstringsAfterOffset) { + static const struct { + StringPiece str; + size_t start_offset; + StringPiece find_this; + StringPiece replace_with; + StringPiece expected; + } cases[] = { + {"aaa", 0, "", "b", "aaa"}, + {"aaa", 1, "", "b", "aaa"}, + {"aaa", 0, "a", "b", "bbb"}, + {"aaa", 0, "aa", "b", "ba"}, + {"aaa", 0, "aa", "bbb", "bbba"}, + {"aaaaa", 0, "aa", "b", "bba"}, + {"ababaaababa", 0, "aba", "", "baaba"}, + {"ababaaababa", 0, "aba", "_", "_baa_ba"}, + {"ababaaababa", 0, "aba", "__", "__baa__ba"}, + {"ababaaababa", 0, "aba", "___", "___baa___ba"}, + {"ababaaababa", 0, "aba", "____", "____baa____ba"}, + {"ababaaababa", 0, "aba", "_____", "_____baa_____ba"}, + {"abb", 0, "ab", "a", "ab"}, + {"Removing some substrings inging", 0, "ing", "", "Remov some substrs "}, + {"Not found", 0, "x", "0", "Not found"}, + {"Not found again", 5, "x", "0", "Not found again"}, + {" Making it much longer ", 0, " ", "Four score and seven years ago", + "Four score and 
seven years agoMakingFour score and seven years agoit" + "Four score and seven years agomuchFour score and seven years agolonger" + "Four score and seven years ago"}, + {" Making it much much much much shorter ", 0, + "Making it much much much much shorter", "", " "}, + {"so much much much much much very much much much shorter", 0, "much ", + "", "so very shorter"}, + {"Invalid offset", 9999, "t", "foobar", "Invalid offset"}, + {"Replace me only me once", 9, "me ", "", "Replace me only once"}, + {"abababab", 2, "ab", "c", "abccc"}, + {"abababab", 1, "ab", "c", "abccc"}, + {"abababab", 1, "aba", "c", "abcbab"}, + }; + + // gurl_base::string16 variant + for (const auto& scenario : cases) { + string16 str = ASCIIToUTF16(scenario.str); + ReplaceSubstringsAfterOffset(&str, scenario.start_offset, + ASCIIToUTF16(scenario.find_this), + ASCIIToUTF16(scenario.replace_with)); + EXPECT_EQ(ASCIIToUTF16(scenario.expected), str); + } + + // std::string with insufficient capacity: expansion must realloc the buffer. + for (const auto& scenario : cases) { + std::string str = scenario.str.as_string(); + str.shrink_to_fit(); // This is nonbinding, but it's the best we've got. + ReplaceSubstringsAfterOffset(&str, scenario.start_offset, + scenario.find_this, scenario.replace_with); + EXPECT_EQ(scenario.expected, str); + } + + // std::string with ample capacity: should be possible to grow in-place. 
+ for (const auto& scenario : cases) { + std::string str = scenario.str.as_string(); + str.reserve(std::max(scenario.str.length(), scenario.expected.length()) * + 2); + + ReplaceSubstringsAfterOffset(&str, scenario.start_offset, + scenario.find_this, scenario.replace_with); + EXPECT_EQ(scenario.expected, str); + } +} + +TEST(StringUtilTest, ReplaceFirstSubstringAfterOffset) { + static const struct { + const char* str; + string16::size_type start_offset; + const char* find_this; + const char* replace_with; + const char* expected; + } cases[] = { + {"aaa", 0, "a", "b", "baa"}, + {"abb", 0, "ab", "a", "ab"}, + {"Removing some substrings inging", 0, "ing", "", + "Remov some substrings inging"}, + {"Not found", 0, "x", "0", "Not found"}, + {"Not found again", 5, "x", "0", "Not found again"}, + {" Making it much longer ", 0, " ", "Four score and seven years ago", + "Four score and seven years agoMaking it much longer "}, + {"Invalid offset", 9999, "t", "foobar", "Invalid offset"}, + {"Replace me only me once", 4, "me ", "", "Replace only me once"}, + {"abababab", 2, "ab", "c", "abcabab"}, + }; + + for (const auto& i : cases) { + string16 str = ASCIIToUTF16(i.str); + ReplaceFirstSubstringAfterOffset(&str, i.start_offset, + ASCIIToUTF16(i.find_this), + ASCIIToUTF16(i.replace_with)); + EXPECT_EQ(ASCIIToUTF16(i.expected), str); + } +} + +TEST(StringUtilTest, HexDigitToInt) { + EXPECT_EQ(0, HexDigitToInt('0')); + EXPECT_EQ(1, HexDigitToInt('1')); + EXPECT_EQ(2, HexDigitToInt('2')); + EXPECT_EQ(3, HexDigitToInt('3')); + EXPECT_EQ(4, HexDigitToInt('4')); + EXPECT_EQ(5, HexDigitToInt('5')); + EXPECT_EQ(6, HexDigitToInt('6')); + EXPECT_EQ(7, HexDigitToInt('7')); + EXPECT_EQ(8, HexDigitToInt('8')); + EXPECT_EQ(9, HexDigitToInt('9')); + EXPECT_EQ(10, HexDigitToInt('A')); + EXPECT_EQ(11, HexDigitToInt('B')); + EXPECT_EQ(12, HexDigitToInt('C')); + EXPECT_EQ(13, HexDigitToInt('D')); + EXPECT_EQ(14, HexDigitToInt('E')); + EXPECT_EQ(15, HexDigitToInt('F')); + + // Verify the lower case 
as well. + EXPECT_EQ(10, HexDigitToInt('a')); + EXPECT_EQ(11, HexDigitToInt('b')); + EXPECT_EQ(12, HexDigitToInt('c')); + EXPECT_EQ(13, HexDigitToInt('d')); + EXPECT_EQ(14, HexDigitToInt('e')); + EXPECT_EQ(15, HexDigitToInt('f')); +} + +TEST(StringUtilTest, JoinString) { + std::string separator(", "); + std::vector<std::string> parts; + EXPECT_EQ(std::string(), JoinString(parts, separator)); + + parts.push_back(std::string()); + EXPECT_EQ(std::string(), JoinString(parts, separator)); + parts.clear(); + + parts.push_back("a"); + EXPECT_EQ("a", JoinString(parts, separator)); + + parts.push_back("b"); + parts.push_back("c"); + EXPECT_EQ("a, b, c", JoinString(parts, separator)); + + parts.push_back(std::string()); + EXPECT_EQ("a, b, c, ", JoinString(parts, separator)); + parts.push_back(" "); + EXPECT_EQ("a|b|c|| ", JoinString(parts, "|")); +} + +TEST(StringUtilTest, JoinString16) { + string16 separator = ASCIIToUTF16(", "); + std::vector<string16> parts; + EXPECT_EQ(string16(), JoinString(parts, separator)); + + parts.push_back(string16()); + EXPECT_EQ(string16(), JoinString(parts, separator)); + parts.clear(); + + parts.push_back(ASCIIToUTF16("a")); + EXPECT_EQ(ASCIIToUTF16("a"), JoinString(parts, separator)); + + parts.push_back(ASCIIToUTF16("b")); + parts.push_back(ASCIIToUTF16("c")); + EXPECT_EQ(ASCIIToUTF16("a, b, c"), JoinString(parts, separator)); + + parts.push_back(ASCIIToUTF16("")); + EXPECT_EQ(ASCIIToUTF16("a, b, c, "), JoinString(parts, separator)); + parts.push_back(ASCIIToUTF16(" ")); + EXPECT_EQ(ASCIIToUTF16("a|b|c|| "), JoinString(parts, ASCIIToUTF16("|"))); +} + +TEST(StringUtilTest, JoinStringPiece) { + std::string separator(", "); + std::vector<StringPiece> parts; + EXPECT_EQ(std::string(), JoinString(parts, separator)); + + // Test empty first part (https://crbug.com/698073). 
+ parts.push_back(StringPiece()); + EXPECT_EQ(std::string(), JoinString(parts, separator)); + parts.clear(); + + parts.push_back("a"); + EXPECT_EQ("a", JoinString(parts, separator)); + + parts.push_back("b"); + parts.push_back("c"); + EXPECT_EQ("a, b, c", JoinString(parts, separator)); + + parts.push_back(StringPiece()); + EXPECT_EQ("a, b, c, ", JoinString(parts, separator)); + parts.push_back(" "); + EXPECT_EQ("a|b|c|| ", JoinString(parts, "|")); +} + +TEST(StringUtilTest, JoinStringPiece16) { + string16 separator = ASCIIToUTF16(", "); + std::vector<StringPiece16> parts; + EXPECT_EQ(string16(), JoinString(parts, separator)); + + // Test empty first part (https://crbug.com/698073). + parts.push_back(StringPiece16()); + EXPECT_EQ(string16(), JoinString(parts, separator)); + parts.clear(); + + const string16 kA = ASCIIToUTF16("a"); + parts.push_back(kA); + EXPECT_EQ(ASCIIToUTF16("a"), JoinString(parts, separator)); + + const string16 kB = ASCIIToUTF16("b"); + parts.push_back(kB); + const string16 kC = ASCIIToUTF16("c"); + parts.push_back(kC); + EXPECT_EQ(ASCIIToUTF16("a, b, c"), JoinString(parts, separator)); + + parts.push_back(StringPiece16()); + EXPECT_EQ(ASCIIToUTF16("a, b, c, "), JoinString(parts, separator)); + const string16 kSpace = ASCIIToUTF16(" "); + parts.push_back(kSpace); + EXPECT_EQ(ASCIIToUTF16("a|b|c|| "), JoinString(parts, ASCIIToUTF16("|"))); +} + +TEST(StringUtilTest, JoinStringInitializerList) { + std::string separator(", "); + EXPECT_EQ(std::string(), JoinString({}, separator)); + + // Test empty first part (https://crbug.com/698073). + EXPECT_EQ(std::string(), JoinString({StringPiece()}, separator)); + + // With const char*s. + EXPECT_EQ("a", JoinString({"a"}, separator)); + EXPECT_EQ("a, b, c", JoinString({"a", "b", "c"}, separator)); + EXPECT_EQ("a, b, c, ", JoinString({"a", "b", "c", StringPiece()}, separator)); + EXPECT_EQ("a|b|c|| ", JoinString({"a", "b", "c", StringPiece(), " "}, "|")); + + // With std::strings. 
+ const std::string kA = "a"; + const std::string kB = "b"; + EXPECT_EQ("a, b", JoinString({kA, kB}, separator)); + + // With StringPieces. + const StringPiece kPieceA = kA; + const StringPiece kPieceB = kB; + EXPECT_EQ("a, b", JoinString({kPieceA, kPieceB}, separator)); +} + +TEST(StringUtilTest, JoinStringInitializerList16) { + string16 separator = ASCIIToUTF16(", "); + EXPECT_EQ(string16(), JoinString({}, separator)); + + // Test empty first part (https://crbug.com/698073). + EXPECT_EQ(string16(), JoinString({StringPiece16()}, separator)); + + // With string16s. + const string16 kA = ASCIIToUTF16("a"); + EXPECT_EQ(ASCIIToUTF16("a"), JoinString({kA}, separator)); + + const string16 kB = ASCIIToUTF16("b"); + const string16 kC = ASCIIToUTF16("c"); + EXPECT_EQ(ASCIIToUTF16("a, b, c"), JoinString({kA, kB, kC}, separator)); + + EXPECT_EQ(ASCIIToUTF16("a, b, c, "), + JoinString({kA, kB, kC, StringPiece16()}, separator)); + const string16 kSpace = ASCIIToUTF16(" "); + EXPECT_EQ( + ASCIIToUTF16("a|b|c|| "), + JoinString({kA, kB, kC, StringPiece16(), kSpace}, ASCIIToUTF16("|"))); + + // With StringPiece16s. 
+ const StringPiece16 kPieceA = kA; + const StringPiece16 kPieceB = kB; + EXPECT_EQ(ASCIIToUTF16("a, b"), JoinString({kPieceA, kPieceB}, separator)); +} + +TEST(StringUtilTest, StartsWith) { + EXPECT_TRUE(StartsWith("javascript:url", "javascript", + gurl_base::CompareCase::SENSITIVE)); + EXPECT_FALSE(StartsWith("JavaScript:url", "javascript", + gurl_base::CompareCase::SENSITIVE)); + EXPECT_TRUE(StartsWith("javascript:url", "javascript", + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_TRUE(StartsWith("JavaScript:url", "javascript", + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(StartsWith("java", "javascript", gurl_base::CompareCase::SENSITIVE)); + EXPECT_FALSE(StartsWith("java", "javascript", + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(StartsWith(std::string(), "javascript", + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(StartsWith(std::string(), "javascript", + gurl_base::CompareCase::SENSITIVE)); + EXPECT_TRUE(StartsWith("java", std::string(), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_TRUE(StartsWith("java", std::string(), gurl_base::CompareCase::SENSITIVE)); + + EXPECT_TRUE(StartsWith(ASCIIToUTF16("javascript:url"), + ASCIIToUTF16("javascript"), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_FALSE(StartsWith(ASCIIToUTF16("JavaScript:url"), + ASCIIToUTF16("javascript"), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_TRUE(StartsWith(ASCIIToUTF16("javascript:url"), + ASCIIToUTF16("javascript"), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_TRUE(StartsWith(ASCIIToUTF16("JavaScript:url"), + ASCIIToUTF16("javascript"), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(StartsWith(ASCIIToUTF16("java"), ASCIIToUTF16("javascript"), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_FALSE(StartsWith(ASCIIToUTF16("java"), ASCIIToUTF16("javascript"), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(StartsWith(string16(), ASCIIToUTF16("javascript"), + 
gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(StartsWith(string16(), ASCIIToUTF16("javascript"), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_TRUE(StartsWith(ASCIIToUTF16("java"), string16(), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_TRUE(StartsWith(ASCIIToUTF16("java"), string16(), + gurl_base::CompareCase::SENSITIVE)); +} + +TEST(StringUtilTest, EndsWith) { + EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.plugin"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_FALSE(EndsWith(ASCIIToUTF16("Foo.Plugin"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.plugin"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.Plugin"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(EndsWith(ASCIIToUTF16(".plug"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_FALSE(EndsWith(ASCIIToUTF16(".plug"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(EndsWith(ASCIIToUTF16("Foo.plugin Bar"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_FALSE(EndsWith(ASCIIToUTF16("Foo.plugin Bar"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(EndsWith(string16(), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_FALSE(EndsWith(string16(), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.plugin"), string16(), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.plugin"), string16(), + gurl_base::CompareCase::SENSITIVE)); + EXPECT_TRUE(EndsWith(ASCIIToUTF16(".plugin"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_TRUE(EndsWith(ASCIIToUTF16(".plugin"), ASCIIToUTF16(".plugin"), + gurl_base::CompareCase::SENSITIVE)); 
+ EXPECT_TRUE( + EndsWith(string16(), string16(), gurl_base::CompareCase::INSENSITIVE_ASCII)); + EXPECT_TRUE(EndsWith(string16(), string16(), gurl_base::CompareCase::SENSITIVE)); +} + +TEST(StringUtilTest, GetStringFWithOffsets) { + std::vector<string16> subst; + subst.push_back(ASCIIToUTF16("1")); + subst.push_back(ASCIIToUTF16("2")); + std::vector<size_t> offsets; + + ReplaceStringPlaceholders(ASCIIToUTF16("Hello, $1. Your number is $2."), + subst, + &offsets); + EXPECT_EQ(2U, offsets.size()); + EXPECT_EQ(7U, offsets[0]); + EXPECT_EQ(25U, offsets[1]); + offsets.clear(); + + ReplaceStringPlaceholders(ASCIIToUTF16("Hello, $2. Your number is $1."), + subst, + &offsets); + EXPECT_EQ(2U, offsets.size()); + EXPECT_EQ(25U, offsets[0]); + EXPECT_EQ(7U, offsets[1]); + offsets.clear(); +} + +TEST(StringUtilTest, ReplaceStringPlaceholdersTooFew) { + // Test whether replacestringplaceholders works as expected when there + // are fewer inputs than outputs. + std::vector<string16> subst; + subst.push_back(ASCIIToUTF16("9a")); + subst.push_back(ASCIIToUTF16("8b")); + subst.push_back(ASCIIToUTF16("7c")); + + string16 formatted = + ReplaceStringPlaceholders( + ASCIIToUTF16("$1a,$2b,$3c,$4d,$5e,$6f,$1g,$2h,$3i"), subst, nullptr); + + EXPECT_EQ(ASCIIToUTF16("9aa,8bb,7cc,d,e,f,9ag,8bh,7ci"), formatted); +} + +TEST(StringUtilTest, ReplaceStringPlaceholders) { + std::vector<string16> subst; + subst.push_back(ASCIIToUTF16("9a")); + subst.push_back(ASCIIToUTF16("8b")); + subst.push_back(ASCIIToUTF16("7c")); + subst.push_back(ASCIIToUTF16("6d")); + subst.push_back(ASCIIToUTF16("5e")); + subst.push_back(ASCIIToUTF16("4f")); + subst.push_back(ASCIIToUTF16("3g")); + subst.push_back(ASCIIToUTF16("2h")); + subst.push_back(ASCIIToUTF16("1i")); + + string16 formatted = + ReplaceStringPlaceholders( + ASCIIToUTF16("$1a,$2b,$3c,$4d,$5e,$6f,$7g,$8h,$9i"), subst, nullptr); + + EXPECT_EQ(ASCIIToUTF16("9aa,8bb,7cc,6dd,5ee,4ff,3gg,2hh,1ii"), formatted); +} + +TEST(StringUtilTest, 
ReplaceStringPlaceholdersNetExpansionWithContraction) { + // In this test, some of the substitutions are shorter than the placeholders, + // but overall the string gets longer. + std::vector<string16> subst; + subst.push_back(ASCIIToUTF16("9a____")); + subst.push_back(ASCIIToUTF16("B")); + subst.push_back(ASCIIToUTF16("7c___")); + subst.push_back(ASCIIToUTF16("d")); + subst.push_back(ASCIIToUTF16("5e____")); + subst.push_back(ASCIIToUTF16("F")); + subst.push_back(ASCIIToUTF16("3g___")); + subst.push_back(ASCIIToUTF16("h")); + subst.push_back(ASCIIToUTF16("1i_____")); + + string16 original = ASCIIToUTF16("$1a,$2b,$3c,$4d,$5e,$6f,$7g,$8h,$9i"); + string16 expected = + ASCIIToUTF16("9a____a,Bb,7c___c,dd,5e____e,Ff,3g___g,hh,1i_____i"); + + EXPECT_EQ(expected, ReplaceStringPlaceholders(original, subst, nullptr)); + + std::vector<size_t> offsets; + EXPECT_EQ(expected, ReplaceStringPlaceholders(original, subst, &offsets)); + std::vector<size_t> expected_offsets = {0, 8, 11, 18, 21, 29, 32, 39, 42}; + EXPECT_EQ(offsets.size(), subst.size()); + EXPECT_EQ(expected_offsets, offsets); + for (size_t i = 0; i < offsets.size(); i++) { + EXPECT_EQ(expected.substr(expected_offsets[i], subst[i].length()), + subst[i]); + } +} + +TEST(StringUtilTest, ReplaceStringPlaceholdersNetContractionWithExpansion) { + // In this test, some of the substitutions are longer than the placeholders, + // but overall the string gets smaller. Additionally, the placeholders appear + // in a permuted order. 
+ std::vector<string16> subst; + subst.push_back(ASCIIToUTF16("z")); + subst.push_back(ASCIIToUTF16("y")); + subst.push_back(ASCIIToUTF16("XYZW")); + subst.push_back(ASCIIToUTF16("x")); + subst.push_back(ASCIIToUTF16("w")); + + string16 formatted = + ReplaceStringPlaceholders(ASCIIToUTF16("$3_$4$2$1$5"), subst, nullptr); + + EXPECT_EQ(ASCIIToUTF16("XYZW_xyzw"), formatted); +} + +TEST(StringUtilTest, ReplaceStringPlaceholdersOneDigit) { + std::vector<string16> subst; + subst.push_back(ASCIIToUTF16("1a")); + string16 formatted = + ReplaceStringPlaceholders(ASCIIToUTF16(" $16 "), subst, nullptr); + EXPECT_EQ(ASCIIToUTF16(" 1a6 "), formatted); +} + +TEST(StringUtilTest, ReplaceStringPlaceholdersInvalidPlaceholder) { + std::vector<string16> subst; + subst.push_back(ASCIIToUTF16("1a")); + string16 formatted = + ReplaceStringPlaceholders(ASCIIToUTF16("+$-+$A+$1+"), subst, nullptr); + EXPECT_EQ(ASCIIToUTF16("+++1a+"), formatted); +} + +TEST(StringUtilTest, StdStringReplaceStringPlaceholders) { + std::vector<std::string> subst; + subst.push_back("9a"); + subst.push_back("8b"); + subst.push_back("7c"); + subst.push_back("6d"); + subst.push_back("5e"); + subst.push_back("4f"); + subst.push_back("3g"); + subst.push_back("2h"); + subst.push_back("1i"); + + std::string formatted = + ReplaceStringPlaceholders( + "$1a,$2b,$3c,$4d,$5e,$6f,$7g,$8h,$9i", subst, nullptr); + + EXPECT_EQ("9aa,8bb,7cc,6dd,5ee,4ff,3gg,2hh,1ii", formatted); +} + +TEST(StringUtilTest, StdStringReplaceStringPlaceholdersMultipleMatches) { + std::vector<std::string> subst; + subst.push_back("4"); // Referenced twice. + subst.push_back("?"); // Unreferenced. + subst.push_back("!"); // Unreferenced. + subst.push_back("16"); // Referenced once. 
+ + std::string original = "$1 * $1 == $4"; + std::string expected = "4 * 4 == 16"; + EXPECT_EQ(expected, ReplaceStringPlaceholders(original, subst, nullptr)); + std::vector<size_t> offsets; + EXPECT_EQ(expected, ReplaceStringPlaceholders(original, subst, &offsets)); + std::vector<size_t> expected_offsets = {0, 4, 9}; + EXPECT_EQ(expected_offsets, offsets); +} + +TEST(StringUtilTest, ReplaceStringPlaceholdersConsecutiveDollarSigns) { + std::vector<std::string> subst; + subst.push_back("a"); + subst.push_back("b"); + subst.push_back("c"); + EXPECT_EQ(ReplaceStringPlaceholders("$$1 $$$2 $$$$3", subst, nullptr), + "$1 $$2 $$$3"); +} + +TEST(StringUtilTest, LcpyTest) { + // Test the normal case where we fit in our buffer. + { + char dst[10]; + wchar_t wdst[10]; + EXPECT_EQ(7U, strlcpy(dst, "abcdefg", gurl_base::size(dst))); + EXPECT_EQ(0, memcmp(dst, "abcdefg", 8)); + EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", gurl_base::size(wdst))); + EXPECT_EQ(0, memcmp(wdst, L"abcdefg", sizeof(wchar_t) * 8)); + } + + // Test dst_size == 0, nothing should be written to |dst| and we should + // have the equivalent of strlen(src). + { + char dst[2] = {1, 2}; + wchar_t wdst[2] = {1, 2}; + EXPECT_EQ(7U, strlcpy(dst, "abcdefg", 0)); + EXPECT_EQ(1, dst[0]); + EXPECT_EQ(2, dst[1]); + EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", 0)); + EXPECT_EQ(static_cast<wchar_t>(1), wdst[0]); + EXPECT_EQ(static_cast<wchar_t>(2), wdst[1]); + } + + // Test the case were we _just_ competely fit including the null. + { + char dst[8]; + wchar_t wdst[8]; + EXPECT_EQ(7U, strlcpy(dst, "abcdefg", gurl_base::size(dst))); + EXPECT_EQ(0, memcmp(dst, "abcdefg", 8)); + EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", gurl_base::size(wdst))); + EXPECT_EQ(0, memcmp(wdst, L"abcdefg", sizeof(wchar_t) * 8)); + } + + // Test the case were we we are one smaller, so we can't fit the null. 
+ { + char dst[7]; + wchar_t wdst[7]; + EXPECT_EQ(7U, strlcpy(dst, "abcdefg", gurl_base::size(dst))); + EXPECT_EQ(0, memcmp(dst, "abcdef", 7)); + EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", gurl_base::size(wdst))); + EXPECT_EQ(0, memcmp(wdst, L"abcdef", sizeof(wchar_t) * 7)); + } + + // Test the case were we are just too small. + { + char dst[3]; + wchar_t wdst[3]; + EXPECT_EQ(7U, strlcpy(dst, "abcdefg", gurl_base::size(dst))); + EXPECT_EQ(0, memcmp(dst, "ab", 3)); + EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", gurl_base::size(wdst))); + EXPECT_EQ(0, memcmp(wdst, L"ab", sizeof(wchar_t) * 3)); + } +} + +TEST(StringUtilTest, WprintfFormatPortabilityTest) { + static const struct { + const wchar_t* input; + bool portable; + } cases[] = { + { L"%ls", true }, + { L"%s", false }, + { L"%S", false }, + { L"%lS", false }, + { L"Hello, %s", false }, + { L"%lc", true }, + { L"%c", false }, + { L"%C", false }, + { L"%lC", false }, + { L"%ls %s", false }, + { L"%s %ls", false }, + { L"%s %ls %s", false }, + { L"%f", true }, + { L"%f %F", false }, + { L"%d %D", false }, + { L"%o %O", false }, + { L"%u %U", false }, + { L"%f %d %o %u", true }, + { L"%-8d (%02.1f%)", true }, + { L"% 10s", false }, + { L"% 10ls", true } + }; + for (const auto& i : cases) + EXPECT_EQ(i.portable, IsWprintfFormatPortable(i.input)); +} + +TEST(StringUtilTest, RemoveChars) { + const char kRemoveChars[] = "-/+*"; + std::string input = "A-+bc/d!*"; + EXPECT_TRUE(RemoveChars(input, kRemoveChars, &input)); + EXPECT_EQ("Abcd!", input); + + // No characters match kRemoveChars. + EXPECT_FALSE(RemoveChars(input, kRemoveChars, &input)); + EXPECT_EQ("Abcd!", input); + + // Empty string. 
+ input.clear(); + EXPECT_FALSE(RemoveChars(input, kRemoveChars, &input)); + EXPECT_EQ(std::string(), input); +} + +TEST(StringUtilTest, ReplaceChars) { + struct TestData { + const char* input; + const char* replace_chars; + const char* replace_with; + const char* output; + bool result; + } cases[] = { + {"", "", "", "", false}, + {"t", "t", "t", "t", true}, + {"a", "b", "c", "a", false}, + {"b", "b", "c", "c", true}, + {"bob", "b", "p", "pop", true}, + {"bob", "o", "i", "bib", true}, + {"test", "", "", "test", false}, + {"test", "", "!", "test", false}, + {"test", "z", "!", "test", false}, + {"test", "e", "!", "t!st", true}, + {"test", "e", "!?", "t!?st", true}, + {"test", "ez", "!", "t!st", true}, + {"test", "zed", "!?", "t!?st", true}, + {"test", "t", "!?", "!?es!?", true}, + {"test", "et", "!>", "!>!>s!>", true}, + {"test", "zest", "!", "!!!!", true}, + {"test", "szt", "!", "!e!!", true}, + {"test", "t", "test", "testestest", true}, + {"tetst", "t", "test", "testeteststest", true}, + {"ttttttt", "t", "-", "-------", true}, + {"aAaAaAAaAAa", "A", "", "aaaaa", true}, + {"xxxxxxxxxx", "x", "", "", true}, + {"xxxxxxxxxx", "x", "x", "xxxxxxxxxx", true}, + {"xxxxxxxxxx", "x", "y-", "y-y-y-y-y-y-y-y-y-y-", true}, + {"xxxxxxxxxx", "x", "xy", "xyxyxyxyxyxyxyxyxyxy", true}, + {"xxxxxxxxxx", "x", "zyx", "zyxzyxzyxzyxzyxzyxzyxzyxzyxzyx", true}, + {"xaxxaxxxaxxxax", "x", "xy", "xyaxyxyaxyxyxyaxyxyxyaxy", true}, + {"-xaxxaxxxaxxxax-", "x", "xy", "-xyaxyxyaxyxyxyaxyxyxyaxy-", true}, + }; + + for (const TestData& scenario : cases) { + // Test with separate output and input vars. + std::string output; + bool result = ReplaceChars(scenario.input, scenario.replace_chars, + scenario.replace_with, &output); + EXPECT_EQ(scenario.result, result) << scenario.input; + EXPECT_EQ(scenario.output, output); + } + + for (const TestData& scenario : cases) { + // Test with an input/output var of limited capacity. 
+ std::string input_output = scenario.input; + input_output.shrink_to_fit(); + bool result = ReplaceChars(input_output, scenario.replace_chars, + scenario.replace_with, &input_output); + EXPECT_EQ(scenario.result, result) << scenario.input; + EXPECT_EQ(scenario.output, input_output); + } + + for (const TestData& scenario : cases) { + // Test with an input/output var of ample capacity; should + // not realloc. + std::string input_output = scenario.input; + input_output.reserve(strlen(scenario.output) * 2); + const void* original_buffer = input_output.data(); + bool result = ReplaceChars(input_output, scenario.replace_chars, + scenario.replace_with, &input_output); + EXPECT_EQ(scenario.result, result) << scenario.input; + EXPECT_EQ(scenario.output, input_output); + EXPECT_EQ(original_buffer, input_output.data()); + } +} + +TEST(StringUtilTest, ContainsOnlyChars) { + // Providing an empty list of characters should return false but for the empty + // string. + EXPECT_TRUE(ContainsOnlyChars(std::string(), std::string())); + EXPECT_FALSE(ContainsOnlyChars("Hello", std::string())); + + EXPECT_TRUE(ContainsOnlyChars(std::string(), "1234")); + EXPECT_TRUE(ContainsOnlyChars("1", "1234")); + EXPECT_TRUE(ContainsOnlyChars("1", "4321")); + EXPECT_TRUE(ContainsOnlyChars("123", "4321")); + EXPECT_FALSE(ContainsOnlyChars("123a", "4321")); + + EXPECT_TRUE(ContainsOnlyChars(std::string(), kWhitespaceASCII)); + EXPECT_TRUE(ContainsOnlyChars(" ", kWhitespaceASCII)); + EXPECT_TRUE(ContainsOnlyChars("\t", kWhitespaceASCII)); + EXPECT_TRUE(ContainsOnlyChars("\t \r \n ", kWhitespaceASCII)); + EXPECT_FALSE(ContainsOnlyChars("a", kWhitespaceASCII)); + EXPECT_FALSE(ContainsOnlyChars("\thello\r \n ", kWhitespaceASCII)); + + EXPECT_TRUE(ContainsOnlyChars(string16(), kWhitespaceUTF16)); + EXPECT_TRUE(ContainsOnlyChars(ASCIIToUTF16(" "), kWhitespaceUTF16)); + EXPECT_TRUE(ContainsOnlyChars(ASCIIToUTF16("\t"), kWhitespaceUTF16)); + EXPECT_TRUE(ContainsOnlyChars(ASCIIToUTF16("\t \r \n "), 
kWhitespaceUTF16)); + EXPECT_FALSE(ContainsOnlyChars(ASCIIToUTF16("a"), kWhitespaceUTF16)); + EXPECT_FALSE(ContainsOnlyChars(ASCIIToUTF16("\thello\r \n "), + kWhitespaceUTF16)); +} + +TEST(StringUtilTest, CompareCaseInsensitiveASCII) { + EXPECT_EQ(0, CompareCaseInsensitiveASCII("", "")); + EXPECT_EQ(0, CompareCaseInsensitiveASCII("Asdf", "aSDf")); + + // Differing lengths. + EXPECT_EQ(-1, CompareCaseInsensitiveASCII("Asdf", "aSDfA")); + EXPECT_EQ(1, CompareCaseInsensitiveASCII("AsdfA", "aSDf")); + + // Differing values. + EXPECT_EQ(-1, CompareCaseInsensitiveASCII("AsdfA", "aSDfb")); + EXPECT_EQ(1, CompareCaseInsensitiveASCII("Asdfb", "aSDfA")); +} + +TEST(StringUtilTest, EqualsCaseInsensitiveASCII) { + EXPECT_TRUE(EqualsCaseInsensitiveASCII("", "")); + EXPECT_TRUE(EqualsCaseInsensitiveASCII("Asdf", "aSDF")); + EXPECT_FALSE(EqualsCaseInsensitiveASCII("bsdf", "aSDF")); + EXPECT_FALSE(EqualsCaseInsensitiveASCII("Asdf", "aSDFz")); +} + +TEST(StringUtilTest, IsUnicodeWhitespace) { + // NOT unicode white space. + EXPECT_FALSE(IsUnicodeWhitespace(L'\0')); + EXPECT_FALSE(IsUnicodeWhitespace(L'A')); + EXPECT_FALSE(IsUnicodeWhitespace(L'0')); + EXPECT_FALSE(IsUnicodeWhitespace(L'.')); + EXPECT_FALSE(IsUnicodeWhitespace(L';')); + EXPECT_FALSE(IsUnicodeWhitespace(L'\x4100')); + + // Actual unicode whitespace. 
+ EXPECT_TRUE(IsUnicodeWhitespace(L' ')); + EXPECT_TRUE(IsUnicodeWhitespace(L'\xa0')); + EXPECT_TRUE(IsUnicodeWhitespace(L'\x3000')); + EXPECT_TRUE(IsUnicodeWhitespace(L'\t')); + EXPECT_TRUE(IsUnicodeWhitespace(L'\r')); + EXPECT_TRUE(IsUnicodeWhitespace(L'\v')); + EXPECT_TRUE(IsUnicodeWhitespace(L'\f')); + EXPECT_TRUE(IsUnicodeWhitespace(L'\n')); +} + +class WriteIntoTest : public testing::Test { + protected: + static void WritesCorrectly(size_t num_chars) { + std::string buffer; + char kOriginal[] = "supercali"; + strncpy(WriteInto(&buffer, num_chars + 1), kOriginal, num_chars); + // Using std::string(buffer.c_str()) instead of |buffer| truncates the + // string at the first \0. + EXPECT_EQ( + std::string(kOriginal, std::min(num_chars, gurl_base::size(kOriginal) - 1)), + std::string(buffer.c_str())); + EXPECT_EQ(num_chars, buffer.size()); + } +}; + +TEST_F(WriteIntoTest, WriteInto) { + // Validate that WriteInto reserves enough space and + // sizes a string correctly. + WritesCorrectly(1); + WritesCorrectly(2); + WritesCorrectly(5000); + + // Validate that WriteInto doesn't modify other strings + // when using a Copy-on-Write implementation. + const char kLive[] = "live"; + const char kDead[] = "dead"; + const std::string live = kLive; + std::string dead = live; + strncpy(WriteInto(&dead, 5), kDead, 4); + EXPECT_EQ(kDead, dead); + EXPECT_EQ(4u, dead.size()); + EXPECT_EQ(kLive, live); + EXPECT_EQ(4u, live.size()); +} + +} // namespace base
diff --git a/base/strings/string_util_win.h b/base/strings/string_util_win.h new file mode 100644 index 0000000..710d574 --- /dev/null +++ b/base/strings/string_util_win.h
@@ -0,0 +1,63 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Inline gurl_base wrappers implementing strdup(), vsnprintf() and vswprintf()
+// on top of the Windows CRT's _strdup, vsnprintf_s and _vsnwprintf_s.
+//
+// NOTE(review): IsWprintfFormatPortable() is used below but is not declared in
+// this header; presumably it is only included where that declaration is
+// already visible -- confirm.
+
+#ifndef BASE_STRINGS_STRING_UTIL_WIN_H_
+#define BASE_STRINGS_STRING_UTIL_WIN_H_
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+
+#include "polyfills/base/logging.h"
+
+namespace gurl_base {
+
+// Chromium code style is to not use malloc'd strings; this is only for use
+// for interaction with APIs that require it.
+// Returns whatever _strdup(str) returns; since the copy is malloc'd, the
+// caller is presumably responsible for free()ing it -- confirm.
+inline char* strdup(const char* str) {
+  return _strdup(str);
+}
+
+// Formats |arguments| per |format| into |buffer| (capacity |size| chars) using
+// vsnprintf_s with a maximum count of |size| - 1.
+// Returns the vsnprintf_s result when it is non-negative. Otherwise returns
+// _vscprintf(format, arguments), presumably the length the complete output
+// would need (mirroring C99 vsnprintf on truncation) -- confirm.
+// NOTE(review): |size| is unsigned, so |size| - 1 wraps around when |size| is
+// 0; callers are assumed to pass a non-empty buffer -- confirm.
+inline int vsnprintf(char* buffer, size_t size,
+                     const char* format, va_list arguments) {
+  int length = vsnprintf_s(buffer, size, size - 1, format, arguments);
+  if (length < 0)
+    return _vscprintf(format, arguments);
+  return length;
+}
+
+// Wide-character counterpart of vsnprintf() above, built on _vsnwprintf_s and
+// _vscwprintf with the same fallback and the same |size| - 1 caveat.
+// |format| is first checked with GURL_DCHECK(IsWprintfFormatPortable(format)).
+inline int vswprintf(wchar_t* buffer, size_t size,
+                     const wchar_t* format, va_list arguments) {
+  GURL_DCHECK(IsWprintfFormatPortable(format));
+
+  int length = _vsnwprintf_s(buffer, size, size - 1, format, arguments);
+  if (length < 0)
+    return _vscwprintf(format, arguments);
+  return length;
+}
+
+}  // namespace gurl_base
+
+#endif  // BASE_STRINGS_STRING_UTIL_WIN_H_
diff --git a/base/strings/stringize_macros.h b/base/strings/stringize_macros.h new file mode 100644 index 0000000..d4e2707 --- /dev/null +++ b/base/strings/stringize_macros.h
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This file defines preprocessor macros for stringizing preprocessor
// symbols (or their output) and manipulating preprocessor symbols
// that define strings.

#ifndef BASE_STRINGS_STRINGIZE_MACROS_H_
#define BASE_STRINGS_STRINGIZE_MACROS_H_

#include "build/build_config.h"

// Turns the argument into a string literal exactly as spelled at the call
// site (the `#` operator suppresses macro expansion of its operand).
//
// This is not very useful as it does not expand defined symbols if
// called directly. Use its counterpart without the _NO_EXPANSION
// suffix, below.
#define STRINGIZE_NO_EXPANSION(x) #x

// Use this to quote the provided parameter, first expanding it if it
// is a preprocessor symbol. The extra level of indirection is what lets the
// preprocessor expand |x| before it reaches the `#` operator.
//
// For example, if:
//   #define A FOO
//   #define B(x) myobj->FunctionCall(x)
//
// Then:
//   STRINGIZE(A) produces "FOO"
//   STRINGIZE(B(y)) produces "myobj->FunctionCall(y)"
#define STRINGIZE(x) STRINGIZE_NO_EXPANSION(x)

#endif  // BASE_STRINGS_STRINGIZE_MACROS_H_
diff --git a/base/strings/stringize_macros_unittest.cc b/base/strings/stringize_macros_unittest.cc new file mode 100644 index 0000000..d7f9e56 --- /dev/null +++ b/base/strings/stringize_macros_unittest.cc
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/stringize_macros.h"

#include "testing/gtest/include/gtest/gtest.h"

// Macros as per documentation in header file: an object-like macro, a
// function-like macro, and a macro that already expands to a string literal.
#define PREPROCESSOR_UTIL_UNITTEST_A FOO
#define PREPROCESSOR_UTIL_UNITTEST_B(x) myobj->FunctionCall(x)
#define PREPROCESSOR_UTIL_UNITTEST_C "foo"

// Checks that STRINGIZE_NO_EXPANSION quotes the macro name itself while
// STRINGIZE quotes the macro's expansion.
TEST(StringizeTest, Ansi) {
  // Without expansion the argument text is quoted verbatim.
  EXPECT_STREQ(
      "PREPROCESSOR_UTIL_UNITTEST_A",
      STRINGIZE_NO_EXPANSION(PREPROCESSOR_UTIL_UNITTEST_A));
  EXPECT_STREQ(
      "PREPROCESSOR_UTIL_UNITTEST_B(y)",
      STRINGIZE_NO_EXPANSION(PREPROCESSOR_UTIL_UNITTEST_B(y)));
  EXPECT_STREQ(
      "PREPROCESSOR_UTIL_UNITTEST_C",
      STRINGIZE_NO_EXPANSION(PREPROCESSOR_UTIL_UNITTEST_C));

  // With expansion the macro's replacement text is quoted; a replacement that
  // is itself a string literal keeps its (escaped) quotes.
  EXPECT_STREQ("FOO", STRINGIZE(PREPROCESSOR_UTIL_UNITTEST_A));
  EXPECT_STREQ("myobj->FunctionCall(y)",
               STRINGIZE(PREPROCESSOR_UTIL_UNITTEST_B(y)));
  EXPECT_STREQ("\"foo\"", STRINGIZE(PREPROCESSOR_UTIL_UNITTEST_C));
}
diff --git a/base/strings/stringprintf.cc b/base/strings/stringprintf.cc new file mode 100644 index 0000000..1a08ffb --- /dev/null +++ b/base/strings/stringprintf.cc
@@ -0,0 +1,187 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/stringprintf.h" + +#include <errno.h> +#include <stddef.h> + +#include <vector> + +#include "base/scoped_clear_last_error.h" +#include "base/stl_util.h" +#include "base/strings/string_util.h" +#include "base/strings/utf_string_conversions.h" +#include "build/build_config.h" + +namespace gurl_base { + +namespace { + +// Overloaded wrappers around vsnprintf and vswprintf. The buf_size parameter +// is the size of the buffer. These return the number of characters in the +// formatted string excluding the NUL terminator. If the buffer is not +// large enough to accommodate the formatted string without truncation, they +// return the number of characters that would be in the fully-formatted string +// (vsnprintf, and vswprintf on Windows), or -1 (vswprintf on POSIX platforms). +inline int vsnprintfT(char* buffer, + size_t buf_size, + const char* format, + va_list argptr) { + return gurl_base::vsnprintf(buffer, buf_size, format, argptr); +} + +#if defined(OS_WIN) +inline int vsnprintfT(wchar_t* buffer, + size_t buf_size, + const wchar_t* format, + va_list argptr) { + return gurl_base::vswprintf(buffer, buf_size, format, argptr); +} +#endif + +// Templatized backend for StringPrintF/StringAppendF. This does not finalize +// the va_list, the caller is expected to do that. +template <class StringType> +static void StringAppendVT(StringType* dst, + const typename StringType::value_type* format, + va_list ap) { + // First try with a small fixed size buffer. + // This buffer size should be kept in sync with StringUtilTest.GrowBoundary + // and StringUtilTest.StringPrintfBounds. 
+ typename StringType::value_type stack_buf[1024]; + + va_list ap_copy; + va_copy(ap_copy, ap); + + gurl_base::internal::ScopedClearLastError last_error; + int result = vsnprintfT(stack_buf, gurl_base::size(stack_buf), format, ap_copy); + va_end(ap_copy); + + if (result >= 0 && result < static_cast<int>(gurl_base::size(stack_buf))) { + // It fit. + dst->append(stack_buf, result); + return; + } + + // Repeatedly increase buffer size until it fits. + int mem_length = gurl_base::size(stack_buf); + while (true) { + if (result < 0) { +#if defined(OS_WIN) + // On Windows, vsnprintfT always returns the number of characters in a + // fully-formatted string, so if we reach this point, something else is + // wrong and no amount of buffer-doubling is going to fix it. + return; +#else + if (errno != 0 && errno != EOVERFLOW) + return; + // Try doubling the buffer size. + mem_length *= 2; +#endif + } else { + // We need exactly "result + 1" characters. + mem_length = result + 1; + } + + if (mem_length > 32 * 1024 * 1024) { + // That should be plenty, don't try anything larger. This protects + // against huge allocations when using vsnprintfT implementations that + // return -1 for reasons other than overflow without setting errno. + GURL_DLOG(WARNING) << "Unable to printf the requested string due to size."; + return; + } + + std::vector<typename StringType::value_type> mem_buf(mem_length); + + // NOTE: You can only use a va_list once. Since we're in a while loop, we + // need to make a new copy each time so we don't use up the original. + va_copy(ap_copy, ap); + result = vsnprintfT(&mem_buf[0], mem_length, format, ap_copy); + va_end(ap_copy); + + if ((result >= 0) && (result < mem_length)) { + // It fit. + dst->append(&mem_buf[0], result); + return; + } + } +} + +} // namespace + +std::string StringPrintf(const char* format, ...) 
{ + va_list ap; + va_start(ap, format); + std::string result; + StringAppendV(&result, format, ap); + va_end(ap); + return result; +} + +#if defined(OS_WIN) +std::wstring StringPrintf(const wchar_t* format, ...) { + va_list ap; + va_start(ap, format); + std::wstring result; + StringAppendV(&result, format, ap); + va_end(ap); + return result; +} +#endif + +std::string StringPrintV(const char* format, va_list ap) { + std::string result; + StringAppendV(&result, format, ap); + return result; +} + +const std::string& SStringPrintf(std::string* dst, const char* format, ...) { + va_list ap; + va_start(ap, format); + dst->clear(); + StringAppendV(dst, format, ap); + va_end(ap); + return *dst; +} + +#if defined(OS_WIN) +const std::wstring& SStringPrintf(std::wstring* dst, + const wchar_t* format, ...) { + va_list ap; + va_start(ap, format); + dst->clear(); + StringAppendV(dst, format, ap); + va_end(ap); + return *dst; +} +#endif + +void StringAppendF(std::string* dst, const char* format, ...) { + va_list ap; + va_start(ap, format); + StringAppendV(dst, format, ap); + va_end(ap); +} + +#if defined(OS_WIN) +void StringAppendF(std::wstring* dst, const wchar_t* format, ...) { + va_list ap; + va_start(ap, format); + StringAppendV(dst, format, ap); + va_end(ap); +} +#endif + +void StringAppendV(std::string* dst, const char* format, va_list ap) { + StringAppendVT(dst, format, ap); +} + +#if defined(OS_WIN) +void StringAppendV(std::wstring* dst, const wchar_t* format, va_list ap) { + StringAppendVT(dst, format, ap); +} +#endif + +} // namespace base
diff --git a/base/strings/stringprintf.h b/base/strings/stringprintf.h new file mode 100644 index 0000000..2abdb68 --- /dev/null +++ b/base/strings/stringprintf.h
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_STRINGS_STRINGPRINTF_H_
#define BASE_STRINGS_STRINGPRINTF_H_

#include <stdarg.h>  // va_list

#include <string>

#include "polyfills/base/base_export.h"
#include "base/compiler_specific.h"
#include "build/build_config.h"

namespace gurl_base {

// Return a C++ string given printf-like input.
// The wide-character overloads in this header exist only on Windows.
BASE_EXPORT std::string StringPrintf(const char* format, ...)
    PRINTF_FORMAT(1, 2) WARN_UNUSED_RESULT;
#if defined(OS_WIN)
BASE_EXPORT std::wstring StringPrintf(const wchar_t* format, ...)
    WPRINTF_FORMAT(1, 2) WARN_UNUSED_RESULT;
#endif

// Return a C++ string given vprintf-like input.
BASE_EXPORT std::string StringPrintV(const char* format, va_list ap)
    PRINTF_FORMAT(1, 0) WARN_UNUSED_RESULT;

// Store result into a supplied string and return it. Any previous contents
// of |dst| are discarded.
BASE_EXPORT const std::string& SStringPrintf(std::string* dst,
                                             const char* format,
                                             ...) PRINTF_FORMAT(2, 3);
#if defined(OS_WIN)
BASE_EXPORT const std::wstring& SStringPrintf(std::wstring* dst,
                                              const wchar_t* format,
                                              ...) WPRINTF_FORMAT(2, 3);
#endif

// Append result to a supplied string.
BASE_EXPORT void StringAppendF(std::string* dst, const char* format, ...)
    PRINTF_FORMAT(2, 3);
#if defined(OS_WIN)
BASE_EXPORT void StringAppendF(std::wstring* dst, const wchar_t* format, ...)
    WPRINTF_FORMAT(2, 3);
#endif

// Lower-level routine that takes a va_list and appends to a specified
// string.  All other routines are just convenience wrappers around it.
BASE_EXPORT void StringAppendV(std::string* dst, const char* format, va_list ap)
    PRINTF_FORMAT(2, 0);
#if defined(OS_WIN)
BASE_EXPORT void StringAppendV(std::wstring* dst,
                               const wchar_t* format,
                               va_list ap) WPRINTF_FORMAT(2, 0);
#endif

}  // namespace gurl_base

#endif  // BASE_STRINGS_STRINGPRINTF_H_
diff --git a/base/strings/stringprintf_unittest.cc b/base/strings/stringprintf_unittest.cc new file mode 100644 index 0000000..59e3403 --- /dev/null +++ b/base/strings/stringprintf_unittest.cc
@@ -0,0 +1,182 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/stringprintf.h" + +#include <errno.h> +#include <stddef.h> + +#include "base/macros.h" +#include "build/build_config.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace gurl_base { + +namespace { + +// A helper for the StringAppendV test that follows. +// +// Just forwards its args to StringAppendV. +static void StringAppendVTestHelper(std::string* out, const char* format, ...) { + va_list ap; + va_start(ap, format); + StringAppendV(out, format, ap); + va_end(ap); +} + +} // namespace + +TEST(StringPrintfTest, StringPrintfEmpty) { + EXPECT_EQ("", StringPrintf("%s", "")); +} + +TEST(StringPrintfTest, StringPrintfMisc) { + EXPECT_EQ("123hello w", StringPrintf("%3d%2s %1c", 123, "hello", 'w')); +#if defined(OS_WIN) + EXPECT_EQ(L"123hello w", StringPrintf(L"%3d%2ls %1lc", 123, L"hello", 'w')); +#endif +} + +TEST(StringPrintfTest, StringAppendfEmptyString) { + std::string value("Hello"); + StringAppendF(&value, "%s", ""); + EXPECT_EQ("Hello", value); + +#if defined(OS_WIN) + std::wstring valuew(L"Hello"); + StringAppendF(&valuew, L"%ls", L""); + EXPECT_EQ(L"Hello", valuew); +#endif +} + +TEST(StringPrintfTest, StringAppendfString) { + std::string value("Hello"); + StringAppendF(&value, " %s", "World"); + EXPECT_EQ("Hello World", value); + +#if defined(OS_WIN) + std::wstring valuew(L"Hello"); + StringAppendF(&valuew, L" %ls", L"World"); + EXPECT_EQ(L"Hello World", valuew); +#endif +} + +TEST(StringPrintfTest, StringAppendfInt) { + std::string value("Hello"); + StringAppendF(&value, " %d", 123); + EXPECT_EQ("Hello 123", value); + +#if defined(OS_WIN) + std::wstring valuew(L"Hello"); + StringAppendF(&valuew, L" %d", 123); + EXPECT_EQ(L"Hello 123", valuew); +#endif +} + +// Make sure that lengths exactly around the initial buffer size are handled +// 
correctly. +TEST(StringPrintfTest, StringPrintfBounds) { + const int kSrcLen = 1026; + char src[kSrcLen]; + for (auto& i : src) + i = 'A'; + + wchar_t srcw[kSrcLen]; + for (auto& i : srcw) + i = 'A'; + + for (int i = 1; i < 3; i++) { + src[kSrcLen - i] = 0; + std::string out; + SStringPrintf(&out, "%s", src); + EXPECT_STREQ(src, out.c_str()); + +#if defined(OS_WIN) + srcw[kSrcLen - i] = 0; + std::wstring outw; + SStringPrintf(&outw, L"%ls", srcw); + EXPECT_STREQ(srcw, outw.c_str()); +#endif + } +} + +// Test very large sprintfs that will cause the buffer to grow. +TEST(StringPrintfTest, Grow) { + char src[1026]; + for (auto& i : src) + i = 'A'; + src[1025] = 0; + + const char fmt[] = "%sB%sB%sB%sB%sB%sB%s"; + + std::string out; + SStringPrintf(&out, fmt, src, src, src, src, src, src, src); + + const int kRefSize = 320000; + char* ref = new char[kRefSize]; +#if defined(OS_WIN) + sprintf_s(ref, kRefSize, fmt, src, src, src, src, src, src, src); +#elif defined(OS_POSIX) || defined(OS_FUCHSIA) + snprintf(ref, kRefSize, fmt, src, src, src, src, src, src, src); +#endif + + EXPECT_STREQ(ref, out.c_str()); + delete[] ref; +} + +TEST(StringPrintfTest, StringAppendV) { + std::string out; + StringAppendVTestHelper(&out, "%d foo %s", 1, "bar"); + EXPECT_EQ("1 foo bar", out); +} + +// Test the boundary condition for the size of the string_util's +// internal buffer. +TEST(StringPrintfTest, GrowBoundary) { + const int kStringUtilBufLen = 1024; + // Our buffer should be one larger than the size of StringAppendVT's stack + // buffer. + // And need extra one for NULL-terminator. + const int kBufLen = kStringUtilBufLen + 1 + 1; + char src[kBufLen]; + for (int i = 0; i < kBufLen - 1; ++i) + src[i] = 'a'; + src[kBufLen - 1] = 0; + + std::string out; + SStringPrintf(&out, "%s", src); + + EXPECT_STREQ(src, out.c_str()); +} + +#if defined(OS_WIN) +// vswprintf in Visual Studio 2013 fails when given U+FFFF. This tests that the +// failure case is gracefuly handled. 
In Visual Studio 2015 the bad character +// is passed through. +TEST(StringPrintfTest, Invalid) { + wchar_t invalid[2]; + invalid[0] = 0xffff; + invalid[1] = 0; + + std::wstring out; + SStringPrintf(&out, L"%ls", invalid); +#if _MSC_VER >= 1900 + EXPECT_STREQ(invalid, out.c_str()); +#else + EXPECT_STREQ(L"", out.c_str()); +#endif +} +#endif + +// Test that StringPrintf and StringAppendV do not change errno. +TEST(StringPrintfTest, StringPrintfErrno) { + errno = 1; + EXPECT_EQ("", StringPrintf("%s", "")); + EXPECT_EQ(1, errno); + std::string out; + StringAppendVTestHelper(&out, "%d foo %s", 1, "bar"); + EXPECT_EQ(1, errno); +} + +} // namespace base
diff --git a/base/strings/sys_string_conversions.h b/base/strings/sys_string_conversions.h new file mode 100644 index 0000000..08082ae --- /dev/null +++ b/base/strings/sys_string_conversions.h
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_STRINGS_SYS_STRING_CONVERSIONS_H_
#define BASE_STRINGS_SYS_STRING_CONVERSIONS_H_

// Provides system-dependent string type conversions for cases where it's
// necessary to not use ICU. Generally, you should not need this in Chrome,
// but it is used in some shared code. Dependencies should be minimal.

#include <stdint.h>

#include <string>

#include "polyfills/base/base_export.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"
#include "build/build_config.h"

#if defined(OS_MACOSX)
#include <CoreFoundation/CoreFoundation.h>
// Forward-declare NSString so this header can be included from both
// Objective-C++ and plain C++ translation units.
#ifdef __OBJC__
@class NSString;
#else
class NSString;
#endif
#endif  // OS_MACOSX

namespace gurl_base {

// Converts between wide and UTF-8 representations of a string. On error, the
// result is system-dependent.
BASE_EXPORT std::string SysWideToUTF8(const std::wstring& wide);
BASE_EXPORT std::wstring SysUTF8ToWide(StringPiece utf8);

// Converts between wide and the system multi-byte representations of a string.
// DANGER: This will lose information and can change (on Windows, this can
// change between reboots).
BASE_EXPORT std::string SysWideToNativeMB(const std::wstring& wide);
BASE_EXPORT std::wstring SysNativeMBToWide(StringPiece native_mb);

// Windows-specific ------------------------------------------------------------

#if defined(OS_WIN)

// Converts between 8-bit and wide strings, using the given code page. The
// code page identifier is one accepted by the Windows function
// MultiByteToWideChar().
BASE_EXPORT std::wstring SysMultiByteToWide(StringPiece mb, uint32_t code_page);
BASE_EXPORT std::string SysWideToMultiByte(const std::wstring& wide,
                                           uint32_t code_page);

#endif  // defined(OS_WIN)

// Mac-specific ----------------------------------------------------------------

#if defined(OS_MACOSX)

// Converts between STL strings and CFStringRefs/NSStrings.

// Creates a string, and returns it with a refcount of 1. You are responsible
// for releasing it. Returns NULL on failure.
BASE_EXPORT CFStringRef SysUTF8ToCFStringRef(StringPiece utf8);
BASE_EXPORT CFStringRef SysUTF16ToCFStringRef(StringPiece16 utf16);

// Same, but returns an autoreleased NSString.
BASE_EXPORT NSString* SysUTF8ToNSString(StringPiece utf8);
BASE_EXPORT NSString* SysUTF16ToNSString(StringPiece16 utf16);

// Converts a CFStringRef to an STL string. Returns an empty string on failure.
BASE_EXPORT std::string SysCFStringRefToUTF8(CFStringRef ref);
BASE_EXPORT string16 SysCFStringRefToUTF16(CFStringRef ref);

// Same, but accepts NSString input. Converts nil NSString* to the appropriate
// string type of length 0.
BASE_EXPORT std::string SysNSStringToUTF8(NSString* ref);
BASE_EXPORT string16 SysNSStringToUTF16(NSString* ref);

#endif  // defined(OS_MACOSX)

}  // namespace gurl_base

#endif  // BASE_STRINGS_SYS_STRING_CONVERSIONS_H_
diff --git a/base/strings/sys_string_conversions_posix.cc b/base/strings/sys_string_conversions_posix.cc new file mode 100644 index 0000000..80f01e6 --- /dev/null +++ b/base/strings/sys_string_conversions_posix.cc
@@ -0,0 +1,162 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/sys_string_conversions.h" + +#include <stddef.h> +#include <wchar.h> + +#include "base/strings/string_piece.h" +#include "base/strings/utf_string_conversions.h" +#include "build/build_config.h" + +namespace gurl_base { + +std::string SysWideToUTF8(const std::wstring& wide) { + // In theory this should be using the system-provided conversion rather + // than our ICU, but this will do for now. + return WideToUTF8(wide); +} +std::wstring SysUTF8ToWide(StringPiece utf8) { + // In theory this should be using the system-provided conversion rather + // than our ICU, but this will do for now. + std::wstring out; + UTF8ToWide(utf8.data(), utf8.size(), &out); + return out; +} + +#if defined(SYSTEM_NATIVE_UTF8) || defined(OS_ANDROID) +// TODO(port): Consider reverting the OS_ANDROID when we have wcrtomb() +// support and a better understanding of what calls these routines. + +std::string SysWideToNativeMB(const std::wstring& wide) { + return WideToUTF8(wide); +} + +std::wstring SysNativeMBToWide(StringPiece native_mb) { + return SysUTF8ToWide(native_mb); +} + +#else + +std::string SysWideToNativeMB(const std::wstring& wide) { + mbstate_t ps; + + // Calculate the number of multi-byte characters. We walk through the string + // without writing the output, counting the number of multi-byte characters. + size_t num_out_chars = 0; + memset(&ps, 0, sizeof(ps)); + for (auto src : wide) { + // Use a temp buffer since calling wcrtomb with an output of NULL does not + // calculate the output length. + char buf[16]; + // Skip NULLs to avoid wcrtomb's special handling of them. + size_t res = src ? wcrtomb(buf, src, &ps) : 0; + switch (res) { + // Handle any errors and return an empty string. 
+ case static_cast<size_t>(-1): + return std::string(); + break; + case 0: + // We hit an embedded null byte, keep going. + ++num_out_chars; + break; + default: + num_out_chars += res; + break; + } + } + + if (num_out_chars == 0) + return std::string(); + + std::string out; + out.resize(num_out_chars); + + // We walk the input string again, with |i| tracking the index of the + // wide input, and |j| tracking the multi-byte output. + memset(&ps, 0, sizeof(ps)); + for (size_t i = 0, j = 0; i < wide.size(); ++i) { + const wchar_t src = wide[i]; + // We don't want wcrtomb to do its funkiness for embedded NULLs. + size_t res = src ? wcrtomb(&out[j], src, &ps) : 0; + switch (res) { + // Handle any errors and return an empty string. + case static_cast<size_t>(-1): + return std::string(); + break; + case 0: + // We hit an embedded null byte, keep going. + ++j; // Output is already zeroed. + break; + default: + j += res; + break; + } + } + + return out; +} + +std::wstring SysNativeMBToWide(StringPiece native_mb) { + mbstate_t ps; + + // Calculate the number of wide characters. We walk through the string + // without writing the output, counting the number of wide characters. + size_t num_out_chars = 0; + memset(&ps, 0, sizeof(ps)); + for (size_t i = 0; i < native_mb.size(); ) { + const char* src = native_mb.data() + i; + size_t res = mbrtowc(nullptr, src, native_mb.size() - i, &ps); + switch (res) { + // Handle any errors and return an empty string. + case static_cast<size_t>(-2): + case static_cast<size_t>(-1): + return std::wstring(); + break; + case 0: + // We hit an embedded null byte, keep going. + i += 1; + FALLTHROUGH; + default: + i += res; + ++num_out_chars; + break; + } + } + + if (num_out_chars == 0) + return std::wstring(); + + std::wstring out; + out.resize(num_out_chars); + + memset(&ps, 0, sizeof(ps)); // Clear the shift state. + // We walk the input string again, with |i| tracking the index of the + // multi-byte input, and |j| tracking the wide output. 
+ for (size_t i = 0, j = 0; i < native_mb.size(); ++j) { + const char* src = native_mb.data() + i; + wchar_t* dst = &out[j]; + size_t res = mbrtowc(dst, src, native_mb.size() - i, &ps); + switch (res) { + // Handle any errors and return an empty string. + case static_cast<size_t>(-2): + case static_cast<size_t>(-1): + return std::wstring(); + break; + case 0: + i += 1; // Skip null byte. + break; + default: + i += res; + break; + } + } + + return out; +} + +#endif // defined(SYSTEM_NATIVE_UTF8) || defined(OS_ANDROID) + +} // namespace base
diff --git a/base/strings/sys_string_conversions_unittest.cc b/base/strings/sys_string_conversions_unittest.cc new file mode 100644 index 0000000..0e78d43 --- /dev/null +++ b/base/strings/sys_string_conversions_unittest.cc
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <stddef.h>

#include <string>

#include "base/macros.h"
#include "base/strings/string_piece.h"
#include "base/strings/sys_string_conversions.h"
#include "base/strings/utf_string_conversions.h"
#include "base/test/scoped_locale.h"
#include "build/build_config.h"
#include "testing/gtest/include/gtest/gtest.h"

// U+10300 spelled for the platform's wchar_t width: a single code unit when
// wchar_t is UTF-32, a surrogate pair otherwise.
#ifdef WCHAR_T_IS_UTF32
static const std::wstring kSysWideOldItalicLetterA = L"\x10300";
#else
static const std::wstring kSysWideOldItalicLetterA = L"\xd800\xdf00";
#endif

namespace gurl_base {

TEST(SysStrings, SysWideToUTF8) {
  EXPECT_EQ("Hello, world", SysWideToUTF8(L"Hello, world"));
  EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToUTF8(L"\x4f60\x597d"));

  // >16 bits
  EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToUTF8(kSysWideOldItalicLetterA));

  // Error case. When Windows finds a UTF-16 character going off the end of
  // a string, it just converts that literal value to UTF-8, even though this
  // is invalid.
  //
  // This is what XP does, but Vista has different behavior, so we don't bother
  // verifying it:
  // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw",
  //           SysWideToUTF8(L"\x4f60\xd800zyxw"));

  // Test embedded NULLs.
  std::wstring wide_null(L"a");
  wide_null.push_back(0);
  wide_null.push_back('b');

  std::string expected_null("a");
  expected_null.push_back(0);
  expected_null.push_back('b');

  EXPECT_EQ(expected_null, SysWideToUTF8(wide_null));
}

TEST(SysStrings, SysUTF8ToWide) {
  EXPECT_EQ(L"Hello, world", SysUTF8ToWide("Hello, world"));
  EXPECT_EQ(L"\x4f60\x597d", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5\xbd"));
  // >16 bits
  EXPECT_EQ(kSysWideOldItalicLetterA, SysUTF8ToWide("\xF0\x90\x8C\x80"));

  // Error case. When Windows finds an invalid UTF-8 character, it just skips
  // it. This seems weird because it's inconsistent with the reverse conversion.
  //
  // This is what XP does, but Vista has different behavior, so we don't bother
  // verifying it:
  // EXPECT_EQ(L"\x4f60zyxw", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5zyxw"));

  // Test embedded NULLs.
  std::string utf8_null("a");
  utf8_null.push_back(0);
  utf8_null.push_back('b');

  std::wstring expected_null(L"a");
  expected_null.push_back(0);
  expected_null.push_back('b');

  EXPECT_EQ(expected_null, SysUTF8ToWide(utf8_null));
}

#if defined(OS_LINUX)  // Tests depend on setting a specific Linux locale.

TEST(SysStrings, SysWideToNativeMB) {
  // Unless the build already treats the native encoding as UTF-8, switch to a
  // UTF-8 locale for the duration of the test.
#if !defined(SYSTEM_NATIVE_UTF8)
  ScopedLocale locale("en_US.UTF-8");
#endif
  EXPECT_EQ("Hello, world", SysWideToNativeMB(L"Hello, world"));
  EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToNativeMB(L"\x4f60\x597d"));

  // >16 bits
  EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToNativeMB(kSysWideOldItalicLetterA));

  // Error case. When Windows finds a UTF-16 character going off the end of
  // a string, it just converts that literal value to UTF-8, even though this
  // is invalid.
  //
  // This is what XP does, but Vista has different behavior, so we don't bother
  // verifying it:
  // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw",
  //           SysWideToNativeMB(L"\x4f60\xd800zyxw"));

  // Test embedded NULLs.
  std::wstring wide_null(L"a");
  wide_null.push_back(0);
  wide_null.push_back('b');

  std::string expected_null("a");
  expected_null.push_back(0);
  expected_null.push_back('b');

  EXPECT_EQ(expected_null, SysWideToNativeMB(wide_null));
}

// We assume the test is running in a UTF8 locale.
TEST(SysStrings, SysNativeMBToWide) {
#if !defined(SYSTEM_NATIVE_UTF8)
  ScopedLocale locale("en_US.UTF-8");
#endif
  EXPECT_EQ(L"Hello, world", SysNativeMBToWide("Hello, world"));
  EXPECT_EQ(L"\x4f60\x597d", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5\xbd"));
  // >16 bits
  EXPECT_EQ(kSysWideOldItalicLetterA, SysNativeMBToWide("\xF0\x90\x8C\x80"));

  // Error case. When Windows finds an invalid UTF-8 character, it just skips
  // it. This seems weird because it's inconsistent with the reverse conversion.
  //
  // This is what XP does, but Vista has different behavior, so we don't bother
  // verifying it:
  // EXPECT_EQ(L"\x4f60zyxw", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5zyxw"));

  // Test embedded NULLs.
  std::string utf8_null("a");
  utf8_null.push_back(0);
  utf8_null.push_back('b');

  std::wstring expected_null(L"a");
  expected_null.push_back(0);
  expected_null.push_back('b');

  EXPECT_EQ(expected_null, SysNativeMBToWide(utf8_null));
}

// Wide strings that must survive a wide -> multi-byte -> wide round trip.
static const wchar_t* const kConvertRoundtripCases[] = {
  L"Google Video",
  // "网页 图片 资讯更多 »"
  L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",
  //  "Παγκόσμιος Ιστός"
  L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
  L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",
  // "Поиск страниц на русском"
  L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"
  L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"
  L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c",
  // "전체서비스"
  L"\xc804\xccb4\xc11c\xbe44\xc2a4",

  // Test characters that take more than 16 bits. This will depend on whether
  // wchar_t is 16 or 32 bits.
  // NOTE(review): the literals below encode U+11D40..U+11D44, which does not
  // look like the Mathematical Alphanumeric Symbols range (U+1D400 onwards)
  // named in the comments; confirm which characters were intended.
#if defined(WCHAR_T_IS_UTF16)
  L"\xd800\xdf00",
  // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E)
  L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44",
#elif defined(WCHAR_T_IS_UTF32)
  L"\x10300",
  // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E)
  L"\x11d40\x11d41\x11d42\x11d43\x11d44",
#endif
};


TEST(SysStrings, SysNativeMBAndWide) {
#if !defined(SYSTEM_NATIVE_UTF8)
  ScopedLocale locale("en_US.UTF-8");
#endif
  // Round trip entirely through the native multi-byte conversions.
  for (auto* i : kConvertRoundtripCases) {
    std::wstring wide = i;
    std::wstring trip = SysNativeMBToWide(SysWideToNativeMB(wide));
    EXPECT_EQ(wide.size(), trip.size());
    EXPECT_EQ(wide, trip);
  }

  // We assume our test is running in UTF-8, so double check through ICU.
  for (auto* i : kConvertRoundtripCases) {
    std::wstring wide = i;
    std::wstring trip = SysNativeMBToWide(WideToUTF8(wide));
    EXPECT_EQ(wide.size(), trip.size());
    EXPECT_EQ(wide, trip);
  }

  // And the opposite pairing: native encode, UTF-8 decode.
  for (auto* i : kConvertRoundtripCases) {
    std::wstring wide = i;
    std::wstring trip = UTF8ToWide(SysWideToNativeMB(wide));
    EXPECT_EQ(wide.size(), trip.size());
    EXPECT_EQ(wide, trip);
  }
}
#endif  // OS_LINUX

}  // namespace gurl_base
diff --git a/base/strings/sys_string_conversions_win.cc b/base/strings/sys_string_conversions_win.cc new file mode 100644 index 0000000..3f08956 --- /dev/null +++ b/base/strings/sys_string_conversions_win.cc
@@ -0,0 +1,71 @@ +// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/sys_string_conversions.h" + +#include <windows.h> +#include <stdint.h> + +#include "base/strings/string_piece.h" + +namespace gurl_base { + +// Do not assert in this function since it is used by the asssertion code! +std::string SysWideToUTF8(const std::wstring& wide) { + return SysWideToMultiByte(wide, CP_UTF8); +} + +// Do not assert in this function since it is used by the asssertion code! +std::wstring SysUTF8ToWide(StringPiece utf8) { + return SysMultiByteToWide(utf8, CP_UTF8); +} + +std::string SysWideToNativeMB(const std::wstring& wide) { + return SysWideToMultiByte(wide, CP_ACP); +} + +std::wstring SysNativeMBToWide(StringPiece native_mb) { + return SysMultiByteToWide(native_mb, CP_ACP); +} + +// Do not assert in this function since it is used by the asssertion code! +std::wstring SysMultiByteToWide(StringPiece mb, uint32_t code_page) { + if (mb.empty()) + return std::wstring(); + + int mb_length = static_cast<int>(mb.length()); + // Compute the length of the buffer. + int charcount = MultiByteToWideChar(code_page, 0, + mb.data(), mb_length, NULL, 0); + if (charcount == 0) + return std::wstring(); + + std::wstring wide; + wide.resize(charcount); + MultiByteToWideChar(code_page, 0, mb.data(), mb_length, &wide[0], charcount); + + return wide; +} + +// Do not assert in this function since it is used by the asssertion code! +std::string SysWideToMultiByte(const std::wstring& wide, uint32_t code_page) { + int wide_length = static_cast<int>(wide.length()); + if (wide_length == 0) + return std::string(); + + // Compute the length of the buffer we'll need. 
+ int charcount = WideCharToMultiByte(code_page, 0, wide.data(), wide_length, + NULL, 0, NULL, NULL); + if (charcount == 0) + return std::string(); + + std::string mb; + mb.resize(charcount); + WideCharToMultiByte(code_page, 0, wide.data(), wide_length, + &mb[0], charcount, NULL, NULL); + + return mb; +} + +} // namespace base
diff --git a/base/strings/utf_offset_string_conversions.cc b/base/strings/utf_offset_string_conversions.cc new file mode 100644 index 0000000..5bf7967 --- /dev/null +++ b/base/strings/utf_offset_string_conversions.cc
@@ -0,0 +1,263 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/utf_offset_string_conversions.h" + +#include <stdint.h> + +#include <algorithm> +#include <memory> + +#include "polyfills/base/logging.h" +#include "base/strings/string_piece.h" +#include "base/strings/utf_string_conversion_utils.h" + +namespace gurl_base { + +OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, + size_t original_length, + size_t output_length) + : original_offset(original_offset), + original_length(original_length), + output_length(output_length) { +} + +// static +void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments, + std::vector<size_t>* offsets_for_adjustment, + size_t limit) { + GURL_DCHECK(offsets_for_adjustment); + for (auto& i : *offsets_for_adjustment) + AdjustOffset(adjustments, &i, limit); +} + +// static +void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, + size_t* offset, + size_t limit) { + GURL_DCHECK(offset); + if (*offset == string16::npos) + return; + int adjustment = 0; + for (const auto& i : adjustments) { + if (*offset <= i.original_offset) + break; + if (*offset < (i.original_offset + i.original_length)) { + *offset = string16::npos; + return; + } + adjustment += static_cast<int>(i.original_length - i.output_length); + } + *offset -= adjustment; + + if (*offset > limit) + *offset = string16::npos; +} + +// static +void OffsetAdjuster::UnadjustOffsets( + const Adjustments& adjustments, + std::vector<size_t>* offsets_for_unadjustment) { + if (!offsets_for_unadjustment || adjustments.empty()) + return; + for (auto& i : *offsets_for_unadjustment) + UnadjustOffset(adjustments, &i); +} + +// static +void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, + size_t* offset) { + if (*offset == string16::npos) + return; + int adjustment = 0; + for (const auto& i : 
adjustments) { + if (*offset + adjustment <= i.original_offset) + break; + adjustment += static_cast<int>(i.original_length - i.output_length); + if ((*offset + adjustment) < (i.original_offset + i.original_length)) { + *offset = string16::npos; + return; + } + } + *offset += adjustment; +} + +// static +void OffsetAdjuster::MergeSequentialAdjustments( + const Adjustments& first_adjustments, + Adjustments* adjustments_on_adjusted_string) { + auto adjusted_iter = adjustments_on_adjusted_string->begin(); + auto first_iter = first_adjustments.begin(); + // Simultaneously iterate over all |adjustments_on_adjusted_string| and + // |first_adjustments|, adding adjustments to or correcting the adjustments + // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the + // current number of characters collapsed by |first_adjustments| up to this + // point. |currently_collapsing| keeps track of the number of characters + // collapsed by |first_adjustments| into the current |adjusted_iter|'s + // length. These are characters that will change |shift| as soon as we're + // done processing the current |adjusted_iter|; they are not yet reflected in + // |shift|. + size_t shift = 0; + size_t currently_collapsing = 0; + while (adjusted_iter != adjustments_on_adjusted_string->end()) { + if ((first_iter == first_adjustments.end()) || + ((adjusted_iter->original_offset + shift + + adjusted_iter->original_length) <= first_iter->original_offset)) { + // Entire |adjusted_iter| (accounting for its shift and including its + // whole original length) comes before |first_iter|. + // + // Correct the offset at |adjusted_iter| and move onto the next + // adjustment that needs revising. + adjusted_iter->original_offset += shift; + shift += currently_collapsing; + currently_collapsing = 0; + ++adjusted_iter; + } else if ((adjusted_iter->original_offset + shift) > + first_iter->original_offset) { + // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|). 
+ + // It's not possible for the adjustments to overlap. (It shouldn't + // be possible that we have an |adjusted_iter->original_offset| that, + // when adjusted by the computed |shift|, is in the middle of + // |first_iter|'s output's length. After all, that would mean the + // current adjustment_on_adjusted_string somehow points to an offset + // that was supposed to have been eliminated by the first set of + // adjustments.) + GURL_DCHECK_LE(first_iter->original_offset + first_iter->output_length, + adjusted_iter->original_offset + shift); + + // Add the |first_adjustment_iter| to the full set of adjustments while + // making sure |adjusted_iter| continues pointing to the same element. + // We do this by inserting the |first_adjustment_iter| right before + // |adjusted_iter|, then incrementing |adjusted_iter| so it points to + // the following element. + shift += first_iter->original_length - first_iter->output_length; + adjusted_iter = adjustments_on_adjusted_string->insert( + adjusted_iter, *first_iter); + ++adjusted_iter; + ++first_iter; + } else { + // The first adjustment adjusted something that then got further adjusted + // by the second set of adjustments. In other words, |first_iter| points + // to something in the range covered by |adjusted_iter|'s length (after + // accounting for |shift|). Precisely, + // adjusted_iter->original_offset + shift + // <= + // first_iter->original_offset + // <= + // adjusted_iter->original_offset + shift + + // adjusted_iter->original_length + + // Modify the current |adjusted_iter| to include whatever collapsing + // happened in |first_iter|, then advance to the next |first_adjustments| + // because we dealt with the current one. + const int collapse = static_cast<int>(first_iter->original_length) - + static_cast<int>(first_iter->output_length); + // This function does not know how to deal with a string that expands and + // then gets modified, only strings that collapse and then get modified. 
+ GURL_DCHECK_GT(collapse, 0); + adjusted_iter->original_length += collapse; + currently_collapsing += collapse; + ++first_iter; + } + } + GURL_DCHECK_EQ(0u, currently_collapsing); + if (first_iter != first_adjustments.end()) { + // Only first adjustments are left. These do not need to be modified. + // (Their offsets are already correct with respect to the original string.) + // Append them all. + GURL_DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); + adjustments_on_adjusted_string->insert( + adjustments_on_adjusted_string->end(), first_iter, + first_adjustments.end()); + } +} + +// Converts the given source Unicode character type to the given destination +// Unicode character type as a STL string. The given input buffer and size +// determine the source, and the given output STL string will be replaced by +// the result. If non-NULL, |adjustments| is set to reflect the all the +// alterations to the string that are not one-character-to-one-character. +// It will always be sorted by increasing offset. +template<typename SrcChar, typename DestStdString> +bool ConvertUnicode(const SrcChar* src, + size_t src_len, + DestStdString* output, + OffsetAdjuster::Adjustments* adjustments) { + if (adjustments) + adjustments->clear(); + // ICU requires 32-bit numbers. + bool success = true; + int32_t src_len32 = static_cast<int32_t>(src_len); + for (int32_t i = 0; i < src_len32; i++) { + uint32_t code_point; + size_t original_i = i; + size_t chars_written = 0; + if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { + chars_written = WriteUnicodeCharacter(code_point, output); + } else { + chars_written = WriteUnicodeCharacter(0xFFFD, output); + success = false; + } + + // Only bother writing an adjustment if this modification changed the + // length of this character. 
+ // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last + // character read, not after it (so that incrementing it in the loop + // increment will place it at the right location), so we need to account + // for that in determining the amount that was read. + if (adjustments && ((i - original_i + 1) != chars_written)) { + adjustments->push_back(OffsetAdjuster::Adjustment( + original_i, i - original_i + 1, chars_written)); + } + } + return success; +} + +bool UTF8ToUTF16WithAdjustments( + const char* src, + size_t src_len, + string16* output, + gurl_base::OffsetAdjuster::Adjustments* adjustments) { + PrepareForUTF16Or32Output(src, src_len, output); + return ConvertUnicode(src, src_len, output, adjustments); +} + +string16 UTF8ToUTF16WithAdjustments( + const gurl_base::StringPiece& utf8, + gurl_base::OffsetAdjuster::Adjustments* adjustments) { + string16 result; + UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments); + return result; +} + +string16 UTF8ToUTF16AndAdjustOffsets( + const gurl_base::StringPiece& utf8, + std::vector<size_t>* offsets_for_adjustment) { + for (size_t& offset : *offsets_for_adjustment) { + if (offset > utf8.length()) + offset = string16::npos; + } + OffsetAdjuster::Adjustments adjustments; + string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments); + OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); + return result; +} + +std::string UTF16ToUTF8AndAdjustOffsets( + const gurl_base::StringPiece16& utf16, + std::vector<size_t>* offsets_for_adjustment) { + for (size_t& offset : *offsets_for_adjustment) { + if (offset > utf16.length()) + offset = string16::npos; + } + std::string result; + PrepareForUTF8Output(utf16.data(), utf16.length(), &result); + OffsetAdjuster::Adjustments adjustments; + ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments); + OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); + return result; +} + +} // namespace base
diff --git a/base/strings/utf_offset_string_conversions.h b/base/strings/utf_offset_string_conversions.h new file mode 100644 index 0000000..8902ee5 --- /dev/null +++ b/base/strings/utf_offset_string_conversions.h
@@ -0,0 +1,114 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_ +#define BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_ + +#include <stddef.h> + +#include <string> +#include <vector> + +#include "polyfills/base/base_export.h" +#include "base/strings/string16.h" +#include "base/strings/string_piece.h" + +namespace gurl_base { + +// A helper class and associated data structures to adjust offsets into a +// string in response to various adjustments one might do to that string +// (e.g., eliminating a range). For details on offsets, see the comments by +// the AdjustOffsets() function below. +class BASE_EXPORT OffsetAdjuster { + public: + struct BASE_EXPORT Adjustment { + Adjustment(size_t original_offset, + size_t original_length, + size_t output_length); + + size_t original_offset; + size_t original_length; + size_t output_length; + }; + typedef std::vector<Adjustment> Adjustments; + + // Adjusts all offsets in |offsets_for_adjustment| to reflect the adjustments + // recorded in |adjustments|. Adjusted offsets greater than |limit| will be + // set to string16::npos. + // + // Offsets represents insertion/selection points between characters: if |src| + // is "abcd", then 0 is before 'a', 2 is between 'b' and 'c', and 4 is at the + // end of the string. Valid input offsets range from 0 to |src_len|. On + // exit, each offset will have been modified to point at the same logical + // position in the output string. If an offset cannot be successfully + // adjusted (e.g., because it points into the middle of a multibyte sequence), + // it will be set to string16::npos. 
+ static void AdjustOffsets(const Adjustments& adjustments, + std::vector<size_t>* offsets_for_adjustment, + size_t limit = string16::npos); + + // Adjusts the single |offset| to reflect the adjustments recorded in + // |adjustments|. + static void AdjustOffset(const Adjustments& adjustments, + size_t* offset, + size_t limit = string16::npos); + + // Adjusts all offsets in |offsets_for_unadjustment| to reflect the reverse + // of the adjustments recorded in |adjustments|. In other words, the offsets + // provided represent offsets into an adjusted string and the caller wants + // to know the offsets they correspond to in the original string. If an + // offset cannot be successfully unadjusted (e.g., because it points into + // the middle of a multibyte sequence), it will be set to string16::npos. + static void UnadjustOffsets(const Adjustments& adjustments, + std::vector<size_t>* offsets_for_unadjustment); + + // Adjusts the single |offset| to reflect the reverse of the adjustments + // recorded in |adjustments|. + static void UnadjustOffset(const Adjustments& adjustments, + size_t* offset); + + // Combines two sequential sets of adjustments, storing the combined revised + // adjustments in |adjustments_on_adjusted_string|. That is, suppose a + // string was altered in some way, with the alterations recorded as + // adjustments in |first_adjustments|. Then suppose the resulting string is + // further altered, with the alterations recorded as adjustments scored in + // |adjustments_on_adjusted_string|, with the offsets recorded in these + // adjustments being with respect to the intermediate string. This function + // combines the two sets of adjustments into one, storing the result in + // |adjustments_on_adjusted_string|, whose offsets are correct with respect + // to the original string. + // + // Assumes both parameters are sorted by increasing offset. 
+ // + // WARNING: Only supports |first_adjustments| that involve collapsing ranges + // of text, not expanding ranges. + static void MergeSequentialAdjustments( + const Adjustments& first_adjustments, + Adjustments* adjustments_on_adjusted_string); +}; + +// Like the conversions in utf_string_conversions.h, but also fills in an +// |adjustments| parameter that reflects the alterations done to the string. +// It may be NULL. +BASE_EXPORT bool UTF8ToUTF16WithAdjustments( + const char* src, + size_t src_len, + string16* output, + gurl_base::OffsetAdjuster::Adjustments* adjustments); +BASE_EXPORT string16 UTF8ToUTF16WithAdjustments( + const gurl_base::StringPiece& utf8, + gurl_base::OffsetAdjuster::Adjustments* adjustments); +// As above, but instead internally examines the adjustments and applies them +// to |offsets_for_adjustment|. Input offsets greater than the length of the +// input string will be set to string16::npos. See comments by AdjustOffsets(). +BASE_EXPORT string16 UTF8ToUTF16AndAdjustOffsets( + const gurl_base::StringPiece& utf8, + std::vector<size_t>* offsets_for_adjustment); +BASE_EXPORT std::string UTF16ToUTF8AndAdjustOffsets( + const gurl_base::StringPiece16& utf16, + std::vector<size_t>* offsets_for_adjustment); + +} // namespace base + +#endif // BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
diff --git a/base/strings/utf_offset_string_conversions_unittest.cc b/base/strings/utf_offset_string_conversions_unittest.cc new file mode 100644 index 0000000..4691cb3 --- /dev/null +++ b/base/strings/utf_offset_string_conversions_unittest.cc
@@ -0,0 +1,298 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <stddef.h> + +#include <algorithm> + +#include "polyfills/base/logging.h" +#include "base/stl_util.h" +#include "base/strings/string_piece.h" +#include "base/strings/utf_offset_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace gurl_base { + +namespace { + +static const size_t kNpos = string16::npos; + +} // namespace + +TEST(UTFOffsetStringConversionsTest, AdjustOffset) { + struct UTF8ToUTF16Case { + const char* utf8; + size_t input_offset; + size_t output_offset; + } utf8_to_utf16_cases[] = { + {"", 0, 0}, + {"", kNpos, kNpos}, + {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos}, + {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1}, + {"\xed\xb0\x80z", 3, 3}, + {"A\xF0\x90\x8C\x80z", 1, 1}, + {"A\xF0\x90\x8C\x80z", 2, kNpos}, + {"A\xF0\x90\x8C\x80z", 5, 3}, + {"A\xF0\x90\x8C\x80z", 6, 4}, + {"A\xF0\x90\x8C\x80z", kNpos, kNpos}, + }; + for (const auto& i : utf8_to_utf16_cases) { + const size_t offset = i.input_offset; + std::vector<size_t> offsets; + offsets.push_back(offset); + UTF8ToUTF16AndAdjustOffsets(i.utf8, &offsets); + EXPECT_EQ(i.output_offset, offsets[0]); + } + + struct UTF16ToUTF8Case { + char16 utf16[10]; + size_t input_offset; + size_t output_offset; + } utf16_to_utf8_cases[] = { + {{}, 0, 0}, + // Converted to 3-byte utf-8 sequences + {{0x5909, 0x63DB}, 3, kNpos}, + {{0x5909, 0x63DB}, 2, 6}, + {{0x5909, 0x63DB}, 1, 3}, + {{0x5909, 0x63DB}, 0, 0}, + // Converted to 2-byte utf-8 sequences + {{'A', 0x00bc, 0x00be, 'z'}, 1, 1}, + {{'A', 0x00bc, 0x00be, 'z'}, 2, 3}, + {{'A', 0x00bc, 0x00be, 'z'}, 3, 5}, + {{'A', 0x00bc, 0x00be, 'z'}, 4, 6}, + // Surrogate pair + {{'A', 0xd800, 0xdf00, 'z'}, 1, 1}, + {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos}, + {{'A', 0xd800, 0xdf00, 'z'}, 3, 5}, + {{'A', 0xd800, 0xdf00, 'z'}, 4, 6}, + }; + for (size_t i = 0; i < 
gurl_base::size(utf16_to_utf8_cases); ++i) { + size_t offset = utf16_to_utf8_cases[i].input_offset; + std::vector<size_t> offsets; + offsets.push_back(offset); + UTF16ToUTF8AndAdjustOffsets(utf16_to_utf8_cases[i].utf16, &offsets); + EXPECT_EQ(utf16_to_utf8_cases[i].output_offset, offsets[0]) << i; + } +} + +TEST(UTFOffsetStringConversionsTest, LimitOffsets) { + const OffsetAdjuster::Adjustments kNoAdjustments; + const size_t kLimit = 10; + const size_t kItems = 20; + std::vector<size_t> size_ts; + for (size_t t = 0; t < kItems; ++t) { + size_ts.push_back(t); + OffsetAdjuster::AdjustOffset(kNoAdjustments, &size_ts.back(), kLimit); + } + size_t unlimited_count = 0; + for (auto ti : size_ts) { + if (ti != kNpos) + ++unlimited_count; + } + EXPECT_EQ(11U, unlimited_count); + + // Reverse the values in the vector and try again. + size_ts.clear(); + for (size_t t = kItems; t > 0; --t) { + size_ts.push_back(t - 1); + OffsetAdjuster::AdjustOffset(kNoAdjustments, &size_ts.back(), kLimit); + } + unlimited_count = 0; + for (auto ti : size_ts) { + if (ti != kNpos) + ++unlimited_count; + } + EXPECT_EQ(11U, unlimited_count); +} + +TEST(UTFOffsetStringConversionsTest, AdjustOffsets) { + // Imagine we have strings as shown in the following cases where the + // X's represent encoded characters. 
+ // 1: abcXXXdef ==> abcXdef + { + std::vector<size_t> offsets; + for (size_t t = 0; t <= 9; ++t) + offsets.push_back(t); + OffsetAdjuster::Adjustments adjustments; + adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1)); + OffsetAdjuster::AdjustOffsets(adjustments, &offsets); + size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6, 7}; + EXPECT_EQ(offsets.size(), gurl_base::size(expected_1)); + for (size_t i = 0; i < gurl_base::size(expected_1); ++i) + EXPECT_EQ(expected_1[i], offsets[i]); + } + + // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX + { + std::vector<size_t> offsets; + for (size_t t = 0; t <= 23; ++t) + offsets.push_back(t); + OffsetAdjuster::Adjustments adjustments; + adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1)); + adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2)); + adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4)); + adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1)); + OffsetAdjuster::AdjustOffsets(adjustments, &offsets); + size_t expected_2[] = { + 0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, kNpos, kNpos, kNpos, + kNpos, kNpos, kNpos, 10, 11, 12, 13, kNpos, kNpos, 14 + }; + EXPECT_EQ(offsets.size(), gurl_base::size(expected_2)); + for (size_t i = 0; i < gurl_base::size(expected_2); ++i) + EXPECT_EQ(expected_2[i], offsets[i]); + } + + // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe + { + std::vector<size_t> offsets; + for (size_t t = 0; t <= 17; ++t) + offsets.push_back(t); + OffsetAdjuster::Adjustments adjustments; + adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0)); + adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4)); + adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3)); + adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0)); + OffsetAdjuster::AdjustOffsets(adjustments, &offsets); + size_t expected_3[] = { + 0, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, 7, 8, kNpos, kNpos, 11, + 12, kNpos, 12 + }; + EXPECT_EQ(offsets.size(), gurl_base::size(expected_3)); + 
for (size_t i = 0; i < gurl_base::size(expected_3); ++i) + EXPECT_EQ(expected_3[i], offsets[i]); + } +} + +TEST(UTFOffsetStringConversionsTest, UnadjustOffsets) { + // Imagine we have strings as shown in the following cases where the + // X's represent encoded characters. + // 1: abcXXXdef ==> abcXdef + { + std::vector<size_t> offsets; + for (size_t t = 0; t <= 7; ++t) + offsets.push_back(t); + OffsetAdjuster::Adjustments adjustments; + adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1)); + OffsetAdjuster::UnadjustOffsets(adjustments, &offsets); + size_t expected_1[] = {0, 1, 2, 3, 6, 7, 8, 9}; + EXPECT_EQ(offsets.size(), gurl_base::size(expected_1)); + for (size_t i = 0; i < gurl_base::size(expected_1); ++i) + EXPECT_EQ(expected_1[i], offsets[i]); + } + + // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX + { + std::vector<size_t> offsets; + for (size_t t = 0; t <= 14; ++t) + offsets.push_back(t); + OffsetAdjuster::Adjustments adjustments; + adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1)); + adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2)); + adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4)); + adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1)); + OffsetAdjuster::UnadjustOffsets(adjustments, &offsets); + size_t expected_2[] = { + 0, 3, 4, kNpos, 8, 9, 10, kNpos, kNpos, kNpos, 17, 18, 19, 20, 23 + }; + EXPECT_EQ(offsets.size(), gurl_base::size(expected_2)); + for (size_t i = 0; i < gurl_base::size(expected_2); ++i) + EXPECT_EQ(expected_2[i], offsets[i]); + } + + // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe + { + std::vector<size_t> offsets; + for (size_t t = 0; t <= 12; ++t) + offsets.push_back(t); + OffsetAdjuster::Adjustments adjustments; + adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0)); + adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4)); + adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3)); + adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0)); + 
OffsetAdjuster::UnadjustOffsets(adjustments, &offsets); + size_t expected_3[] = { + 0, // this could just as easily be 3 + 4, kNpos, kNpos, kNpos, 8, 9, 10, 11, kNpos, kNpos, 14, + 15 // this could just as easily be 17 + }; + EXPECT_EQ(offsets.size(), gurl_base::size(expected_3)); + for (size_t i = 0; i < gurl_base::size(expected_3); ++i) + EXPECT_EQ(expected_3[i], offsets[i]); + } +} + +// MergeSequentialAdjustments is used by net/base/escape.{h,cc} and +// net/base/net_util.{h,cc}. The two tests EscapeTest.AdjustOffset and +// NetUtilTest.FormatUrlWithOffsets test its behavior extensively. This +// is simply a short, additional test. +TEST(UTFOffsetStringConversionsTest, MergeSequentialAdjustments) { + // Pretend the input string is "abcdefghijklmnopqrstuvwxyz". + + // Set up |first_adjustments| to + // - remove the leading "a" + // - combine the "bc" into one character (call it ".") + // - remove the "f" + // - remove the "tuv" + // The resulting string should be ".deghijklmnopqrswxyz". + OffsetAdjuster::Adjustments first_adjustments; + first_adjustments.push_back(OffsetAdjuster::Adjustment(0, 1, 0)); + first_adjustments.push_back(OffsetAdjuster::Adjustment(1, 2, 1)); + first_adjustments.push_back(OffsetAdjuster::Adjustment(5, 1, 0)); + first_adjustments.push_back(OffsetAdjuster::Adjustment(19, 3, 0)); + + // Set up |adjustments_on_adjusted_string| to + // - combine the "." character that replaced "bc" with "d" into one character + // (call it "?") + // - remove the "egh" + // - expand the "i" into two characters (call them "12") + // - combine the "jkl" into one character (call it "@") + // - expand the "z" into two characters (call it "34") + // The resulting string should be "?12@mnopqrswxy34". 
+ OffsetAdjuster::Adjustments adjustments_on_adjusted_string; + adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment( + 0, 2, 1)); + adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment( + 2, 3, 0)); + adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment( + 5, 1, 2)); + adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment( + 6, 3, 1)); + adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment( + 19, 1, 2)); + + // Now merge the adjustments and check the results. + OffsetAdjuster::MergeSequentialAdjustments(first_adjustments, + &adjustments_on_adjusted_string); + // The merged adjustments should look like + // - combine abcd into "?" + // - note: it's also reasonable for the Merge function to instead produce + // two adjustments instead of this, one to remove a and another to + // combine bcd into "?". This test verifies the current behavior. + // - remove efgh + // - expand i into "12" + // - combine jkl into "@" + // - remove tuv + // - expand z into "34" + ASSERT_EQ(6u, adjustments_on_adjusted_string.size()); + EXPECT_EQ(0u, adjustments_on_adjusted_string[0].original_offset); + EXPECT_EQ(4u, adjustments_on_adjusted_string[0].original_length); + EXPECT_EQ(1u, adjustments_on_adjusted_string[0].output_length); + EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_offset); + EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_length); + EXPECT_EQ(0u, adjustments_on_adjusted_string[1].output_length); + EXPECT_EQ(8u, adjustments_on_adjusted_string[2].original_offset); + EXPECT_EQ(1u, adjustments_on_adjusted_string[2].original_length); + EXPECT_EQ(2u, adjustments_on_adjusted_string[2].output_length); + EXPECT_EQ(9u, adjustments_on_adjusted_string[3].original_offset); + EXPECT_EQ(3u, adjustments_on_adjusted_string[3].original_length); + EXPECT_EQ(1u, adjustments_on_adjusted_string[3].output_length); + EXPECT_EQ(19u, adjustments_on_adjusted_string[4].original_offset); + EXPECT_EQ(3u, 
adjustments_on_adjusted_string[4].original_length); + EXPECT_EQ(0u, adjustments_on_adjusted_string[4].output_length); + EXPECT_EQ(25u, adjustments_on_adjusted_string[5].original_offset); + EXPECT_EQ(1u, adjustments_on_adjusted_string[5].original_length); + EXPECT_EQ(2u, adjustments_on_adjusted_string[5].output_length); +} + +} // namespace base
diff --git a/base/strings/utf_string_conversion_utils.cc b/base/strings/utf_string_conversion_utils.cc new file mode 100644 index 0000000..ce432e7 --- /dev/null +++ b/base/strings/utf_string_conversion_utils.cc
@@ -0,0 +1,155 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/utf_string_conversion_utils.h" + +#include "base/third_party/icu/icu_utf.h" +#include "build/build_config.h" + +namespace gurl_base { + +// ReadUnicodeCharacter -------------------------------------------------------- + +bool ReadUnicodeCharacter(const char* src, + int32_t src_len, + int32_t* char_index, + uint32_t* code_point_out) { + // U8_NEXT expects to be able to use -1 to signal an error, so we must + // use a signed type for code_point. But this function returns false + // on error anyway, so code_point_out is unsigned. + int32_t code_point; + CBU8_NEXT(src, *char_index, src_len, code_point); + *code_point_out = static_cast<uint32_t>(code_point); + + // The ICU macro above moves to the next char, we want to point to the last + // char consumed. + (*char_index)--; + + // Validate the decoded value. + return IsValidCodepoint(code_point); +} + +bool ReadUnicodeCharacter(const char16* src, + int32_t src_len, + int32_t* char_index, + uint32_t* code_point) { + if (CBU16_IS_SURROGATE(src[*char_index])) { + if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) || + *char_index + 1 >= src_len || + !CBU16_IS_TRAIL(src[*char_index + 1])) { + // Invalid surrogate pair. + return false; + } + + // Valid surrogate pair. + *code_point = CBU16_GET_SUPPLEMENTARY(src[*char_index], + src[*char_index + 1]); + (*char_index)++; + } else { + // Not a surrogate, just one 16-bit word. + *code_point = src[*char_index]; + } + + return IsValidCodepoint(*code_point); +} + +#if defined(WCHAR_T_IS_UTF32) +bool ReadUnicodeCharacter(const wchar_t* src, + int32_t src_len, + int32_t* char_index, + uint32_t* code_point) { + // Conversion is easy since the source is 32-bit. + *code_point = src[*char_index]; + + // Validate the value. 
+ return IsValidCodepoint(*code_point); +} +#endif // defined(WCHAR_T_IS_UTF32) + +// WriteUnicodeCharacter ------------------------------------------------------- + +size_t WriteUnicodeCharacter(uint32_t code_point, std::string* output) { + if (code_point <= 0x7f) { + // Fast path the common case of one byte. + output->push_back(static_cast<char>(code_point)); + return 1; + } + + + // CBU8_APPEND_UNSAFE can append up to 4 bytes. + size_t char_offset = output->length(); + size_t original_char_offset = char_offset; + output->resize(char_offset + CBU8_MAX_LENGTH); + + CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + + // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so + // it will represent the new length of the string. + output->resize(char_offset); + return char_offset - original_char_offset; +} + +size_t WriteUnicodeCharacter(uint32_t code_point, string16* output) { + if (CBU16_LENGTH(code_point) == 1) { + // Thie code point is in the Basic Multilingual Plane (BMP). + output->push_back(static_cast<char16>(code_point)); + return 1; + } + // Non-BMP characters use a double-character encoding. + size_t char_offset = output->length(); + output->resize(char_offset + CBU16_MAX_LENGTH); + CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + return CBU16_MAX_LENGTH; +} + +// Generalized Unicode converter ----------------------------------------------- + +template<typename CHAR> +void PrepareForUTF8Output(const CHAR* src, + size_t src_len, + std::string* output) { + output->clear(); + if (src_len == 0) + return; + if (src[0] < 0x80) { + // Assume that the entire input will be ASCII. + output->reserve(src_len); + } else { + // Assume that the entire input is non-ASCII and will have 3 bytes per char. + output->reserve(src_len * 3); + } +} + +// Instantiate versions we know callers will need. +#if !defined(OS_WIN) +// wchar_t and char16 are the same thing on Windows. 
+template void PrepareForUTF8Output(const wchar_t*, size_t, std::string*); +#endif +template void PrepareForUTF8Output(const char16*, size_t, std::string*); + +template<typename STRING> +void PrepareForUTF16Or32Output(const char* src, + size_t src_len, + STRING* output) { + output->clear(); + if (src_len == 0) + return; + if (static_cast<unsigned char>(src[0]) < 0x80) { + // Assume the input is all ASCII, which means 1:1 correspondence. + output->reserve(src_len); + } else { + // Otherwise assume that the UTF-8 sequences will have 2 bytes for each + // character. + output->reserve(src_len / 2); + } +} + +// Instantiate versions we know callers will need. +#if !defined(OS_WIN) +// std::wstring and string16 are the same thing on Windows. +template void PrepareForUTF16Or32Output(const char*, size_t, std::wstring*); +#endif +template void PrepareForUTF16Or32Output(const char*, size_t, string16*); + +} // namespace base
diff --git a/base/strings/utf_string_conversion_utils.h b/base/strings/utf_string_conversion_utils.h new file mode 100644 index 0000000..84d18f7 --- /dev/null +++ b/base/strings/utf_string_conversion_utils.h
@@ -0,0 +1,100 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_ +#define BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_ + +// Low-level UTF handling functions. Most code will want to use the functions +// in utf_string_conversions.h + +#include <stddef.h> +#include <stdint.h> + +#include "polyfills/base/base_export.h" +#include "base/strings/string16.h" + +namespace gurl_base { + +inline bool IsValidCodepoint(uint32_t code_point) { + // Excludes the surrogate code points ([0xD800, 0xDFFF]) and + // codepoints larger than 0x10FFFF (the highest codepoint allowed). + // Non-characters and unassigned codepoints are allowed. + return code_point < 0xD800u || + (code_point >= 0xE000u && code_point <= 0x10FFFFu); +} + +inline bool IsValidCharacter(uint32_t code_point) { + // Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in + // 0xFFFE or 0xFFFF) from the set of valid code points. + return code_point < 0xD800u || (code_point >= 0xE000u && + code_point < 0xFDD0u) || (code_point > 0xFDEFu && + code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu); +} + +// ReadUnicodeCharacter -------------------------------------------------------- + +// Reads a UTF-8 stream, placing the next code point into the given output +// |*code_point|. |src| represents the entire string to read, and |*char_index| +// is the character offset within the string to start reading at. |*char_index| +// will be updated to index the last character read, such that incrementing it +// (as in a for loop) will take the reader to the next character. +// +// Returns true on success. On failure, |*code_point| will be invalid. +BASE_EXPORT bool ReadUnicodeCharacter(const char* src, + int32_t src_len, + int32_t* char_index, + uint32_t* code_point_out); + +// Reads a UTF-16 character. 
The usage is the same as the 8-bit version above. +BASE_EXPORT bool ReadUnicodeCharacter(const char16* src, + int32_t src_len, + int32_t* char_index, + uint32_t* code_point); + +#if defined(WCHAR_T_IS_UTF32) +// Reads a UTF-32 character. The usage is the same as the 8-bit version above. +BASE_EXPORT bool ReadUnicodeCharacter(const wchar_t* src, + int32_t src_len, + int32_t* char_index, + uint32_t* code_point); +#endif // defined(WCHAR_T_IS_UTF32) + +// WriteUnicodeCharacter ------------------------------------------------------- + +// Appends a UTF-8 character to the given 8-bit string. Returns the number of +// bytes written. +BASE_EXPORT size_t WriteUnicodeCharacter(uint32_t code_point, + std::string* output); + +// Appends the given code point as a UTF-16 character to the given 16-bit +// string. Returns the number of 16-bit values written. +BASE_EXPORT size_t WriteUnicodeCharacter(uint32_t code_point, string16* output); + +#if defined(WCHAR_T_IS_UTF32) +// Appends the given UTF-32 character to the given 32-bit string. Returns the +// number of 32-bit values written. +inline size_t WriteUnicodeCharacter(uint32_t code_point, std::wstring* output) { + // This is the easy case, just append the character. + output->push_back(code_point); + return 1; +} +#endif // defined(WCHAR_T_IS_UTF32) + +// Generalized Unicode converter ----------------------------------------------- + +// Guesses the length of the output in UTF-8 in bytes, clears that output +// string, and reserves that amount of space. We assume that the input +// character types are unsigned, which will be true for UTF-16 and -32 on our +// systems. +template<typename CHAR> +void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output); + +// Prepares an output buffer (containing either UTF-16 or -32 data) given some +// UTF-8 input that will be converted to it. See PrepareForUTF8Output(). 
+template<typename STRING> +void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output); + +} // namespace gurl_base + +#endif // BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
diff --git a/base/strings/utf_string_conversions.cc b/base/strings/utf_string_conversions.cc new file mode 100644 index 0000000..aaf4a40 --- /dev/null +++ b/base/strings/utf_string_conversions.cc
@@ -0,0 +1,342 @@ +// Copyright (c) 2018 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/utf_string_conversions.h" + +#include <limits.h> +#include <stdint.h> + +#include <type_traits> + +#include "base/strings/string_piece.h" +#include "base/strings/string_util.h" +#include "base/strings/utf_string_conversion_utils.h" +#include "base/third_party/icu/icu_utf.h" +#include "build/build_config.h" + +namespace gurl_base { + +namespace { + +constexpr int32_t kErrorCodePoint = 0xFFFD; + +// Size coefficient ---------------------------------------------------------- +// The maximum number of codeunits in the destination encoding corresponding to +// one codeunit in the source encoding. + +template <typename SrcChar, typename DestChar> +struct SizeCoefficient { + static_assert(sizeof(SrcChar) < sizeof(DestChar), + "Default case: from a smaller encoding to the bigger one"); + + // ASCII symbols are encoded by one codeunit in all encodings. + static constexpr int value = 1; +}; + +template <> +struct SizeCoefficient<char16, char> { + // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8. + static constexpr int value = 3; +}; + +#if defined(WCHAR_T_IS_UTF32) +template <> +struct SizeCoefficient<wchar_t, char> { + // UTF-8 uses at most 4 codeunits per character. + static constexpr int value = 4; +}; + +template <> +struct SizeCoefficient<wchar_t, char16> { + // UTF-16 uses at most 2 codeunits per character. + static constexpr int value = 2; +}; +#endif // defined(WCHAR_T_IS_UTF32) + +template <typename SrcChar, typename DestChar> +constexpr int size_coefficient_v = + SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value; + +// UnicodeAppendUnsafe -------------------------------------------------------- +// Function overloads that write code_point to the output string. 
Output string +// has to have enough space for the codepoint. + +// Convenience typedef that checks whether the passed in type is integral (i.e. +// bool, char, int or their extended versions) and is of the correct size. +template <typename Char, size_t N> +using EnableIfBitsAre = std::enable_if_t<std::is_integral<Char>::value && + CHAR_BIT * sizeof(Char) == N, + bool>; + +template <typename Char, EnableIfBitsAre<Char, 8> = true> +void UnicodeAppendUnsafe(Char* out, int32_t* size, uint32_t code_point) { + CBU8_APPEND_UNSAFE(out, *size, code_point); +} + +template <typename Char, EnableIfBitsAre<Char, 16> = true> +void UnicodeAppendUnsafe(Char* out, int32_t* size, uint32_t code_point) { + CBU16_APPEND_UNSAFE(out, *size, code_point); +} + +template <typename Char, EnableIfBitsAre<Char, 32> = true> +void UnicodeAppendUnsafe(Char* out, int32_t* size, uint32_t code_point) { + out[(*size)++] = code_point; +} + +// DoUTFConversion ------------------------------------------------------------ +// Main driver of UTFConversion specialized for different Src encodings. +// dest has to have enough room for the converted text. 
+ +template <typename DestChar> +bool DoUTFConversion(const char* src, + int32_t src_len, + DestChar* dest, + int32_t* dest_len) { + bool success = true; + + for (int32_t i = 0; i < src_len;) { + int32_t code_point; + CBU8_NEXT(src, i, src_len, code_point); + + if (!IsValidCodepoint(code_point)) { + success = false; + code_point = kErrorCodePoint; + } + + UnicodeAppendUnsafe(dest, dest_len, code_point); + } + + return success; +} + +template <typename DestChar> +bool DoUTFConversion(const char16* src, + int32_t src_len, + DestChar* dest, + int32_t* dest_len) { + bool success = true; + + auto ConvertSingleChar = [&success](char16 in) -> int32_t { + if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) { + success = false; + return kErrorCodePoint; + } + return in; + }; + + int32_t i = 0; + + // Always have another symbol in order to avoid checking boundaries in the + // middle of the surrogate pair. + while (i < src_len - 1) { + int32_t code_point; + + if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) { + code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]); + if (!IsValidCodepoint(code_point)) { + code_point = kErrorCodePoint; + success = false; + } + i += 2; + } else { + code_point = ConvertSingleChar(src[i]); + ++i; + } + + UnicodeAppendUnsafe(dest, dest_len, code_point); + } + + if (i < src_len) + UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i])); + + return success; +} + +#if defined(WCHAR_T_IS_UTF32) + +template <typename DestChar> +bool DoUTFConversion(const wchar_t* src, + int32_t src_len, + DestChar* dest, + int32_t* dest_len) { + bool success = true; + + for (int32_t i = 0; i < src_len; ++i) { + int32_t code_point = src[i]; + + if (!IsValidCodepoint(code_point)) { + success = false; + code_point = kErrorCodePoint; + } + + UnicodeAppendUnsafe(dest, dest_len, code_point); + } + + return success; +} + +#endif // defined(WCHAR_T_IS_UTF32) + +// UTFConversion -------------------------------------------------------------- +// Function 
template for generating all UTF conversions. + +template <typename InputString, typename DestString> +bool UTFConversion(const InputString& src_str, DestString* dest_str) { + if (IsStringASCII(src_str)) { + dest_str->assign(src_str.begin(), src_str.end()); + return true; + } + + dest_str->resize(src_str.length() * + size_coefficient_v<typename InputString::value_type, + typename DestString::value_type>); + + // Empty string is ASCII => it is OK to call operator[]. + auto* dest = &(*dest_str)[0]; + + // ICU requires 32 bit numbers. + int32_t src_len32 = static_cast<int32_t>(src_str.length()); + int32_t dest_len32 = 0; + + bool res = DoUTFConversion(src_str.data(), src_len32, dest, &dest_len32); + + dest_str->resize(dest_len32); + dest_str->shrink_to_fit(); + + return res; +} + +} // namespace + +// UTF16 <-> UTF8 -------------------------------------------------------------- + +bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { + return UTFConversion(StringPiece(src, src_len), output); +} + +string16 UTF8ToUTF16(StringPiece utf8) { + string16 ret; + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + UTF8ToUTF16(utf8.data(), utf8.size(), &ret); + return ret; +} + +bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { + return UTFConversion(StringPiece16(src, src_len), output); +} + +std::string UTF16ToUTF8(StringPiece16 utf16) { + std::string ret; + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + UTF16ToUTF8(utf16.data(), utf16.length(), &ret); + return ret; +} + +// UTF-16 <-> Wide ------------------------------------------------------------- + +#if defined(WCHAR_T_IS_UTF16) +// When wide == UTF-16 the conversions are a NOP. 
+ +bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { + output->assign(src, src + src_len); + return true; +} + +string16 WideToUTF16(WStringPiece wide) { + return string16(wide.begin(), wide.end()); +} + +bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { + output->assign(src, src + src_len); + return true; +} + +std::wstring UTF16ToWide(StringPiece16 utf16) { + return std::wstring(utf16.begin(), utf16.end()); +} + +#elif defined(WCHAR_T_IS_UTF32) + +bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { + return UTFConversion(gurl_base::WStringPiece(src, src_len), output); +} + +string16 WideToUTF16(WStringPiece wide) { + string16 ret; + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + WideToUTF16(wide.data(), wide.length(), &ret); + return ret; +} + +bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { + return UTFConversion(StringPiece16(src, src_len), output); +} + +std::wstring UTF16ToWide(StringPiece16 utf16) { + std::wstring ret; + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + UTF16ToWide(utf16.data(), utf16.length(), &ret); + return ret; +} + +#endif // defined(WCHAR_T_IS_UTF32) + +// UTF-8 <-> Wide -------------------------------------------------------------- + +// UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits + +bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { + return UTFConversion(StringPiece(src, src_len), output); +} + +std::wstring UTF8ToWide(StringPiece utf8) { + std::wstring ret; + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. 
+ UTF8ToWide(utf8.data(), utf8.length(), &ret); + return ret; +} + +#if defined(WCHAR_T_IS_UTF16) +// Easy case since we can use the "utf" versions we already wrote above. + +bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { + return UTF16ToUTF8(as_u16cstr(src), src_len, output); +} + +std::string WideToUTF8(WStringPiece wide) { + return UTF16ToUTF8(StringPiece16(as_u16cstr(wide), wide.size())); +} + +#elif defined(WCHAR_T_IS_UTF32) + +bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { + return UTFConversion(WStringPiece(src, src_len), output); +} + +std::string WideToUTF8(WStringPiece wide) { + std::string ret; + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + WideToUTF8(wide.data(), wide.length(), &ret); + return ret; +} + +#endif // defined(WCHAR_T_IS_UTF32) + +string16 ASCIIToUTF16(StringPiece ascii) { + GURL_DCHECK(IsStringASCII(ascii)) << ascii; + return string16(ascii.begin(), ascii.end()); +} + +std::string UTF16ToASCII(StringPiece16 utf16) { + GURL_DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16); + return std::string(utf16.begin(), utf16.end()); +} + +} // namespace gurl_base
diff --git a/base/strings/utf_string_conversions.h b/base/strings/utf_string_conversions.h new file mode 100644 index 0000000..e64f420 --- /dev/null +++ b/base/strings/utf_string_conversions.h
@@ -0,0 +1,54 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_UTF_STRING_CONVERSIONS_H_ +#define BASE_STRINGS_UTF_STRING_CONVERSIONS_H_ + +#include <stddef.h> + +#include <string> + +#include "polyfills/base/base_export.h" +#include "base/strings/string16.h" +#include "base/strings/string_piece.h" + +namespace gurl_base { + +// These convert between UTF-8, -16, and -32 strings. They are potentially slow, +// so avoid unnecessary conversions. The low-level versions return a boolean +// indicating whether the conversion was 100% valid. In this case, it will still +// do the best it can and put the result in the output buffer. The versions that +// return strings ignore this error and just return the best conversion +// possible. +BASE_EXPORT bool WideToUTF8(const wchar_t* src, size_t src_len, + std::string* output); +BASE_EXPORT std::string WideToUTF8(WStringPiece wide); +BASE_EXPORT bool UTF8ToWide(const char* src, size_t src_len, + std::wstring* output); +BASE_EXPORT std::wstring UTF8ToWide(StringPiece utf8); + +BASE_EXPORT bool WideToUTF16(const wchar_t* src, size_t src_len, + string16* output); +BASE_EXPORT string16 WideToUTF16(WStringPiece wide); +BASE_EXPORT bool UTF16ToWide(const char16* src, size_t src_len, + std::wstring* output); +BASE_EXPORT std::wstring UTF16ToWide(StringPiece16 utf16); + +BASE_EXPORT bool UTF8ToUTF16(const char* src, size_t src_len, string16* output); +BASE_EXPORT string16 UTF8ToUTF16(StringPiece utf8); +BASE_EXPORT bool UTF16ToUTF8(const char16* src, size_t src_len, + std::string* output); +BASE_EXPORT std::string UTF16ToUTF8(StringPiece16 utf16); + +// This converts an ASCII string, typically a hardcoded constant, to a UTF16 +// string. +BASE_EXPORT string16 ASCIIToUTF16(StringPiece ascii); + +// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII +// beforehand. 
+BASE_EXPORT std::string UTF16ToASCII(StringPiece16 utf16); + +} // namespace gurl_base + +#endif // BASE_STRINGS_UTF_STRING_CONVERSIONS_H_
diff --git a/base/strings/utf_string_conversions_fuzzer.cc b/base/strings/utf_string_conversions_fuzzer.cc new file mode 100644 index 0000000..96bccda --- /dev/null +++ b/base/strings/utf_string_conversions_fuzzer.cc
@@ -0,0 +1,56 @@ +// Copyright 2018 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_util.h" +#include "base/strings/utf_string_conversions.h" + +std::string output_std_string; +std::wstring output_std_wstring; +gurl_base::string16 output_string16; + +// Entry point for LibFuzzer. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + gurl_base::StringPiece string_piece_input(reinterpret_cast<const char*>(data), + size); + + gurl_base::UTF8ToWide(string_piece_input); + gurl_base::UTF8ToWide(reinterpret_cast<const char*>(data), size, + &output_std_wstring); + gurl_base::UTF8ToUTF16(string_piece_input); + gurl_base::UTF8ToUTF16(reinterpret_cast<const char*>(data), size, + &output_string16); + + // Test for char16. NOTE(review): 1st call takes |output_string16|; intended? + if (size % 2 == 0) { + gurl_base::StringPiece16 string_piece_input16( + reinterpret_cast<const gurl_base::char16*>(data), size / 2); + gurl_base::UTF16ToWide(output_string16); + gurl_base::UTF16ToWide(reinterpret_cast<const gurl_base::char16*>(data), size / 2, + &output_std_wstring); + gurl_base::UTF16ToUTF8(string_piece_input16); + gurl_base::UTF16ToUTF8(reinterpret_cast<const gurl_base::char16*>(data), size / 2, + &output_std_string); + } + + // Test for wchar_t. + size_t wchar_t_size = sizeof(wchar_t); + if (size % wchar_t_size == 0) { + gurl_base::WideToUTF8(output_std_wstring); + gurl_base::WideToUTF8(reinterpret_cast<const wchar_t*>(data), + size / wchar_t_size, &output_std_string); + gurl_base::WideToUTF16(output_std_wstring); + gurl_base::WideToUTF16(reinterpret_cast<const wchar_t*>(data), + size / wchar_t_size, &output_string16); + } + + // Test for ASCII. This condition is needed to avoid hitting instant GURL_CHECK + // failures. 
+ if (gurl_base::IsStringASCII(string_piece_input)) { + output_string16 = gurl_base::ASCIIToUTF16(string_piece_input); + gurl_base::StringPiece16 string_piece_input16(output_string16); + gurl_base::UTF16ToASCII(string_piece_input16); + } + + return 0; +}
diff --git a/base/strings/utf_string_conversions_unittest.cc b/base/strings/utf_string_conversions_unittest.cc new file mode 100644 index 0000000..22b167b --- /dev/null +++ b/base/strings/utf_string_conversions_unittest.cc
@@ -0,0 +1,208 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <stddef.h> + +#include "polyfills/base/logging.h" +#include "base/stl_util.h" +#include "base/strings/string_piece.h" +#include "base/strings/string_util.h" +#include "base/strings/utf_string_conversions.h" +#include "build/build_config.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace gurl_base { + +namespace { + +const wchar_t* const kConvertRoundtripCases[] = { + L"Google Video", + // "网页 图片 资讯更多 »" + L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", + // "Παγκόσμιος Ιστός" + L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" + L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", + // "Поиск страниц на русском" + L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" + L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" + L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", + // "전체서비스" + L"\xc804\xccb4\xc11c\xbe44\xc2a4", + + // Test characters that take more than 16 bits. This will depend on whether + // wchar_t is 16 or 32 bits. +#if defined(WCHAR_T_IS_UTF16) + L"\xd800\xdf00", + // U+11D40..U+11D44. NOTE(review): math bold A-E would be U+1D400..U+1D404. + L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", +#elif defined(WCHAR_T_IS_UTF32) + L"\x10300", + // U+11D40..U+11D44. NOTE(review): math bold A-E would be U+1D400..U+1D404. + L"\x11d40\x11d41\x11d42\x11d43\x11d44", +#endif +}; + +} // namespace + +TEST(UTFStringConversionsTest, ConvertUTF8AndWide) { + // We round-trip all the wide strings through UTF-8 to make sure everything + // agrees on the conversion. This uses the stream operators to test them + // simultaneously. 
+ for (auto* i : kConvertRoundtripCases) { + std::ostringstream utf8; + utf8 << WideToUTF8(i); + std::wostringstream wide; + wide << UTF8ToWide(utf8.str()); + + EXPECT_EQ(i, wide.str()); + } +} + +TEST(UTFStringConversionsTest, ConvertUTF8AndWideEmptyString) { + // An empty std::wstring should be converted to an empty std::string, + // and vice versa. + std::wstring wempty; + std::string empty; + EXPECT_EQ(empty, WideToUTF8(wempty)); + EXPECT_EQ(wempty, UTF8ToWide(empty)); +} + +TEST(UTFStringConversionsTest, ConvertUTF8ToWide) { + struct UTF8ToWideCase { + const char* utf8; + const wchar_t* wide; + bool success; + } convert_cases[] = { + // Regular UTF-8 input. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, + // Non-character is passed through. + {"\xef\xbf\xbfHello", L"\xffffHello", true}, + // Truncated UTF-8 sequence. + {"\xe4\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false}, + // Truncated off the end. + {"\xe5\xa5\xbd\xe4\xa0", L"\x597d\xfffd", false}, + // Non-shortest-form UTF-8. + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\xfffd\xfffd\xfffd\xfffd\x597d", false}, + // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. + {"\xed\xb0\x80", L"\xfffd\xfffd\xfffd", false}, + // Non-BMP characters. The second is a non-character regarded as valid. + // The result will either be in UTF-16 or UTF-32. +#if defined(WCHAR_T_IS_UTF16) + {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, +#elif defined(WCHAR_T_IS_UTF32) + {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, +#endif + }; + + for (const auto& i : convert_cases) { + std::wstring converted; + EXPECT_EQ(i.success, UTF8ToWide(i.utf8, strlen(i.utf8), &converted)); + std::wstring expected(i.wide); + EXPECT_EQ(expected, converted); + } + + // Manually test an embedded NULL. 
+ std::wstring converted; + EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); + ASSERT_EQ(3U, converted.length()); + EXPECT_EQ(static_cast<wchar_t>(0), converted[0]); + EXPECT_EQ('Z', converted[1]); + EXPECT_EQ('\t', converted[2]); + + // Make sure that conversion replaces, not appends. + EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); + ASSERT_EQ(1U, converted.length()); + EXPECT_EQ('B', converted[0]); +} + +#if defined(WCHAR_T_IS_UTF16) +// This test is only valid when wchar_t == UTF-16. +TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf16; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular UTF-16 input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false}, + // Truncated at the end. + {L"\x597d\xd800", "\xe5\xa5\xbd\xef\xbf\xbd", false}, + }; + + for (const auto& test : convert_cases) { + std::string converted; + EXPECT_EQ(test.success, + WideToUTF8(test.utf16, wcslen(test.utf16), &converted)); + std::string expected(test.utf8); + EXPECT_EQ(expected, converted); + } +} + +#elif defined(WCHAR_T_IS_UTF32) +// This test is only valid when wchar_t == UTF-32. +TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf32; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular 16-bit input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // Invalid Unicode code points. 
+ {L"\xfffffffHello", "\xEF\xBF\xBDHello", false}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false}, + {L"\xdc01Hello", "\xef\xbf\xbdHello", false}, + }; + + for (const auto& test : convert_cases) { + std::string converted; + EXPECT_EQ(test.success, + WideToUTF8(test.utf32, wcslen(test.utf32), &converted)); + std::string expected(test.utf8); + EXPECT_EQ(expected, converted); + } +} +#endif // defined(WCHAR_T_IS_UTF32) + +TEST(UTFStringConversionsTest, ConvertMultiString) { + static char16 multi16[] = { + 'f', 'o', 'o', '\0', + 'b', 'a', 'r', '\0', + 'b', 'a', 'z', '\0', + '\0' + }; + static char multi[] = { + 'f', 'o', 'o', '\0', + 'b', 'a', 'r', '\0', + 'b', 'a', 'z', '\0', + '\0' + }; + string16 multistring16; + memcpy(WriteInto(&multistring16, gurl_base::size(multi16)), multi16, + sizeof(multi16)); + EXPECT_EQ(gurl_base::size(multi16) - 1, multistring16.length()); + std::string expected; + memcpy(WriteInto(&expected, gurl_base::size(multi)), multi, sizeof(multi)); + EXPECT_EQ(gurl_base::size(multi) - 1, expected.length()); + const std::string& converted = UTF16ToUTF8(multistring16); + EXPECT_EQ(gurl_base::size(multi) - 1, converted.length()); + EXPECT_EQ(expected, converted); +} + +} // namespace gurl_base
diff --git a/base/template_util.h b/base/template_util.h new file mode 100644 index 0000000..5384355 --- /dev/null +++ b/base/template_util.h
@@ -0,0 +1,188 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_TEMPLATE_UTIL_H_ +#define BASE_TEMPLATE_UTIL_H_ + +#include <stddef.h> +#include <iosfwd> +#include <iterator> +#include <type_traits> +#include <utility> +#include <vector> + +#include "build/build_config.h" + +// Some versions of libstdc++ have partial support for type_traits, but misses +// a smaller subset while removing some of the older non-standard stuff. Assume +// that all versions below 5.0 fall in this category, along with one 5.0 +// experimental release. Test for this by consulting compiler major version, +// the only reliable option available, so theoretically this could fail should +// you attempt to mix an earlier version of libstdc++ with >= GCC5. But +// that's unlikely to work out, especially as GCC5 changed ABI. +#define CR_GLIBCXX_5_0_0 20150123 +#if (defined(__GNUC__) && __GNUC__ < 5) || \ + (defined(__GLIBCXX__) && __GLIBCXX__ == CR_GLIBCXX_5_0_0) +#define CR_USE_FALLBACKS_FOR_OLD_EXPERIMENTAL_GLIBCXX +#endif + +// This hacks around using gcc with libc++ which has some incompatibilities. +// - is_trivially_* doesn't work: https://llvm.org/bugs/show_bug.cgi?id=27538 +// TODO(danakj): Remove this when android builders are all using a newer version +// of gcc, or the android ndk is updated to a newer libc++ that works with older +// gcc versions. +#if !defined(__clang__) && defined(_LIBCPP_VERSION) +#define CR_USE_FALLBACKS_FOR_GCC_WITH_LIBCXX +#endif + +namespace gurl_base { + +template <class T> struct is_non_const_reference : std::false_type {}; +template <class T> struct is_non_const_reference<T&> : std::true_type {}; +template <class T> struct is_non_const_reference<const T&> : std::false_type {}; + +namespace internal { + +// Implementation detail of gurl_base::void_t below. 
+template <typename...> +struct make_void { + using type = void; +}; + +} // namespace internal + +// gurl_base::void_t is an implementation of std::void_t from C++17. +// +// We use |gurl_base::internal::make_void| as a helper struct to avoid a C++14 +// defect: +// http://en.cppreference.com/w/cpp/types/void_t +// http://open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#1558 +template <typename... Ts> +using void_t = typename ::gurl_base::internal::make_void<Ts...>::type; + +namespace internal { + +// Uses expression SFINAE to detect whether using operator<< would work. +template <typename T, typename = void> +struct SupportsOstreamOperator : std::false_type {}; +template <typename T> +struct SupportsOstreamOperator<T, + decltype(void(std::declval<std::ostream&>() + << std::declval<T>()))> + : std::true_type {}; + +template <typename T, typename = void> +struct SupportsToString : std::false_type {}; +template <typename T> +struct SupportsToString<T, decltype(void(std::declval<T>().ToString()))> + : std::true_type {}; + +// Used to detect whether the given type is an iterator. This is normally used +// with std::enable_if to provide disambiguation for functions that take +// templatized iterators as input. +template <typename T, typename = void> +struct is_iterator : std::false_type {}; + +template <typename T> +struct is_iterator<T, + void_t<typename std::iterator_traits<T>::iterator_category>> + : std::true_type {}; + +} // namespace internal + +// is_trivially_copyable is especially hard to get right. +// - Older versions of libstdc++ will fail to have it like they do for other +// type traits. This has become a subset of the second point, but used to be +// handled independently. +// - An experimental release of gcc includes most of type_traits but misses +// is_trivially_copyable, so we still have to avoid using libstdc++ in this +// case, which is covered by CR_USE_FALLBACKS_FOR_OLD_EXPERIMENTAL_GLIBCXX. 
+// - When compiling libc++ from before r239653, with a gcc compiler, the +// std::is_trivially_copyable can fail. So we need to work around that by not +// using the one in libc++ in this case. This is covered by the +// CR_USE_FALLBACKS_FOR_GCC_WITH_LIBCXX define, and is discussed in +// https://llvm.org/bugs/show_bug.cgi?id=27538#c1 where they point out that +// in libc++'s commit r239653 this is fixed by libc++ checking for gcc 5.1. +// - In both of the above cases we are using the gcc compiler. When defining +// this ourselves on compiler intrinsics, the __is_trivially_copyable() +// intrinsic is not available on gcc before version 5.1 (see the discussion in +// https://llvm.org/bugs/show_bug.cgi?id=27538#c1 again), so we must check for +// that version. +// - When __is_trivially_copyable() is not available because we are on gcc older +// than 5.1, we need to fall back to something, so we use __has_trivial_copy() +// instead based on what was done one-off in bit_cast() previously. + +// TODO(crbug.com/554293): Remove this when all platforms have this in the std +// namespace and it works with gcc as needed. +#if defined(CR_USE_FALLBACKS_FOR_OLD_EXPERIMENTAL_GLIBCXX) || \ + defined(CR_USE_FALLBACKS_FOR_GCC_WITH_LIBCXX) +template <typename T> +struct is_trivially_copyable { +// TODO(danakj): Remove this when android builders are all using a newer version +// of gcc, or the android ndk is updated to a newer libc++ that does this for +// us. +#if _GNUC_VER >= 501 + static constexpr bool value = __is_trivially_copyable(T); +#else + static constexpr bool value = + __has_trivial_copy(T) && __has_trivial_destructor(T); +#endif +}; +#else +template <class T> +using is_trivially_copyable = std::is_trivially_copyable<T>; +#endif + +#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ <= 7 +// Workaround for g++7 and earlier family. 
+// Due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80654, without this +// Optional<std::vector<T>> where T is non-copyable causes a compile error. +// As we know it is not trivially copy constructible, explicitly declare so. +template <typename T> +struct is_trivially_copy_constructible + : std::is_trivially_copy_constructible<T> {}; + +template <typename... T> +struct is_trivially_copy_constructible<std::vector<T...>> : std::false_type {}; +#else +// Otherwise use std::is_trivially_copy_constructible as is. +template <typename T> +using is_trivially_copy_constructible = std::is_trivially_copy_constructible<T>; +#endif + +// gurl_base::in_place_t is an implementation of std::in_place_t from +// C++17. A tag type used to request in-place construction in template vararg +// constructors. + +// Specification: +// https://en.cppreference.com/w/cpp/utility/in_place +struct in_place_t {}; +constexpr in_place_t in_place = {}; + +// gurl_base::in_place_type_t is an implementation of std::in_place_type_t from +// C++17. A tag type used for in-place construction when the type to construct +// needs to be specified, such as with gurl_base::unique_any, designed to be a +// drop-in replacement. + +// Specification: +// http://en.cppreference.com/w/cpp/utility/in_place +template <typename T> +struct in_place_type_t {}; + +template <typename T> +struct is_in_place_type_t { + static constexpr bool value = false; +}; + +template <typename... Ts> +struct is_in_place_type_t<in_place_type_t<Ts...>> { + static constexpr bool value = true; +}; + +} // namespace gurl_base + +#undef CR_USE_FALLBACKS_FOR_GCC_WITH_LIBCXX +#undef CR_USE_FALLBACKS_FOR_OLD_EXPERIMENTAL_GLIBCXX + +#endif // BASE_TEMPLATE_UTIL_H_
diff --git a/base/third_party/icu/BUILD b/base/third_party/icu/BUILD new file mode 100644 index 0000000..97a033e --- /dev/null +++ b/base/third_party/icu/BUILD
@@ -0,0 +1,10 @@ +# Copyright 2019 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +cc_library( + name = "icu", + srcs = ["icu_utf.cc"], + hdrs = ["icu_utf.h"], + visibility = ["//visibility:public"], +)
diff --git a/base/third_party/icu/LICENSE b/base/third_party/icu/LICENSE new file mode 100644 index 0000000..2882e4e --- /dev/null +++ b/base/third_party/icu/LICENSE
@@ -0,0 +1,76 @@ +COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later) + +Copyright © 1991-2017 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in http://www.unicode.org/copyright.html + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. 
+ +--------------------- + +Third-Party Software Licenses + +This section contains third-party software notices and/or additional +terms for licensed third-party software components included within ICU +libraries. + +1. ICU License - ICU 1.8.1 to ICU 57.1 + +COPYRIGHT AND PERMISSION NOTICE + +Copyright (c) 1995-2016 International Business Machines Corporation and others +All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, and/or sell copies of the Software, and to permit persons +to whom the Software is furnished to do so, provided that the above +copyright notice(s) and this permission notice appear in all copies of +the Software and that both the above copyright notice(s) and this +permission notice appear in supporting documentation. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY +SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER +RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF +CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, use +or other dealings in this Software without prior written authorization +of the copyright holder. + +All trademarks and registered trademarks mentioned herein are the +property of their respective owners.
diff --git a/base/third_party/icu/README.chromium b/base/third_party/icu/README.chromium new file mode 100644 index 0000000..297e89a --- /dev/null +++ b/base/third_party/icu/README.chromium
@@ -0,0 +1,17 @@ +Name: ICU +URL: http://site.icu-project.org/ +Version: 60 +License: Unicode +License File: NOT_SHIPPED + +This file has the relevant components from ICU copied to handle basic UTF8/16/32 +conversions. Components are copied from umachine.h, utf.h, utf8.h, and utf16.h +into icu_utf.h, and from utf_impl.cpp into icu_utf.cc. + +The main change is that U_/U8_/U16_ prefixes have been replaced with +CBU_/CBU8_/CBU16_ (for "Chrome Base") to avoid confusion with the "real" ICU +macros should ICU be in use on the system. For the same reason, the functions +and types have been put in the "base_icu" namespace. + +Note that this license file is marked as NOT_SHIPPED, since a more complete +ICU license is included from //third_party/icu/README.chromium
diff --git a/base/third_party/icu/icu_utf.cc b/base/third_party/icu/icu_utf.cc new file mode 100644 index 0000000..a3262b0 --- /dev/null +++ b/base/third_party/icu/icu_utf.cc
@@ -0,0 +1,131 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 1999-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: utf_impl.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999sep13 +* created by: Markus W. Scherer +* +* This file provides implementation functions for macros in the utfXX.h +* that would otherwise be too long as macros. +*/ + +#include "base/third_party/icu/icu_utf.h" + +namespace base_icu { + +// source/common/utf_impl.cpp + +static const UChar32 +utf8_errorValue[6]={ + // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, + // but without relying on the obsolete unicode/utf_old.h. + 0x15, 0x9f, 0xffff, + 0x10ffff +}; + +static UChar32 +errorValue(int32_t count, int8_t strict) { + if(strict>=0) { + return utf8_errorValue[count]; + } else if(strict==-3) { + return 0xfffd; + } else { + return CBU_SENTINEL; + } +} + +/* + * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros + * and their obsolete sibling UTF8_NEXT_CHAR_SAFE(). + * + * U8_NEXT() supports NUL-terminated strings indicated via length<0. + * + * The "strict" parameter controls the error behavior: + * <0 "Safe" behavior of U8_NEXT(): + * -1: All illegal byte sequences yield U_SENTINEL=-1. + * -2: Same as -1, except for lenient treatment of surrogate code points as legal. + * Some implementations use this for roundtripping of + * Unicode 16-bit strings that are not well-formed UTF-16, that is, they + * contain unpaired surrogates. + * -3: All illegal byte sequences yield U+FFFD. 
+ * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE): + * All illegal byte sequences yield a positive code point such that this + * result code point would be encoded with the same number of bytes as + * the illegal sequence. + * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE): + * Same as the obsolete "safe" behavior, but non-characters are also treated + * like illegal sequences. + * + * Note that a UBool is the same as an int8_t. + */ +UChar32 +utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { + // *pi is one after byte c. + int32_t i=*pi; + // length can be negative for NUL-terminated strings: Read and validate one byte at a time. + if(i==length || c>0xf4) { + // end of string, or not a lead byte + } else if(c>=0xf0) { + // Test for 4-byte sequences first because + // U8_NEXT() handles shorter valid sequences inline. + uint8_t t1=s[i], t2, t3; + c&=7; + if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) && + ++i!=length && (t2=s[i]-0x80)<=0x3f && + ++i!=length && (t3=s[i]-0x80)<=0x3f) { + ++i; + c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3; + // strict: forbid non-characters like U+fffe + if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) { + *pi=i; + return c; + } + } + } else if(c>=0xe0) { + c&=0xf; + if(strict!=-2) { + uint8_t t1=s[i], t2; + if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) && + ++i!=length && (t2=s[i]-0x80)<=0x3f) { + ++i; + c=(c<<12)|((t1&0x3f)<<6)|t2; + // strict: forbid non-characters like U+fffe + if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) { + *pi=i; + return c; + } + } + } else { + // strict=-2 -> lenient: allow surrogates + uint8_t t1=s[i]-0x80, t2; + if(t1<=0x3f && (c>0 || t1>=0x20) && + ++i!=length && (t2=s[i]-0x80)<=0x3f) { + *pi=i+1; + return (c<<12)|(t1<<6)|t2; + } + } + } else if(c>=0xc2) { + uint8_t t1=s[i]-0x80; + if(t1<=0x3f) { + *pi=i+1; + return ((c-0xc0)<<6)|t1; + } + } // else 0x80<=c<0xc2 is not a lead byte + + /* error handling */ + c=errorValue(i-*pi, strict); + *pi=i; + return c; +} + 
+} // namespace base_icu
diff --git a/base/third_party/icu/icu_utf.h b/base/third_party/icu/icu_utf.h new file mode 100644 index 0000000..2ba8231 --- /dev/null +++ b/base/third_party/icu/icu_utf.h
@@ -0,0 +1,442 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 1999-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +*/ + +#ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_ +#define BASE_THIRD_PARTY_ICU_ICU_UTF_H_ + +#include <stdint.h> + +namespace base_icu { + +// source/common/unicode/umachine.h + +/** The ICU boolean type @stable ICU 2.0 */ +typedef int8_t UBool; + +/** + * Define UChar32 as a type for single Unicode code points. + * UChar32 is a signed 32-bit integer (same as int32_t). + * + * The Unicode code point range is 0..0x10ffff. + * All other values (negative or >=0x110000) are illegal as Unicode code points. + * They may be used as sentinel values to indicate "done", "error" + * or similar non-code point conditions. + * + * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined + * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned) + * or else to be uint32_t. + * That is, the definition of UChar32 was platform-dependent. + * + * @see U_SENTINEL + * @stable ICU 2.4 + */ +typedef int32_t UChar32; + +/** + * This value is intended for sentinel values for APIs that + * (take or) return single code points (UChar32). + * It is outside of the Unicode code point range 0..0x10ffff. + * + * For example, a "done" or "error" value in a new API + * could be indicated with U_SENTINEL. + * + * ICU APIs designed before ICU 2.4 usually define service-specific "done" + * values, mostly 0xffff. + * Those may need to be distinguished from + * actual U+ffff text contents by calling functions like + * CharacterIterator::hasNext() or UnicodeString::length(). 
+ * + * @return -1 + * @see UChar32 + * @stable ICU 2.4 + */ +#define CBU_SENTINEL (-1) + +// source/common/unicode/utf.h + +/** + * Is this code point a Unicode noncharacter? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU_IS_UNICODE_NONCHAR(c) \ + ((c)>=0xfdd0 && \ + ((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff) + +/** + * Is c a Unicode code point value (0..U+10ffff) + * that can be assigned a character? + * + * Code points that are not characters include: + * - single surrogate code points (U+d800..U+dfff, 2048 code points) + * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points) + * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) + * - the highest Unicode code point value is U+10ffff + * + * This means that all code points below U+d800 are character code points, + * and that boundary is tested first for performance. + * + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU_IS_UNICODE_CHAR(c) \ + ((uint32_t)(c)<0xd800 || \ + (0xdfff<(c) && (c)<=0x10ffff && !CBU_IS_UNICODE_NONCHAR(c))) + +/** + * Is this code point a surrogate (U+d800..U+dfff)? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) + +/** + * Assuming c is a surrogate code point (U_IS_SURROGATE(c)), + * is it a lead surrogate? + * @param c 32-bit code point + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) + +// source/common/unicode/utf8.h + +/** + * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * Lead byte E0..EF bits 3..0 are used as byte index, + * first trail byte bits 7..5 are used as bit index into that byte. 
+ * @see U8_IS_VALID_LEAD3_AND_T1 + * @internal + */ +#define CBU8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" + +/** + * Internal 3-byte UTF-8 validity check. + * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence. + * @internal + */ +#define CBU8_IS_VALID_LEAD3_AND_T1(lead, t1) (CBU8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) + +/** + * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * First trail byte bits 7..4 are used as byte index, + * lead byte F0..F4 bits 2..0 are used as bit index into that byte. + * @see U8_IS_VALID_LEAD4_AND_T1 + * @internal + */ +#define CBU8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" + +/** + * Internal 4-byte UTF-8 validity check. + * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence. + * @internal + */ +#define CBU8_IS_VALID_LEAD4_AND_T1(lead, t1) (CBU8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) + +/** + * Function for handling "next code point" with error-checking. + * + * This is internal since it is not meant to be called directly by external + * clients; + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros + * in this + * file and thus must remain stable, and should not be hidden when other + * internal + * functions are hidden (otherwise public macros would fail to compile). + * @internal + */ +UChar32 +utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu::UChar32 c, ::base_icu::UBool strict); + +/** + * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU8_IS_SINGLE(c) (((c)&0x80)==0) + +/** + * Is this code unit (byte) a UTF-8 lead byte? 
(0xC2..0xF4) + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32) + +/** + * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU8_IS_TRAIL(c) ((int8_t)(c)<-0x40) + +/** + * How many code units (bytes) are used for the UTF-8 encoding + * of this Unicode code point? + * @param c 32-bit code point + * @return 1..4, or 0 if c is a surrogate or not a Unicode code point + * @stable ICU 2.4 + */ +#define CBU8_LENGTH(c) \ + ((uint32_t)(c)<=0x7f ? 1 : \ + ((uint32_t)(c)<=0x7ff ? 2 : \ + ((uint32_t)(c)<=0xd7ff ? 3 : \ + ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ + ((uint32_t)(c)<=0xffff ? 3 : 4)\ + ) \ + ) \ + ) \ + ) + +/** + * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). + * @return 4 + * @stable ICU 2.4 + */ +#define CBU8_MAX_LENGTH 4 + +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * The offset may point to the lead byte of a multi-byte sequence, + * in which case the macro will read the whole sequence. + * If the offset points to a trail byte or an illegal UTF-8 sequence, then + * c is set to a negative value. 
+ * + * @param s const uint8_t * string + * @param i int32_t string offset, must be i<length + * @param length int32_t string length + * @param c output UChar32 variable, set to <0 in case of an error + * @see U8_NEXT_UNSAFE + * @stable ICU 2.4 + */ +#define CBU8_NEXT(s, i, length, c) { \ + (c)=(uint8_t)(s)[(i)++]; \ + if(!CBU8_IS_SINGLE(c)) { \ + uint8_t __t1, __t2; \ + if( /* handle U+0800..U+FFFF inline */ \ + (0xe0<=(c) && (c)<0xf0) && \ + (((i)+1)<(length) || (length)<0) && \ + CBU8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \ + (__t2=(s)[(i)+1]-0x80)<=0x3f) { \ + (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \ + (i)+=2; \ + } else if( /* handle U+0080..U+07FF inline */ \ + ((c)<0xe0 && (c)>=0xc2) && \ + ((i)!=(length)) && \ + (__t1=(s)[i]-0x80)<=0x3f) { \ + (c)=(((c)&0x1f)<<6)|__t1; \ + ++(i); \ + } else { \ + /* function call for "complicated" and error cases */ \ + (c)=::base_icu::utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \ + } \ + } \ +} + +/** + * Append a code point to a string, overwriting 1 to 4 bytes. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. + * Otherwise, the result is undefined. 
+ * + * @param s const uint8_t * string buffer + * @param i string offset + * @param c code point to append + * @see U8_APPEND + * @stable ICU 2.4 + */ +#define CBU8_APPEND_UNSAFE(s, i, c) { \ + if((uint32_t)(c)<=0x7f) { \ + (s)[(i)++]=(uint8_t)(c); \ + } else { \ + if((uint32_t)(c)<=0x7ff) { \ + (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ + } else { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ + } else { \ + (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ + (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ + } \ +} + +// source/common/unicode/utf16.h + +/** + * Does this code unit alone encode a code point (BMP, not a surrogate)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU16_IS_SINGLE(c) !CBU_IS_SURROGATE(c) + +/** + * Is this code unit a lead surrogate (U+d800..U+dbff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) + +/** + * Is this code unit a trail surrogate (U+dc00..U+dfff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) + +/** + * Is this code unit a surrogate (U+d800..U+dfff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c) + +/** + * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), + * is it a lead surrogate? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) + +/** + * Helper constant for U16_GET_SUPPLEMENTARY. + * @internal + */ +#define CBU16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) + +/** + * Get a supplementary code point value (U+10000..U+10ffff) + * from its lead and trail surrogates. 
+ * The result is undefined if the input values are not + * lead and trail surrogates. + * + * @param lead lead surrogate (U+d800..U+dbff) + * @param trail trail surrogate (U+dc00..U+dfff) + * @return supplementary code point (U+10000..U+10ffff) + * @stable ICU 2.4 + */ +#define CBU16_GET_SUPPLEMENTARY(lead, trail) \ + (((::base_icu::UChar32)(lead)<<10UL)+(::base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET) + +/** + * Get the lead surrogate (0xd800..0xdbff) for a + * supplementary code point (0x10000..0x10ffff). + * @param supplementary 32-bit code point (U+10000..U+10ffff) + * @return lead surrogate (U+d800..U+dbff) for supplementary + * @stable ICU 2.4 + */ +#define CBU16_LEAD(supplementary) (::base_icu::UChar)(((supplementary)>>10)+0xd7c0) + +/** + * Get the trail surrogate (0xdc00..0xdfff) for a + * supplementary code point (0x10000..0x10ffff). + * @param supplementary 32-bit code point (U+10000..U+10ffff) + * @return trail surrogate (U+dc00..U+dfff) for supplementary + * @stable ICU 2.4 + */ +#define CBU16_TRAIL(supplementary) (::base_icu::UChar)(((supplementary)&0x3ff)|0xdc00) + +/** + * How many 16-bit code units are used to encode this Unicode code point? (1 or 2) + * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff). + * @param c 32-bit code point + * @return 1 or 2 + * @stable ICU 2.4 + */ +#define CBU16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) + +/** + * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). + * @return 2 + * @stable ICU 2.4 + */ +#define CBU16_MAX_LENGTH 2 + +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. 
+ * + * The offset may point to the lead surrogate unit + * for a supplementary code point, in which case the macro will read + * the following trail surrogate as well. + * If the offset points to a trail surrogate or + * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate. + * + * @param s const UChar * string + * @param i string offset, must be i<length + * @param length string length + * @param c output UChar32 variable + * @see U16_NEXT_UNSAFE + * @stable ICU 2.4 + */ +#define CBU16_NEXT(s, i, length, c) { \ + (c)=(s)[(i)++]; \ + if(CBU16_IS_LEAD(c)) { \ + uint16_t __c2; \ + if((i)!=(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \ + ++(i); \ + (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } \ +} + +/** + * Append a code point to a string, overwriting 1 or 2 code units. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. + * Otherwise, the result is undefined. + * + * @param s const UChar * string buffer + * @param i string offset + * @param c code point to append + * @see U16_APPEND + * @stable ICU 2.4 + */ +#define CBU16_APPEND_UNSAFE(s, i, c) { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint16_t)(c); \ + } else { \ + (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } \ +} + +} // namespace base_icu + +#endif // BASE_THIRD_PARTY_ICU_ICU_UTF_H_
diff --git a/build/BUILD b/build/BUILD new file mode 100644 index 0000000..f057fe5 --- /dev/null +++ b/build/BUILD
@@ -0,0 +1,9 @@ +# Copyright 2019 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +cc_library( + name = "build_config", + hdrs = ["build_config.h"], + visibility = ["//visibility:public"], +)
diff --git a/build/build_config.h b/build/build_config.h new file mode 100644 index 0000000..0d87d80 --- /dev/null +++ b/build/build_config.h
@@ -0,0 +1,201 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This file adds defines about the platform we're currently building on. +// Operating System: +// OS_WIN / OS_MACOSX / OS_LINUX / OS_POSIX (MACOSX or LINUX) / +// OS_NACL (NACL_SFI or NACL_NONSFI) / OS_NACL_SFI / OS_NACL_NONSFI +// OS_CHROMEOS is set by the build system +// Compiler: +// COMPILER_MSVC / COMPILER_GCC +// Processor: +// ARCH_CPU_X86 / ARCH_CPU_X86_64 / ARCH_CPU_X86_FAMILY (X86 or X86_64) +// ARCH_CPU_32_BITS / ARCH_CPU_64_BITS + +#ifndef BUILD_BUILD_CONFIG_H_ +#define BUILD_BUILD_CONFIG_H_ + +// A set of macros to use for platform detection. +#if defined(__native_client__) +// __native_client__ must be first, so that other OS_ defines are not set. +#define OS_NACL 1 +// OS_NACL comes in two sandboxing technology flavors, SFI or Non-SFI. +// PNaCl toolchain defines __native_client_nonsfi__ macro in Non-SFI build +// mode, while it does not in SFI build mode. +#if defined(__native_client_nonsfi__) +#define OS_NACL_NONSFI +#else +#define OS_NACL_SFI +#endif +#elif defined(ANDROID) +#define OS_ANDROID 1 +#elif defined(__APPLE__) +// only include TargetConditionals after testing ANDROID as some android builds +// on mac don't have this header available and it's not needed unless the target +// is really mac/ios. +#include <TargetConditionals.h> +#define OS_MACOSX 1 +#if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE +#define OS_IOS 1 +#endif // defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE +#elif defined(__linux__) +#define OS_LINUX 1 +// include a system header to pull in features.h for glibc/uclibc macros. 
+#include <unistd.h> +#if defined(__GLIBC__) && !defined(__UCLIBC__) +// we really are using glibc, not uClibc pretending to be glibc +#define LIBC_GLIBC 1 +#endif +#elif defined(_WIN32) +#define OS_WIN 1 +#elif defined(__Fuchsia__) +#define OS_FUCHSIA 1 +#elif defined(__FreeBSD__) +#define OS_FREEBSD 1 +#elif defined(__NetBSD__) +#define OS_NETBSD 1 +#elif defined(__OpenBSD__) +#define OS_OPENBSD 1 +#elif defined(__sun) +#define OS_SOLARIS 1 +#elif defined(__QNXNTO__) +#define OS_QNX 1 +#elif defined(_AIX) +#define OS_AIX 1 +#elif defined(__asmjs__) +#define OS_ASMJS +#else +#error Please add support for your platform in build/build_config.h +#endif +// NOTE: Adding a new port? Please follow +// https://chromium.googlesource.com/chromium/src/+/master/docs/new_port_policy.md + +// For access to standard BSD features, use OS_BSD instead of a +// more specific macro. +#if defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) +#define OS_BSD 1 +#endif + +// For access to standard POSIXish features, use OS_POSIX instead of a +// more specific macro. +#if defined(OS_AIX) || defined(OS_ANDROID) || defined(OS_ASMJS) || \ + defined(OS_FREEBSD) || defined(OS_LINUX) || defined(OS_MACOSX) || \ + defined(OS_NACL) || defined(OS_NETBSD) || defined(OS_OPENBSD) || \ + defined(OS_QNX) || defined(OS_SOLARIS) +#define OS_POSIX 1 +#endif + +// Compiler detection. +#if defined(__GNUC__) +#define COMPILER_GCC 1 +#elif defined(_MSC_VER) +#define COMPILER_MSVC 1 +#else +#error Please add support for your compiler in build/build_config.h +#endif + +// Processor architecture detection. 
For more info on what's defined, see: +// http://msdn.microsoft.com/en-us/library/b0084kay.aspx +// http://www.agner.org/optimize/calling_conventions.pdf +// or with gcc, run: "echo | gcc -E -dM -" +#if defined(_M_X64) || defined(__x86_64__) +#define ARCH_CPU_X86_FAMILY 1 +#define ARCH_CPU_X86_64 1 +#define ARCH_CPU_64_BITS 1 +#define ARCH_CPU_LITTLE_ENDIAN 1 +#elif defined(_M_IX86) || defined(__i386__) +#define ARCH_CPU_X86_FAMILY 1 +#define ARCH_CPU_X86 1 +#define ARCH_CPU_32_BITS 1 +#define ARCH_CPU_LITTLE_ENDIAN 1 +#elif defined(__s390x__) +#define ARCH_CPU_S390_FAMILY 1 +#define ARCH_CPU_S390X 1 +#define ARCH_CPU_64_BITS 1 +#define ARCH_CPU_BIG_ENDIAN 1 +#elif defined(__s390__) +#define ARCH_CPU_S390_FAMILY 1 +#define ARCH_CPU_S390 1 +#define ARCH_CPU_31_BITS 1 +#define ARCH_CPU_BIG_ENDIAN 1 +#elif (defined(__PPC64__) || defined(__PPC__)) && defined(__BIG_ENDIAN__) +#define ARCH_CPU_PPC64_FAMILY 1 +#define ARCH_CPU_PPC64 1 +#define ARCH_CPU_64_BITS 1 +#define ARCH_CPU_BIG_ENDIAN 1 +#elif defined(__PPC64__) +#define ARCH_CPU_PPC64_FAMILY 1 +#define ARCH_CPU_PPC64 1 +#define ARCH_CPU_64_BITS 1 +#define ARCH_CPU_LITTLE_ENDIAN 1 +#elif defined(__ARMEL__) +#define ARCH_CPU_ARM_FAMILY 1 +#define ARCH_CPU_ARMEL 1 +#define ARCH_CPU_32_BITS 1 +#define ARCH_CPU_LITTLE_ENDIAN 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define ARCH_CPU_ARM_FAMILY 1 +#define ARCH_CPU_ARM64 1 +#define ARCH_CPU_64_BITS 1 +#define ARCH_CPU_LITTLE_ENDIAN 1 +#elif defined(__pnacl__) || defined(__asmjs__) +#define ARCH_CPU_32_BITS 1 +#define ARCH_CPU_LITTLE_ENDIAN 1 +#elif defined(__MIPSEL__) +#if defined(__LP64__) +#define ARCH_CPU_MIPS_FAMILY 1 +#define ARCH_CPU_MIPS64EL 1 +#define ARCH_CPU_64_BITS 1 +#define ARCH_CPU_LITTLE_ENDIAN 1 +#else +#define ARCH_CPU_MIPS_FAMILY 1 +#define ARCH_CPU_MIPSEL 1 +#define ARCH_CPU_32_BITS 1 +#define ARCH_CPU_LITTLE_ENDIAN 1 +#endif +#elif defined(__MIPSEB__) +#if defined(__LP64__) +#define ARCH_CPU_MIPS_FAMILY 1 +#define ARCH_CPU_MIPS64 1 +#define 
ARCH_CPU_64_BITS 1 +#define ARCH_CPU_BIG_ENDIAN 1 +#else +#define ARCH_CPU_MIPS_FAMILY 1 +#define ARCH_CPU_MIPS 1 +#define ARCH_CPU_32_BITS 1 +#define ARCH_CPU_BIG_ENDIAN 1 +#endif +#else +#error Please add support for your architecture in build/build_config.h +#endif + +// Type detection for wchar_t. +#if defined(OS_WIN) +#define WCHAR_T_IS_UTF16 +#elif defined(OS_FUCHSIA) +#define WCHAR_T_IS_UTF32 +#elif defined(OS_POSIX) && defined(COMPILER_GCC) && defined(__WCHAR_MAX__) && \ + (__WCHAR_MAX__ == 0x7fffffff || __WCHAR_MAX__ == 0xffffffff) +#define WCHAR_T_IS_UTF32 +#elif defined(OS_POSIX) && defined(COMPILER_GCC) && defined(__WCHAR_MAX__) && \ + (__WCHAR_MAX__ == 0x7fff || __WCHAR_MAX__ == 0xffff) +// On Posix, we'll detect short wchar_t, but projects aren't guaranteed to +// compile in this mode (in particular, Chrome doesn't). This is intended for +// other projects using base who manage their own dependencies and make sure +// short wchar works for them. +#define WCHAR_T_IS_UTF16 +#else +#error Please add support for your compiler in build/build_config.h +#endif + +#if defined(OS_ANDROID) +// The compiler thinks std::string::const_iterator and "const char*" are +// equivalent types. +#define STD_STRING_ITERATOR_IS_CHAR_POINTER +// The compiler thinks gurl_base::string16::const_iterator and "char16*" are +// equivalent types. +#define BASE_STRING16_ITERATOR_IS_CHAR16_POINTER +#endif + +#endif // BUILD_BUILD_CONFIG_H_
diff --git a/copy.bara.sky b/copy.bara.sky new file mode 100644 index 0000000..02a615d --- /dev/null +++ b/copy.bara.sky
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Copybara configuration that imports the URL library (and the parts of //base
# it needs) from a Chromium checkout, renaming symbols so that the result does
# not clash with other copies of Chromium's base.

# Ideally, we would import things from Chromium Git. However, checking out
# Chromium is *really* slow, so we use a local checkout instead.
origin = folder.origin()

# Import all URL-related files, plus some parts of //base, primarily those
# related to string handling.
import_list = glob(
    include = [
        "AUTHORS",
        "LICENSE",
        "base/compiler_specific.h",
        "base/macros.h",
        "base/debug/leak_annotations.h",
        "base/no_destructor.h",
        "base/optional.h",
        "base/stl_util.h",
        "base/template_util.h",
        "base/strings/*.cc",
        "base/strings/*.h",
        "base/third_party/icu/**",
        "build/build_config.h",
        "url/*.cc",
        "url/*.h",
        "url/third_party/mozilla/**",
    ],
    exclude = [
        "url/url_idna_icu_alternatives*",
    ],
)

# Files in the destination that the import is allowed to overwrite. BUILD files
# are maintained by hand in the destination, so they are excluded.
target_files = glob(
    include = [
        "base/**",
        "build/**",
        "url/**",
        "AUTHORS",
        "LICENSE",
    ],
    exclude = [
        "**/BUILD",
    ],
)

# Those headers are pulled from //polyfills instead of copied from Chromium.
# Should be in sync with //polyfills/BUILD.
polyfilled_headers = [
    "base/base_export.h",
    "base/component_export.h",
    "base/debug/alias.h",
    "base/export_template.h",
    "base/logging.h",
    "base/trace_event/memory_usage_estimator.h",
]

transformations = [
    # Prefix the logging-related macros.
    core.replace(
        "${log}",
        "GURL_${log}",
        regex_groups = {"log": "\\bD?(LOG|CHECK|CHECK_(EQ|LT|GT|LE|GE|NE))\\b"},
    ),
    core.replace("DCHECK_IS_ON", "GURL_DCHECK_IS_ON"),
    core.replace("NOTREACHED()", "GURL_NOTREACHED()"),

    # Rename base:: to gurl_base::
    core.replace("namespace base ", "namespace gurl_base "),
    core.replace("base::", "gurl_base::"),

    # Ugly hack. In Chromium, ICU is built with UChar = uint16_t. We can't
    # really do that with the system ICU, so we have to work this around with a
    # cast.
    core.replace(
        "src, src_len, output->data(),",
        "(UChar*)src, src_len, (UChar*)output->data(),",
    ),

    # Use system ICU.
    # The backslash is doubled so that the regex engine receives `\w+`; a bare
    # "\w" is not a valid Starlark string escape.
    core.replace(
        '"third_party/icu/source/common/unicode/${file}.h"',
        "<unicode/${file}.h>",
        regex_groups = {"file": "\\w+"},
    ),
]

# Redirect includes of the polyfilled headers to their //polyfills stand-ins.
transformations += [
    core.replace('#include "%s"' % header, '#include "polyfills/%s"' % header)
    for header in polyfilled_headers
]

core.workflow(
    name = "import",
    origin = origin,
    origin_files = import_list,
    destination = folder.destination(),
    destination_files = target_files,
    authoring = authoring.pass_thru("GURL Maintainers <noreply@google.com>"),
    mode = "SQUASH",
    transformations = transformations,
)
diff --git a/polyfills/BUILD b/polyfills/BUILD new file mode 100644 index 0000000..e80d717 --- /dev/null +++ b/polyfills/BUILD
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Header-only library of stand-ins for Chromium headers that are not imported.
# The hdrs list mirrors `polyfilled_headers` in //copy.bara.sky; keep the two
# lists in sync.
cc_library(
    name = "polyfills",
    hdrs = [
        "base/base_export.h",
        "base/component_export.h",
        "base/debug/alias.h",
        "base/export_template.h",
        "base/logging.h",
        "base/trace_event/memory_usage_estimator.h",
    ],
    visibility = ["//visibility:public"],
)
diff --git a/polyfills/base/base_export.h b/polyfills/base/base_export.h new file mode 100644 index 0000000..209e910 --- /dev/null +++ b/polyfills/base/base_export.h
// Copyright (c) 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef POLYFILLS_BASE_BASE_EXPORT_H_
#define POLYFILLS_BASE_BASE_EXPORT_H_

// Stand-in for Chromium's base/base_export.h (copy.bara.sky rewrites includes
// of that header to this file). BASE_EXPORT expands to nothing here, so
// declarations annotated with it are left unchanged.
#define BASE_EXPORT

#endif /* POLYFILLS_BASE_BASE_EXPORT_H_ */
diff --git a/polyfills/base/component_export.h b/polyfills/base/component_export.h new file mode 100644 index 0000000..3ce2ab1 --- /dev/null +++ b/polyfills/base/component_export.h
// Copyright (c) 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef POLYFILLS_BASE_COMPONENT_EXPORT_H_
#define POLYFILLS_BASE_COMPONENT_EXPORT_H_

// Stand-in for Chromium's base/component_export.h. COMPONENT_EXPORT(x) expands
// to nothing regardless of its argument, so e.g. `class COMPONENT_EXPORT(URL)
// GURL` in url/gurl.h is compiled as a plain `class GURL`.
#define COMPONENT_EXPORT(component)

#endif /* POLYFILLS_BASE_COMPONENT_EXPORT_H_ */
diff --git a/polyfills/base/debug/alias.h b/polyfills/base/debug/alias.h new file mode 100644 index 0000000..df9b5dc --- /dev/null +++ b/polyfills/base/debug/alias.h
// Copyright (c) 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef POLYFILLS_BASE_DEBUG_ALIAS_H_
#define POLYFILLS_BASE_DEBUG_ALIAS_H_

// Stand-in for Chromium's base/debug/alias.h. The macro expands to nothing:
// none of its arguments are evaluated and no variable named |var_name| is
// declared, so callers must not refer to |var_name| afterwards.
#define DEBUG_ALIAS_FOR_CSTR(var_name, c_str, char_count)

#endif  // POLYFILLS_BASE_DEBUG_ALIAS_H_
diff --git a/polyfills/base/export_template.h b/polyfills/base/export_template.h new file mode 100644 index 0000000..2b56e07 --- /dev/null +++ b/polyfills/base/export_template.h
// Copyright (c) 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef POLYFILLS_BASE_EXPORT_TEMPLATE_H_
#define POLYFILLS_BASE_EXPORT_TEMPLATE_H_

// Stand-in for Chromium's base/export_template.h. Both macros discard their
// argument and expand to nothing, so explicit template declarations and
// definitions annotated with them carry no export attribute.
#define EXPORT_TEMPLATE_DEFINE(export)
#define EXPORT_TEMPLATE_DECLARE(export)

#endif /* POLYFILLS_BASE_EXPORT_TEMPLATE_H_ */
diff --git a/polyfills/base/logging.h b/polyfills/base/logging.h new file mode 100644 index 0000000..def1745 --- /dev/null +++ b/polyfills/base/logging.h
// Copyright (c) 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef POLYFILLS_BASE_LOGGING_H_
#define POLYFILLS_BASE_LOGGING_H_

// The upstream header includes this, and some of the copied files actually rely
// on this.
#include <string.h>

// No-op replacement for the logging/CHECK stream objects. It can be
// constructed from one or two values of any type (the operands of a check) and
// silently swallows everything streamed into it with operator<<.
class GurlFakeLogSink {
 public:
  template <typename T1>
  GurlFakeLogSink(T1) {}
  template <typename T1, typename T2>
  GurlFakeLogSink(T1, T2) {}

  template <typename T>
  GurlFakeLogSink& operator<<(const T&) { return *this; }
};

// The import renames LOG, CHECK, CHECK_{EQ,LT,GT,LE,GE,NE} and their D-prefixed
// variants to GURL_*, so every one of those names is defined here. Each macro
// still evaluates its operands (side effects are preserved) but never fails
// and never prints anything.
#define GURL_CHECK(statement) GurlFakeLogSink({statement})
#define GURL_CHECK_EQ(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_CHECK_NE(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_CHECK_LT(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_CHECK_LE(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_CHECK_GT(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_CHECK_GE(statement, statement2) GurlFakeLogSink({statement, statement2})

#define GURL_DCHECK_IS_ON() false
#define GURL_DCHECK(statement) GurlFakeLogSink({statement})
#define GURL_DCHECK_EQ(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_DCHECK_NE(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_DCHECK_LT(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_DCHECK_LE(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_DCHECK_GT(statement, statement2) GurlFakeLogSink({statement, statement2})
#define GURL_DCHECK_GE(statement, statement2) GurlFakeLogSink({statement, statement2})

#define GURL_DLOG(severity) GurlFakeLogSink(true)
#define GURL_LOG(severity) GurlFakeLogSink(true)

// Expands to a sink (rather than to nothing) so that both `GURL_NOTREACHED();`
// and `GURL_NOTREACHED() << "message";` compile.
#define GURL_NOTREACHED() GurlFakeLogSink(true)

#endif /* POLYFILLS_BASE_LOGGING_H_ */
diff --git a/polyfills/base/trace_event/memory_usage_estimator.h b/polyfills/base/trace_event/memory_usage_estimator.h new file mode 100644 index 0000000..6ef1bc3 --- /dev/null +++ b/polyfills/base/trace_event/memory_usage_estimator.h
// Copyright (c) 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef POLYFILLS_BASE_TRACE_EVENT_MEMORY_USAGE_ESTIMATOR_H_
#define POLYFILLS_BASE_TRACE_EVENT_MEMORY_USAGE_ESTIMATOR_H_

// For size_t; this header must be usable without relying on what the includer
// happened to pull in first.
#include <stddef.h>

namespace gurl_base {
namespace trace_event {

// Stand-in for the upstream memory estimator: accepts an object of any type,
// ignores it and always reports 0 bytes.
template <class T>
size_t EstimateMemoryUsage(const T& /* object */) { return 0; }

}  // namespace trace_event
}  // namespace gurl_base

#endif /* POLYFILLS_BASE_TRACE_EVENT_MEMORY_USAGE_ESTIMATOR_H_ */
diff --git a/test/BUILD b/test/BUILD new file mode 100644 index 0000000..05c578b --- /dev/null +++ b/test/BUILD
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Smoke test built from basic_test.cc; its only dependency is the //url
# library.
cc_test(
    name = "basic_test",
    srcs = ["basic_test.cc"],
    deps = ["//url"],
)
diff --git a/test/basic_test.cc b/test/basic_test.cc new file mode 100644 index 0000000..f60df57 --- /dev/null +++ b/test/basic_test.cc
@@ -0,0 +1,35 @@ +// Copyright (c) 2019 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Basic smoke test to ensure that GURL works properly. + +#include "url/gurl.h" + +#include <cstdlib> +#include <iostream> + +#define ASSERT_EQ(v1, v2) \ + if ((v1) != (v2)) { \ + std::cerr << "Expected equality of" << std::endl \ + << " " << #v1 << " (equal to " << (v1) << ")" << std::endl \ + << "and" << std::endl \ + << " " << #v2 << " (equal to " << (v2) << ")" << std::endl; \ + return 1; \ + } + +int main(int argc, char** argv) { + GURL url("https://example.org/test?foo=bar#section"); + ASSERT_EQ(url.scheme(), "https"); + ASSERT_EQ(url.host(), "example.org"); + ASSERT_EQ(url.EffectiveIntPort(), 443); + ASSERT_EQ(url.path(), "/test"); + ASSERT_EQ(url.query(), "foo=bar"); + ASSERT_EQ(url.ref(), "section"); + + // Ensure ICU is functioning correctly. + GURL idn_url("https://\xe5\x85\x89.example/"); + ASSERT_EQ(idn_url.spec(), "https://xn--54q.example/"); + + return 0; +}
diff --git a/url/BUILD b/url/BUILD new file mode 100644 index 0000000..ec01ee3 --- /dev/null +++ b/url/BUILD
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# The URL library. Implementation files and internal headers go in srcs; the
# headers other targets may include go in hdrs.
cc_library(
    name = "url",
    srcs = [
        "gurl.cc",
        "third_party/mozilla/url_parse.cc",
        "url_canon.cc",
        "url_canon_etc.cc",
        "url_canon_filesystemurl.cc",
        "url_canon_fileurl.cc",
        "url_canon_host.cc",
        "url_canon_internal.cc",
        "url_canon_internal.h",
        "url_canon_internal_file.h",
        "url_canon_ip.cc",
        "url_canon_mailtourl.cc",
        "url_canon_path.cc",
        "url_canon_pathurl.cc",
        "url_canon_query.cc",
        "url_canon_relative.cc",
        "url_canon_stdstring.cc",
        "url_canon_stdurl.cc",
        "url_constants.cc",
        "url_idna_icu.cc",
        "url_parse_file.cc",
        "url_parse_internal.h",
        "url_util.cc",
        "url_util_internal.h",
    ],
    hdrs = [
        "gurl.h",
        "third_party/mozilla/url_parse.h",
        "url_canon.h",
        "url_canon_icu.h",
        "url_canon_ip.h",
        "url_canon_stdstring.h",
        "url_constants.h",
        "url_file.h",
        "url_util.h",
    ],
    # Link against the ICU "common" library (libicuuc) rather than a bundled
    # copy of ICU.
    linkopts = ["-licuuc"],
    visibility = ["//visibility:public"],
    deps = [
        "//base",
        "//base/strings",
        "//polyfills",
    ],
)
diff --git a/url/gurl.cc b/url/gurl.cc new file mode 100644 index 0000000..c8e424f --- /dev/null +++ b/url/gurl.cc
@@ -0,0 +1,532 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/gurl.h" + +#include <stddef.h> + +#include <algorithm> +#include <ostream> +#include <utility> + +#include "polyfills/base/logging.h" +#include "base/no_destructor.h" +#include "base/strings/string_piece.h" +#include "base/strings/string_util.h" +#include "polyfills/base/trace_event/memory_usage_estimator.h" +#include "url/url_canon_stdstring.h" +#include "url/url_util.h" + +GURL::GURL() : is_valid_(false) { +} + +GURL::GURL(const GURL& other) + : spec_(other.spec_), + is_valid_(other.is_valid_), + parsed_(other.parsed_) { + if (other.inner_url_) + inner_url_.reset(new GURL(*other.inner_url_)); + // Valid filesystem urls should always have an inner_url_. + GURL_DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_); +} + +GURL::GURL(GURL&& other) noexcept + : spec_(std::move(other.spec_)), + is_valid_(other.is_valid_), + parsed_(other.parsed_), + inner_url_(std::move(other.inner_url_)) { + other.is_valid_ = false; + other.parsed_ = url::Parsed(); +} + +GURL::GURL(gurl_base::StringPiece url_string) { + InitCanonical(url_string, true); +} + +GURL::GURL(gurl_base::StringPiece16 url_string) { + InitCanonical(url_string, true); +} + +GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) { + InitCanonical(gurl_base::StringPiece(url_string), false); +} + +GURL::GURL(const char* canonical_spec, + size_t canonical_spec_len, + const url::Parsed& parsed, + bool is_valid) + : spec_(canonical_spec, canonical_spec_len), + is_valid_(is_valid), + parsed_(parsed) { + InitializeFromCanonicalSpec(); +} + +GURL::GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid) + : spec_(std::move(canonical_spec)), is_valid_(is_valid), parsed_(parsed) { + InitializeFromCanonicalSpec(); +} + +template<typename STR> +void 
GURL::InitCanonical(gurl_base::BasicStringPiece<STR> input_spec, + bool trim_path_end) { + url::StdStringCanonOutput output(&spec_); + is_valid_ = url::Canonicalize( + input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end, + NULL, &output, &parsed_); + + output.Complete(); // Must be done before using string. + if (is_valid_ && SchemeIsFileSystem()) { + inner_url_.reset(new GURL(spec_.data(), parsed_.Length(), + *parsed_.inner_parsed(), true)); + } + // Valid URLs always have non-empty specs. + GURL_DCHECK(!is_valid_ || !spec_.empty()); +} + +void GURL::InitializeFromCanonicalSpec() { + if (is_valid_ && SchemeIsFileSystem()) { + inner_url_.reset( + new GURL(spec_.data(), parsed_.Length(), + *parsed_.inner_parsed(), true)); + } + +#ifndef NDEBUG + // For testing purposes, check that the parsed canonical URL is identical to + // what we would have produced. Skip checking for invalid URLs have no meaning + // and we can't always canonicalize then reproducibly. + if (is_valid_) { + GURL_DCHECK(!spec_.empty()); + url::Component scheme; + // We can't do this check on the inner_url of a filesystem URL, as + // canonical_spec actually points to the start of the outer URL, so we'd + // end up with infinite recursion in this constructor. + if (!url::FindAndCompareScheme(spec_.data(), spec_.length(), + url::kFileSystemScheme, &scheme) || + scheme.begin == parsed_.scheme.begin) { + // We need to retain trailing whitespace on path URLs, as the |parsed_| + // spec we originally received may legitimately contain trailing white- + // space on the path or components e.g. if the #ref has been + // removed from a "foo:hello #ref" URL (see http://crbug.com/291747). 
+ GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE); + + GURL_DCHECK(test_url.is_valid_ == is_valid_); + GURL_DCHECK(test_url.spec_ == spec_); + + GURL_DCHECK(test_url.parsed_.scheme == parsed_.scheme); + GURL_DCHECK(test_url.parsed_.username == parsed_.username); + GURL_DCHECK(test_url.parsed_.password == parsed_.password); + GURL_DCHECK(test_url.parsed_.host == parsed_.host); + GURL_DCHECK(test_url.parsed_.port == parsed_.port); + GURL_DCHECK(test_url.parsed_.path == parsed_.path); + GURL_DCHECK(test_url.parsed_.query == parsed_.query); + GURL_DCHECK(test_url.parsed_.ref == parsed_.ref); + } + } +#endif +} + +GURL::~GURL() = default; + +GURL& GURL::operator=(const GURL& other) { + spec_ = other.spec_; + is_valid_ = other.is_valid_; + parsed_ = other.parsed_; + + if (!other.inner_url_) + inner_url_.reset(); + else if (inner_url_) + *inner_url_ = *other.inner_url_; + else + inner_url_.reset(new GURL(*other.inner_url_)); + + return *this; +} + +GURL& GURL::operator=(GURL&& other) noexcept { + spec_ = std::move(other.spec_); + is_valid_ = other.is_valid_; + parsed_ = other.parsed_; + inner_url_ = std::move(other.inner_url_); + + other.is_valid_ = false; + other.parsed_ = url::Parsed(); + return *this; +} + +const std::string& GURL::spec() const { + if (is_valid_ || spec_.empty()) + return spec_; + + GURL_DCHECK(false) << "Trying to get the spec of an invalid URL!"; + return gurl_base::EmptyString(); +} + +bool GURL::operator<(const GURL& other) const { + return spec_ < other.spec_; +} + +bool GURL::operator>(const GURL& other) const { + return spec_ > other.spec_; +} + +// Note: code duplicated below (it's inconvenient to use a template here). +GURL GURL::Resolve(gurl_base::StringPiece relative) const { + // Not allowed for invalid URLs. 
+ if (!is_valid_) + return GURL(); + + GURL result; + url::StdStringCanonOutput output(&result.spec_); + if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()), + parsed_, relative.data(), + static_cast<int>(relative.length()), + nullptr, &output, &result.parsed_)) { + // Error resolving, return an empty URL. + return GURL(); + } + + output.Complete(); + result.is_valid_ = true; + if (result.SchemeIsFileSystem()) { + result.inner_url_.reset( + new GURL(result.spec_.data(), result.parsed_.Length(), + *result.parsed_.inner_parsed(), true)); + } + return result; +} + +// Note: code duplicated above (it's inconvenient to use a template here). +GURL GURL::Resolve(gurl_base::StringPiece16 relative) const { + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + GURL result; + url::StdStringCanonOutput output(&result.spec_); + if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()), + parsed_, relative.data(), + static_cast<int>(relative.length()), + nullptr, &output, &result.parsed_)) { + // Error resolving, return an empty URL. + return GURL(); + } + + output.Complete(); + result.is_valid_ = true; + if (result.SchemeIsFileSystem()) { + result.inner_url_.reset( + new GURL(result.spec_.data(), result.parsed_.Length(), + *result.parsed_.inner_parsed(), true)); + } + return result; +} + +// Note: code duplicated below (it's inconvenient to use a template here). +GURL GURL::ReplaceComponents( + const url::Replacements<char>& replacements) const { + GURL result; + + // Not allowed for invalid URLs. 
+ if (!is_valid_) + return GURL(); + + url::StdStringCanonOutput output(&result.spec_); + result.is_valid_ = url::ReplaceComponents( + spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, + NULL, &output, &result.parsed_); + + output.Complete(); + if (result.is_valid_ && result.SchemeIsFileSystem()) { + result.inner_url_.reset(new GURL(result.spec_.data(), + result.parsed_.Length(), + *result.parsed_.inner_parsed(), true)); + } + return result; +} + +// Note: code duplicated above (it's inconvenient to use a template here). +GURL GURL::ReplaceComponents( + const url::Replacements<gurl_base::char16>& replacements) const { + GURL result; + + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + url::StdStringCanonOutput output(&result.spec_); + result.is_valid_ = url::ReplaceComponents( + spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, + NULL, &output, &result.parsed_); + + output.Complete(); + if (result.is_valid_ && result.SchemeIsFileSystem()) { + result.inner_url_.reset(new GURL(result.spec_.data(), + result.parsed_.Length(), + *result.parsed_.inner_parsed(), true)); + } + return result; +} + +GURL GURL::GetOrigin() const { + // This doesn't make sense for invalid or nonstandard URLs, so return + // the empty URL. 
+ if (!is_valid_ || !IsStandard()) + return GURL(); + + if (SchemeIsFileSystem()) + return inner_url_->GetOrigin(); + + url::Replacements<char> replacements; + replacements.ClearUsername(); + replacements.ClearPassword(); + replacements.ClearPath(); + replacements.ClearQuery(); + replacements.ClearRef(); + + return ReplaceComponents(replacements); +} + +GURL GURL::GetAsReferrer() const { + if (!SchemeIsValidForReferrer()) + return GURL(); + + if (!has_ref() && !has_username() && !has_password()) + return GURL(*this); + + url::Replacements<char> replacements; + replacements.ClearRef(); + replacements.ClearUsername(); + replacements.ClearPassword(); + return ReplaceComponents(replacements); +} + +GURL GURL::GetWithEmptyPath() const { + // This doesn't make sense for invalid or nonstandard URLs, so return + // the empty URL. + if (!is_valid_ || !IsStandard()) + return GURL(); + + // We could optimize this since we know that the URL is canonical, and we are + // appending a canonical path, so avoiding re-parsing. + GURL other(*this); + if (parsed_.path.len == 0) + return other; + + // Clear everything after the path. + other.parsed_.query.reset(); + other.parsed_.ref.reset(); + + // Set the path, since the path is longer than one, we can just set the + // first character and resize. 
+ other.spec_[other.parsed_.path.begin] = '/'; + other.parsed_.path.len = 1; + other.spec_.resize(other.parsed_.path.begin + 1); + return other; +} + +GURL GURL::GetWithoutFilename() const { + return Resolve("."); +} + +bool GURL::IsStandard() const { + return url::IsStandard(spec_.data(), parsed_.scheme); +} + +bool GURL::IsAboutBlank() const { + return IsAboutUrl(url::kAboutBlankPath); +} + +bool GURL::IsAboutSrcdoc() const { + return IsAboutUrl(url::kAboutSrcdocPath); +} + +bool GURL::SchemeIs(gurl_base::StringPiece lower_ascii_scheme) const { + GURL_DCHECK(gurl_base::IsStringASCII(lower_ascii_scheme)); + GURL_DCHECK(gurl_base::ToLowerASCII(lower_ascii_scheme) == lower_ascii_scheme); + + if (parsed_.scheme.len <= 0) + return lower_ascii_scheme.empty(); + return scheme_piece() == lower_ascii_scheme; +} + +bool GURL::SchemeIsHTTPOrHTTPS() const { + return SchemeIs(url::kHttpScheme) || SchemeIs(url::kHttpsScheme); +} + +bool GURL::SchemeIsValidForReferrer() const { + return is_valid_ && IsReferrerScheme(spec_.data(), parsed_.scheme); +} + +bool GURL::SchemeIsWSOrWSS() const { + return SchemeIs(url::kWsScheme) || SchemeIs(url::kWssScheme); +} + +bool GURL::SchemeIsCryptographic() const { + if (parsed_.scheme.len <= 0) + return false; + return SchemeIsCryptographic(scheme_piece()); +} + +bool GURL::SchemeIsCryptographic(gurl_base::StringPiece lower_ascii_scheme) { + GURL_DCHECK(gurl_base::IsStringASCII(lower_ascii_scheme)); + GURL_DCHECK(gurl_base::ToLowerASCII(lower_ascii_scheme) == lower_ascii_scheme); + + return lower_ascii_scheme == url::kHttpsScheme || + lower_ascii_scheme == url::kWssScheme; +} + +int GURL::IntPort() const { + if (parsed_.port.is_nonempty()) + return url::ParsePort(spec_.data(), parsed_.port); + return url::PORT_UNSPECIFIED; +} + +int GURL::EffectiveIntPort() const { + int int_port = IntPort(); + if (int_port == url::PORT_UNSPECIFIED && IsStandard()) + return url::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin, + parsed_.scheme.len); 
+ return int_port; +} + +std::string GURL::ExtractFileName() const { + url::Component file_component; + url::ExtractFileName(spec_.data(), parsed_.path, &file_component); + return ComponentString(file_component); +} + +std::string GURL::PathForRequest() const { + GURL_DCHECK(parsed_.path.len > 0) + << "Canonical path for requests should be non-empty"; + if (parsed_.ref.len >= 0) { + // Clip off the reference when it exists. The reference starts after the + // #-sign, so we have to subtract one to also remove it. + return std::string(spec_, parsed_.path.begin, + parsed_.ref.begin - parsed_.path.begin - 1); + } + // Compute the actual path length, rather than depending on the spec's + // terminator. If we're an inner_url, our spec continues on into our outer + // URL's path/query/ref. + int path_len = parsed_.path.len; + if (parsed_.query.is_valid()) + path_len = parsed_.query.end() - parsed_.path.begin; + + return std::string(spec_, parsed_.path.begin, path_len); +} + +std::string GURL::HostNoBrackets() const { + return HostNoBracketsPiece().as_string(); +} + +gurl_base::StringPiece GURL::HostNoBracketsPiece() const { + // If host looks like an IPv6 literal, strip the square brackets. 
+ url::Component h(parsed_.host); + if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') { + h.begin++; + h.len -= 2; + } + return ComponentStringPiece(h); +} + +std::string GURL::GetContent() const { + if (!is_valid_) + return std::string(); + std::string content = ComponentString(parsed_.GetContent()); + if (!SchemeIs(url::kJavaScriptScheme) && parsed_.ref.len >= 0) + content.erase(content.size() - parsed_.ref.len - 1); + return content; +} + +bool GURL::HostIsIPAddress() const { + return is_valid_ && url::HostIsIPAddress(host_piece()); +} + +const GURL& GURL::EmptyGURL() { + static gurl_base::NoDestructor<GURL> empty_gurl; + return *empty_gurl; +} + +bool GURL::DomainIs(gurl_base::StringPiece canonical_domain) const { + if (!is_valid_) + return false; + + // FileSystem URLs have empty host_piece, so check this first. + if (inner_url_ && SchemeIsFileSystem()) + return inner_url_->DomainIs(canonical_domain); + return url::DomainIs(host_piece(), canonical_domain); +} + +bool GURL::EqualsIgnoringRef(const GURL& other) const { + int ref_position = parsed_.CountCharactersBefore(url::Parsed::REF, true); + int ref_position_other = + other.parsed_.CountCharactersBefore(url::Parsed::REF, true); + return gurl_base::StringPiece(spec_).substr(0, ref_position) == + gurl_base::StringPiece(other.spec_).substr(0, ref_position_other); +} + +void GURL::Swap(GURL* other) { + spec_.swap(other->spec_); + std::swap(is_valid_, other->is_valid_); + std::swap(parsed_, other->parsed_); + inner_url_.swap(other->inner_url_); +} + +size_t GURL::EstimateMemoryUsage() const { + return gurl_base::trace_event::EstimateMemoryUsage(spec_) + + gurl_base::trace_event::EstimateMemoryUsage(inner_url_) + + (parsed_.inner_parsed() ? 
sizeof(url::Parsed) : 0); +} + +bool GURL::IsAboutUrl(gurl_base::StringPiece allowed_path) const { + if (!SchemeIs(url::kAboutScheme)) + return false; + + if (has_host() || has_username() || has_password() || has_port()) + return false; + + if (!path_piece().starts_with(allowed_path)) + return false; + + if (path_piece().size() == allowed_path.size()) { + GURL_DCHECK_EQ(path_piece(), allowed_path); + return true; + } + + if ((path_piece().size() == allowed_path.size() + 1) && + path_piece().back() == '/') { + GURL_DCHECK_EQ(path_piece(), allowed_path.as_string() + '/'); + return true; + } + + return false; +} + +std::ostream& operator<<(std::ostream& out, const GURL& url) { + return out << url.possibly_invalid_spec(); +} + +bool operator==(const GURL& x, const GURL& y) { + return x.possibly_invalid_spec() == y.possibly_invalid_spec(); +} + +bool operator!=(const GURL& x, const GURL& y) { + return !(x == y); +} + +bool operator==(const GURL& x, const gurl_base::StringPiece& spec) { + GURL_DCHECK_EQ(GURL(spec).possibly_invalid_spec(), spec); + return x.possibly_invalid_spec() == spec; +} + +bool operator==(const gurl_base::StringPiece& spec, const GURL& x) { + return x == spec; +} + +bool operator!=(const GURL& x, const gurl_base::StringPiece& spec) { + return !(x == spec); +} + +bool operator!=(const gurl_base::StringPiece& spec, const GURL& x) { + return !(x == spec); +}
diff --git a/url/gurl.h b/url/gurl.h new file mode 100644 index 0000000..8c026f7 --- /dev/null +++ b/url/gurl.h
@@ -0,0 +1,507 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_GURL_H_
+#define URL_GURL_H_
+
+#include <stddef.h>
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+
+#include "polyfills/base/component_export.h"
+#include "polyfills/base/debug/alias.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_constants.h"
+
+// Represents a URL. GURL is Google's URL parsing library.
+//
+// A parsed canonicalized URL is guaranteed to be UTF-8. Any non-ASCII input
+// characters are UTF-8 encoded and % escaped to ASCII.
+//
+// The string representation of a URL is called the spec(). Getting the
+// spec will assert if the URL is invalid to help protect against malicious
+// URLs. If you want the "best effort" canonicalization of an invalid URL, you
+// can use possibly_invalid_spec(). Test validity with is_valid(). Data and
+// javascript URLs use GetContent() to extract the data.
+//
+// This class has existence checkers and getters for the various components of
+// a URL. Existence is different than being nonempty. "http://www.google.com/?"
+// has a query that just happens to be empty, and has_query() will return true
+// while the query getters will return the empty string.
+//
+// Prefer not to modify a URL using string operations (though sometimes this is
+// unavoidable). Instead, use ReplaceComponents which can replace or delete
+// multiple parts of a URL in one step, doesn't re-canonicalize unchanged
+// sections, and avoids some screw-ups. An example is creating a URL with a
+// path that contains a literal '#'. Using string concatenation will generate a
+// URL with a truncated path and a reference fragment, while ReplaceComponents
+// will know to escape this and produce the desired result.
+class COMPONENT_EXPORT(URL) GURL {
+ public:
+  typedef url::StringPieceReplacements<std::string> Replacements;
+  typedef url::StringPieceReplacements<gurl_base::string16> ReplacementsW;
+
+  // Creates an empty, invalid URL.
+  GURL();
+
+  // Copy construction is relatively inexpensive, with most of the time going
+  // to reallocating the string. It does not re-parse.
+  GURL(const GURL& other);
+  GURL(GURL&& other) noexcept;
+
+  // The strings to this constructor should be UTF-8 / UTF-16.
+  explicit GURL(gurl_base::StringPiece url_string);
+  explicit GURL(gurl_base::StringPiece16 url_string);
+
+  // Constructor for URLs that have already been parsed and canonicalized. This
+  // is used for conversions from KURL, for example. The caller must supply all
+  // information associated with the URL, which must be correct and consistent.
+  GURL(const char* canonical_spec,
+       size_t canonical_spec_len,
+       const url::Parsed& parsed,
+       bool is_valid);
+  // Notice that we take the canonical_spec by value so that we can convert
+  // from WebURL without copying the string. When we call this constructor
+  // we pass in a temporary std::string, which lets the compiler skip the
+  // copy and just move the std::string into the function argument. In the
+  // implementation, we use std::move to move the data into the GURL itself,
+  // which means we end up with zero copies.
+  GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid);
+
+  ~GURL();
+
+  GURL& operator=(const GURL& other);
+  GURL& operator=(GURL&& other) noexcept;
+
+  // Returns true when this object represents a valid parsed URL. When not
+  // valid, other functions will still succeed, but you will not get canonical
+  // data out in the format you may be expecting. Instead, we keep something
+  // "reasonable looking" so that the user can see how it's busted if
+  // displayed to them.
+  bool is_valid() const {
+    return is_valid_;
+  }
+
+  // Returns true if the URL is zero-length. Note that empty URLs are also
+  // invalid, and is_valid() will return false for them. This is provided
+  // because some users may want to treat the empty case differently.
+  bool is_empty() const {
+    return spec_.empty();
+  }
+
+  // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8,
+  // if the URL is valid. If the URL is not valid, this will assert and return
+  // the empty string (for safety in release builds, to keep them from being
+  // misused which might be a security problem).
+  //
+  // The URL will be ASCII (non-ASCII characters will be %-escaped UTF-8).
+  //
+  // The exception is for empty() URLs (which are !is_valid()) but this will
+  // return the empty string without asserting.
+  //
+  // Use possibly_invalid_spec() below to get the unusable spec of an invalid
+  // URL. This separation is designed to prevent errors that may cause security
+  // problems that could result from the mistaken use of an invalid URL.
+  const std::string& spec() const;
+
+  // Returns the potentially invalid spec for the URL. This spec MUST NOT be
+  // modified or sent over the network. It is designed to be displayed in error
+  // messages to the user, as the appearance of the spec may explain the error.
+  // If the spec is valid, the valid spec will be returned.
+  //
+  // The returned string is guaranteed to be valid UTF-8.
+  const std::string& possibly_invalid_spec() const {
+    return spec_;
+  }
+
+  // Getter for the raw parsed structure. This allows callers to locate parts
+  // of the URL within the spec themselves. Most callers should consider using
+  // the individual component getters below.
+  //
+  // The returned parsed structure will reference into the raw spec, which may
+  // or may not be valid. If you are using this to index into the spec, BE
+  // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you
+  // don't do anything "important" with invalid specs.
+  const url::Parsed& parsed_for_possibly_invalid_spec() const {
+    return parsed_;
+  }
+
+  // Allows GURL to be used as a key in STL (for example, std::set or std::map).
+  bool operator<(const GURL& other) const;
+  bool operator>(const GURL& other) const;
+
+  // Resolves a URL that's possibly relative to this object's URL, and returns
+  // it. Absolute URLs are also handled according to the rules of URLs on web
+  // pages.
+  //
+  // It may be impossible to resolve the URLs properly. If the input is not
+  // "standard" (IsStandard() == false) and the input looks relative, we can't
+  // resolve it. In these cases, the result will be an empty, invalid GURL.
+  //
+  // The result may also be a nonempty, invalid URL if the input has some kind
+  // of encoding error. In these cases, we will try to construct a "good" URL
+  // that may have meaning to the user, but it will be marked invalid.
+  //
+  // It is an error to resolve a URL relative to an invalid URL. The result
+  // will be the empty URL.
+  GURL Resolve(gurl_base::StringPiece relative) const;
+  GURL Resolve(gurl_base::StringPiece16 relative) const;
+
+  // Creates a new GURL by replacing the current URL's components with the
+  // supplied versions. See the Replacements class in url_canon.h for more.
+  //
+  // These are not particularly quick, so avoid doing mutations when possible.
+  // Prefer the 8-bit version when possible.
+  //
+  // It is an error to replace components of an invalid URL. The result will
+  // be the empty URL.
+  //
+  // Note that we use the more general url::Replacements type to give
+  // callers extra flexibility rather than our override.
+  GURL ReplaceComponents(const url::Replacements<char>& replacements) const;
+  GURL ReplaceComponents(
+      const url::Replacements<gurl_base::char16>& replacements) const;
+
+  // A helper function that is equivalent to replacing the path with a slash
+  // and clearing out everything after that. We sometimes need to know just the
+  // scheme and the authority. If this URL is not a standard URL (it doesn't
+  // have the regular authority and path sections), then the result will be
+  // an empty, invalid GURL. Note that this *does* work for file: URLs, which
+  // some callers may want to filter out before calling this.
+  //
+  // It is an error to get an empty path on an invalid URL. The result
+  // will be the empty URL.
+  GURL GetWithEmptyPath() const;
+
+  // A helper function to return a GURL without the filename, query values, and
+  // fragment. For example,
+  // GURL("https://www.foo.com/index.html?q=test").GetWithoutFilename().spec()
+  // will return "https://www.foo.com/".
+  // GURL("https://www.foo.com/bar/").GetWithoutFilename().spec()
+  // will return "https://www.foo.com/bar/". If the GURL is invalid or missing a
+  // scheme, authority or path, it will return an empty, invalid GURL.
+  GURL GetWithoutFilename() const;
+
+  // A helper function to return a GURL containing just the scheme, host,
+  // and port from a URL. Equivalent to clearing any username and password,
+  // replacing the path with a slash, and clearing everything after that. If
+  // this URL is not a standard URL, then the result will be an empty,
+  // invalid GURL. If the URL has neither username nor password, this
+  // degenerates to GetWithEmptyPath().
+  //
+  // It is an error to get the origin of an invalid URL. The result
+  // will be the empty URL.
+  GURL GetOrigin() const;
+
+  // A helper function to return a GURL stripped from the elements that are not
+  // supposed to be sent as HTTP referrer: username, password and ref fragment.
+  // For invalid URLs or URLs that have no valid referrers, an empty URL will be
+  // returned.
+  GURL GetAsReferrer() const;
+
+  // Returns true if the scheme for the current URL is a known "standard-format"
+  // scheme. A standard-format scheme adheres to what RFC 3986 calls "generic
+  // URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). This includes
+  // file: and filesystem:, which some callers may want to filter out explicitly
+  // by calling SchemeIsFile[System].
+  bool IsStandard() const;
+
+  // Returns true when the url is of the form about:blank, about:blank?foo or
+  // about:blank/#foo.
+  bool IsAboutBlank() const;
+
+  // Returns true when the url is of the form about:srcdoc, about:srcdoc?foo or
+  // about:srcdoc/#foo.
+  bool IsAboutSrcdoc() const;
+
+  // Returns true if the given parameter (should be lower-case ASCII to match
+  // the canonicalized scheme) is the scheme for this URL. Do not include a
+  // colon.
+  bool SchemeIs(gurl_base::StringPiece lower_ascii_scheme) const;
+
+  // Returns true if the scheme is "http" or "https".
+  bool SchemeIsHTTPOrHTTPS() const;
+
+  // Returns true if the scheme is valid for use as a referrer.
+  bool SchemeIsValidForReferrer() const;
+
+  // Returns true if the scheme is "ws" or "wss".
+  bool SchemeIsWSOrWSS() const;
+
+  // We often need to know if this is a file URL. File URLs are "standard", but
+  // are often treated separately by some programs.
+  bool SchemeIsFile() const {
+    return SchemeIs(url::kFileScheme);
+  }
+
+  // FileSystem URLs need to be treated differently in some cases.
+  bool SchemeIsFileSystem() const {
+    return SchemeIs(url::kFileSystemScheme);
+  }
+
+  // Returns true if the scheme indicates a network connection that uses TLS or
+  // some other cryptographic protocol (e.g. QUIC) for security.
+  //
+  // This function is not a complete test of whether or not an origin's code
+  // is minimally trustworthy. For that, see Chromium's |IsOriginSecure| for a
+  // higher-level and more complete semantics. See that function's documentation
+  // for more detail.
+  bool SchemeIsCryptographic() const;
+
+  // As above, but static. Parameter should be lower-case ASCII.
+  static bool SchemeIsCryptographic(gurl_base::StringPiece lower_ascii_scheme);
+
+  // Returns true if the scheme is "blob".
+  bool SchemeIsBlob() const {
+    return SchemeIs(url::kBlobScheme);
+  }
+
+  // For most URLs, the "content" is everything after the scheme (skipping the
+  // scheme delimiting colon) and before the fragment (skipping the fragment
+  // delimiting octothorpe). For javascript URLs the "content" also includes the
+  // fragment delimiter and fragment.
+  //
+  // It is an error to get the content of an invalid URL: the result will be an
+  // empty string.
+  std::string GetContent() const;
+
+  // Returns true if the hostname is an IP address. Note: this function isn't
+  // as cheap as a simple getter because it re-parses the hostname to verify.
+  bool HostIsIPAddress() const;
+
+  // Not including the colon. If you are comparing schemes, prefer SchemeIs.
+  bool has_scheme() const {
+    return parsed_.scheme.len >= 0;
+  }
+  std::string scheme() const {
+    return ComponentString(parsed_.scheme);
+  }
+  gurl_base::StringPiece scheme_piece() const {
+    return ComponentStringPiece(parsed_.scheme);
+  }
+
+  bool has_username() const {
+    return parsed_.username.len >= 0;
+  }
+  std::string username() const {
+    return ComponentString(parsed_.username);
+  }
+  gurl_base::StringPiece username_piece() const {
+    return ComponentStringPiece(parsed_.username);
+  }
+
+  bool has_password() const {
+    return parsed_.password.len >= 0;
+  }
+  std::string password() const {
+    return ComponentString(parsed_.password);
+  }
+  gurl_base::StringPiece password_piece() const {
+    return ComponentStringPiece(parsed_.password);
+  }
+
+  // The host may be a hostname, an IPv4 address, or an IPv6 literal surrounded
+  // by square brackets, like "[2001:db8::1]". To exclude these brackets, use
+  // HostNoBrackets() below.
+  bool has_host() const {
+    // Note that hosts are special, absence of host means length 0.
+    return parsed_.host.len > 0;
+  }
+  std::string host() const {
+    return ComponentString(parsed_.host);
+  }
+  gurl_base::StringPiece host_piece() const {
+    return ComponentStringPiece(parsed_.host);
+  }
+
+  // The port if one is explicitly specified. Most callers will want IntPort()
+  // or EffectiveIntPort() instead of these. The getters will not include the
+  // ':'.
+  bool has_port() const {
+    return parsed_.port.len >= 0;
+  }
+  std::string port() const {
+    return ComponentString(parsed_.port);
+  }
+  gurl_base::StringPiece port_piece() const {
+    return ComponentStringPiece(parsed_.port);
+  }
+
+  // Including first slash following host, up to the query. The URL
+  // "http://www.google.com/" has a path of "/".
+  bool has_path() const {
+    return parsed_.path.len >= 0;
+  }
+  std::string path() const {
+    return ComponentString(parsed_.path);
+  }
+  gurl_base::StringPiece path_piece() const {
+    return ComponentStringPiece(parsed_.path);
+  }
+
+  // Stuff following '?' up to the ref. The getters will not include the '?'.
+  bool has_query() const {
+    return parsed_.query.len >= 0;
+  }
+  std::string query() const {
+    return ComponentString(parsed_.query);
+  }
+  gurl_base::StringPiece query_piece() const {
+    return ComponentStringPiece(parsed_.query);
+  }
+
+  // Stuff following '#' to the end of the string. This will be %-escaped UTF-8.
+  // The getters will not include the '#'.
+  bool has_ref() const {
+    return parsed_.ref.len >= 0;
+  }
+  std::string ref() const {
+    return ComponentString(parsed_.ref);
+  }
+  gurl_base::StringPiece ref_piece() const {
+    return ComponentStringPiece(parsed_.ref);
+  }
+
+  // Returns a parsed version of the port. Can also be any of the special
+  // values defined in Parsed for ExtractPort.
+  int IntPort() const;
+
+  // Returns the port number of the URL, or the default port number.
+  // If the scheme has no concept of port (or unknown default) returns
+  // PORT_UNSPECIFIED.
+  int EffectiveIntPort() const;
+
+  // Extracts the filename portion of the path and returns it. The filename
+  // is everything after the last slash in the path. This may be empty.
+  std::string ExtractFileName() const;
+
+  // Returns the path that should be sent to the server. This is the path,
+  // parameter, and query portions of the URL. It is guaranteed to be ASCII.
+  std::string PathForRequest() const;
+
+  // Returns the host, excluding the square brackets surrounding IPv6 address
+  // literals. This can be useful for passing to getaddrinfo().
+  std::string HostNoBrackets() const;
+
+  // Returns the same characters as HostNoBrackets(), avoiding a copy.
+  gurl_base::StringPiece HostNoBracketsPiece() const;
+
+  // Returns true if this URL's host matches or is in the same domain as
+  // the given input string. For example, if the hostname of the URL is
+  // "www.google.com", this will return true for "com", "google.com", and
+  // "www.google.com".
+  //
+  // The input domain should match host canonicalization rules. i.e. the input
+  // should be lowercase except for escape chars.
+  //
+  // This call is more efficient than getting the host and checking whether the
+  // host has the specific domain or not because no copies or object
+  // constructions are done.
+  bool DomainIs(gurl_base::StringPiece canonical_domain) const;
+
+  // Checks whether or not two URLs differ only in the ref (the part after
+  // the # character).
+  bool EqualsIgnoringRef(const GURL& other) const;
+
+  // Swaps the contents of this GURL object with |other|, without doing
+  // any memory allocations.
+  void Swap(GURL* other);
+
+  // Returns a reference to a singleton empty GURL. This object is for callers
+  // who return references but don't have anything to return in some cases.
+  // If you just want an empty URL for normal use, prefer GURL(). This function
+  // may be called from any thread.
+  static const GURL& EmptyGURL();
+
+  // Returns the inner URL of a nested URL (currently only non-null for
+  // filesystem URLs).
+  //
+  // TODO(mmenke): inner_url().spec() currently returns the same value as
+  // calling spec() on the GURL itself. This should be fixed.
+  // See https://crbug.com/619596
+  const GURL* inner_url() const {
+    return inner_url_.get();
+  }
+
+  // Estimates dynamic memory usage.
+  // See base/trace_event/memory_usage_estimator.h for more info.
+  size_t EstimateMemoryUsage() const;
+
+ private:
+  // Variant of the string parsing constructor that allows the caller to elect
+  // to retain trailing whitespace, if any, on the passed URL spec, but only if
+  // the scheme is one that allows trailing whitespace. The primary use-case is
+  // for data: URLs. In most cases, you want to use the single parameter
+  // constructor above.
+  enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
+  GURL(const std::string& url_string, RetainWhiteSpaceSelector);
+
+  template<typename STR>
+  void InitCanonical(gurl_base::BasicStringPiece<STR> input_spec,
+                     bool trim_path_end);
+
+  void InitializeFromCanonicalSpec();
+
+  // Helper used by IsAboutBlank and IsAboutSrcdoc.
+  bool IsAboutUrl(gurl_base::StringPiece allowed_path) const;
+
+  // Returns the substring of the input identified by the given component.
+  std::string ComponentString(const url::Component& comp) const {
+    if (comp.len <= 0)
+      return std::string();
+    return std::string(spec_, comp.begin, comp.len);
+  }
+  gurl_base::StringPiece ComponentStringPiece(const url::Component& comp) const {
+    if (comp.len <= 0)
+      return gurl_base::StringPiece();
+    return gurl_base::StringPiece(&spec_[comp.begin], comp.len);
+  }
+
+  // The actual text of the URL, in canonical ASCII form.
+  std::string spec_;
+
+  // Set when the given URL is valid. Otherwise, we may still have a spec and
+  // components, but they may not identify valid resources (for example, an
+  // invalid port number, invalid characters in the scheme, etc.).
+  bool is_valid_;
+
+  // Identified components of the canonical spec.
+  url::Parsed parsed_;
+
+  // Used for nested schemes [currently only filesystem:].
+  std::unique_ptr<GURL> inner_url_;
+};
+
+// Stream operator so GURL can be used in assertion statements.
+COMPONENT_EXPORT(URL)
+std::ostream& operator<<(std::ostream& out, const GURL& url);
+
+COMPONENT_EXPORT(URL) bool operator==(const GURL& x, const GURL& y);
+COMPONENT_EXPORT(URL) bool operator!=(const GURL& x, const GURL& y);
+
+// Equality operator for comparing raw spec_. This should be used in place of
+// url == GURL(spec) where |spec| is known (i.e. constants). This is to prevent
+// needlessly re-parsing |spec| into a temporary GURL.
+COMPONENT_EXPORT(URL)
+bool operator==(const GURL& x, const gurl_base::StringPiece& spec);
+COMPONENT_EXPORT(URL)
+bool operator==(const gurl_base::StringPiece& spec, const GURL& x);
+COMPONENT_EXPORT(URL)
+bool operator!=(const GURL& x, const gurl_base::StringPiece& spec);
+COMPONENT_EXPORT(URL)
+bool operator!=(const gurl_base::StringPiece& spec, const GURL& x);
+
+// DEBUG_ALIAS_FOR_GURL(var_name, url) copies |url| into a new stack-allocated
+// variable named |<var_name>|. This helps ensure that the value of |url| gets
+// preserved in crash dumps.
+#define DEBUG_ALIAS_FOR_GURL(var_name, url) \
+  DEBUG_ALIAS_FOR_CSTR(var_name, (url).possibly_invalid_spec().c_str(), 128)
+
+#endif  // URL_GURL_H_
diff --git a/url/gurl_fuzzer.cc b/url/gurl_fuzzer.cc new file mode 100644 index 0000000..71f3540 --- /dev/null +++ b/url/gurl_fuzzer.cc
@@ -0,0 +1,85 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <string>
+
+#include "base/at_exit.h"
+#include "base/i18n/icu_util.h"
+#include "url/gurl.h"
+
+// Fixture whose constructor initializes ICU; the result of the initialization
+// is verified with GURL_CHECK.
+struct TestCase {
+  TestCase() { GURL_CHECK(gurl_base::i18n::InitializeICU()); }
+
+  // used by ICU integration.
+  gurl_base::AtExitManager at_exit_manager;
+};
+
+// Created during static initialization and never deleted.
+TestCase* test_case = new TestCase();
+
+namespace {
+
+// Copies |length| UTF-16 code units starting at |bytes| into a string16. The
+// bytes are transferred with memcpy() instead of being read through a char16
+// pointer, so the result does not depend on how |bytes| is aligned.
+gurl_base::string16 CopyToString16(const uint8_t* bytes, size_t length) {
+  gurl_base::string16 result;
+  result.resize(length);
+  if (length > 0)
+    memcpy(&result[0], bytes, length * sizeof(gurl_base::char16));
+  return result;
+}
+
+}  // namespace
+
+// Entry point for LibFuzzer. Parses the input as an 8-bit URL, as a UTF-16 URL
+// when the byte count is even, and then uses the leading sizeof(size_t) bytes
+// to split the rest into a relative reference and a base URL for Resolve().
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  if (size < 1)
+    return 0;
+
+  gurl_base::StringPiece string_piece_input(reinterpret_cast<const char*>(data),
+                                            size);
+  GURL url_from_string_piece(string_piece_input);
+
+  // Test for StringPiece16 if size is even.
+  if (size % 2 == 0) {
+    const gurl_base::string16 input16 = CopyToString16(data, size / 2);
+    gurl_base::StringPiece16 string_piece_input16(input16);
+
+    GURL url_from_string_piece16(string_piece_input16);
+  }
+
+  // Resolve relative url tests.
+  const size_t size_t_bytes = sizeof(size_t);
+  if (size < size_t_bytes + 1) {
+    return 0;
+  }
+  // Load the split seed with memcpy(): dereferencing |data| as a size_t* would
+  // read a byte buffer through an incompatible, possibly misaligned pointer.
+  size_t split_seed = 0;
+  memcpy(&split_seed, data, size_t_bytes);
+  const size_t relative_size = split_seed % (size - size_t_bytes);
+  std::string relative_string(
+      reinterpret_cast<const char*>(data + size_t_bytes), relative_size);
+  gurl_base::StringPiece string_piece_part_input(
+      reinterpret_cast<const char*>(data + size_t_bytes + relative_size),
+      size - relative_size - size_t_bytes);
+  GURL url_from_string_piece_part(string_piece_part_input);
+  url_from_string_piece_part.Resolve(relative_string);
+
+  if (relative_size % 2 == 0) {
+    const gurl_base::string16 relative_string16 =
+        CopyToString16(data + size_t_bytes, relative_size / 2);
+    url_from_string_piece_part.Resolve(relative_string16);
+  }
+  return 0;
+}
diff --git a/url/gurl_unittest.cc b/url/gurl_unittest.cc new file mode 100644 index 0000000..0375eae --- /dev/null +++ b/url/gurl_unittest.cc
@@ -0,0 +1,963 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <stddef.h> + +#include "base/stl_util.h" +#include "base/strings/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" +#include "url/url_canon.h" +#include "url/url_test_utils.h" + +namespace url { + +namespace { + +template<typename CHAR> +void SetupReplacement( + void (Replacements<CHAR>::*func)(const CHAR*, const Component&), + Replacements<CHAR>* replacements, + const CHAR* str) { + if (str) { + Component comp; + if (str[0]) + comp.len = static_cast<int>(strlen(str)); + (replacements->*func)(str, comp); + } +} + +// Returns the canonicalized string for the given URL string for the +// GURLTest.Types test. +std::string TypesTestCase(const char* src) { + GURL gurl(src); + return gurl.possibly_invalid_spec(); +} + +} // namespace + +// Different types of URLs should be handled differently, and handed off to +// different canonicalizers. +TEST(GURLTest, Types) { + // URLs with unknown schemes should be treated as path URLs, even when they + // have things like "://". + EXPECT_EQ("something:///HOSTNAME.com/", + TypesTestCase("something:///HOSTNAME.com/")); + + // Conversely, URLs with known schemes should always trigger standard URL + // handling. + EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com")); + EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com")); + EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com")); + EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com")); + +#ifdef WIN32 + // URLs that look like Windows absolute path specs. 
+ EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt")); + EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt")); + EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt")); + EXPECT_EQ("file://server/foo.txt", TypesTestCase("//server/foo.txt")); +#endif +} + +// Test the basic creation and querying of components in a GURL. We assume that +// the parser is already tested and works, so we are mostly interested if the +// object does the right thing with the results. +TEST(GURLTest, Components) { + GURL empty_url(gurl_base::UTF8ToUTF16("")); + EXPECT_TRUE(empty_url.is_empty()); + EXPECT_FALSE(empty_url.is_valid()); + + GURL url(gurl_base::UTF8ToUTF16("http://user:pass@google.com:99/foo;bar?q=a#ref")); + EXPECT_FALSE(url.is_empty()); + EXPECT_TRUE(url.is_valid()); + EXPECT_TRUE(url.SchemeIs("http")); + EXPECT_FALSE(url.SchemeIsFile()); + + // This is the narrow version of the URL, which should match the wide input. + EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url.spec()); + + EXPECT_EQ("http", url.scheme()); + EXPECT_EQ("user", url.username()); + EXPECT_EQ("pass", url.password()); + EXPECT_EQ("google.com", url.host()); + EXPECT_EQ("99", url.port()); + EXPECT_EQ(99, url.IntPort()); + EXPECT_EQ("/foo;bar", url.path()); + EXPECT_EQ("q=a", url.query()); + EXPECT_EQ("ref", url.ref()); + + // Test parsing userinfo with special characters. + GURL url_special_pass("http://user:%40!$&'()*+,;=:@google.com:12345"); + EXPECT_TRUE(url_special_pass.is_valid()); + // GURL canonicalizes some delimiters. 
+ EXPECT_EQ("%40!$&%27()*+,%3B%3D%3A", url_special_pass.password()); + EXPECT_EQ("google.com", url_special_pass.host()); + EXPECT_EQ("12345", url_special_pass.port()); +} + +TEST(GURLTest, Empty) { + GURL url; + EXPECT_FALSE(url.is_valid()); + EXPECT_EQ("", url.spec()); + + EXPECT_EQ("", url.scheme()); + EXPECT_EQ("", url.username()); + EXPECT_EQ("", url.password()); + EXPECT_EQ("", url.host()); + EXPECT_EQ("", url.port()); + EXPECT_EQ(PORT_UNSPECIFIED, url.IntPort()); + EXPECT_EQ("", url.path()); + EXPECT_EQ("", url.query()); + EXPECT_EQ("", url.ref()); +} + +TEST(GURLTest, Copy) { + GURL url(gurl_base::UTF8ToUTF16( + "http://user:pass@google.com:99/foo;bar?q=a#ref")); + + GURL url2(url); + EXPECT_TRUE(url2.is_valid()); + + EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec()); + EXPECT_EQ("http", url2.scheme()); + EXPECT_EQ("user", url2.username()); + EXPECT_EQ("pass", url2.password()); + EXPECT_EQ("google.com", url2.host()); + EXPECT_EQ("99", url2.port()); + EXPECT_EQ(99, url2.IntPort()); + EXPECT_EQ("/foo;bar", url2.path()); + EXPECT_EQ("q=a", url2.query()); + EXPECT_EQ("ref", url2.ref()); + + // Copying of invalid URL should be invalid + GURL invalid; + GURL invalid2(invalid); + EXPECT_FALSE(invalid2.is_valid()); + EXPECT_EQ("", invalid2.spec()); + EXPECT_EQ("", invalid2.scheme()); + EXPECT_EQ("", invalid2.username()); + EXPECT_EQ("", invalid2.password()); + EXPECT_EQ("", invalid2.host()); + EXPECT_EQ("", invalid2.port()); + EXPECT_EQ(PORT_UNSPECIFIED, invalid2.IntPort()); + EXPECT_EQ("", invalid2.path()); + EXPECT_EQ("", invalid2.query()); + EXPECT_EQ("", invalid2.ref()); +} + +TEST(GURLTest, Assign) { + GURL url(gurl_base::UTF8ToUTF16( + "http://user:pass@google.com:99/foo;bar?q=a#ref")); + + GURL url2; + url2 = url; + EXPECT_TRUE(url2.is_valid()); + + EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec()); + EXPECT_EQ("http", url2.scheme()); + EXPECT_EQ("user", url2.username()); + EXPECT_EQ("pass", url2.password()); + 
EXPECT_EQ("google.com", url2.host()); + EXPECT_EQ("99", url2.port()); + EXPECT_EQ(99, url2.IntPort()); + EXPECT_EQ("/foo;bar", url2.path()); + EXPECT_EQ("q=a", url2.query()); + EXPECT_EQ("ref", url2.ref()); + + // Assignment of invalid URL should be invalid + GURL invalid; + GURL invalid2; + invalid2 = invalid; + EXPECT_FALSE(invalid2.is_valid()); + EXPECT_EQ("", invalid2.spec()); + EXPECT_EQ("", invalid2.scheme()); + EXPECT_EQ("", invalid2.username()); + EXPECT_EQ("", invalid2.password()); + EXPECT_EQ("", invalid2.host()); + EXPECT_EQ("", invalid2.port()); + EXPECT_EQ(PORT_UNSPECIFIED, invalid2.IntPort()); + EXPECT_EQ("", invalid2.path()); + EXPECT_EQ("", invalid2.query()); + EXPECT_EQ("", invalid2.ref()); +} + +// This is a regression test for http://crbug.com/309975. +TEST(GURLTest, SelfAssign) { + GURL a("filesystem:http://example.com/temporary/"); + // This should not crash. + a = *&a; // The *& defeats Clang's -Wself-assign warning. +} + +TEST(GURLTest, CopyFileSystem) { + GURL url(gurl_base::UTF8ToUTF16( + "filesystem:https://user:pass@google.com:99/t/foo;bar?q=a#ref")); + + GURL url2(url); + EXPECT_TRUE(url2.is_valid()); + + EXPECT_EQ("filesystem:https://google.com:99/t/foo;bar?q=a#ref", url2.spec()); + EXPECT_EQ("filesystem", url2.scheme()); + EXPECT_EQ("", url2.username()); + EXPECT_EQ("", url2.password()); + EXPECT_EQ("", url2.host()); + EXPECT_EQ("", url2.port()); + EXPECT_EQ(PORT_UNSPECIFIED, url2.IntPort()); + EXPECT_EQ("/foo;bar", url2.path()); + EXPECT_EQ("q=a", url2.query()); + EXPECT_EQ("ref", url2.ref()); + + const GURL* inner = url2.inner_url(); + ASSERT_TRUE(inner); + EXPECT_EQ("https", inner->scheme()); + EXPECT_EQ("", inner->username()); + EXPECT_EQ("", inner->password()); + EXPECT_EQ("google.com", inner->host()); + EXPECT_EQ("99", inner->port()); + EXPECT_EQ(99, inner->IntPort()); + EXPECT_EQ("/t", inner->path()); + EXPECT_EQ("", inner->query()); + EXPECT_EQ("", inner->ref()); +} + +TEST(GURLTest, IsValid) { + const char* valid_cases[] = { + 
"http://google.com", + "unknown://google.com", + "http://user:pass@google.com", + "http://google.com:12345", + "http://google.com/path", + "http://google.com//path", + "http://google.com?k=v#fragment", + "http://user:pass@google.com:12345/path?k=v#fragment", + "http:/path", + "http:path", + }; + for (size_t i = 0; i < gurl_base::size(valid_cases); i++) { + EXPECT_TRUE(GURL(valid_cases[i]).is_valid()) + << "Case: " << valid_cases[i]; + } + + const char* invalid_cases[] = { + "http://?k=v", + "http:://google.com", + "http//google.com", + "http://google.com:12three45", + "://google.com", + "path", + }; + for (size_t i = 0; i < gurl_base::size(invalid_cases); i++) { + EXPECT_FALSE(GURL(invalid_cases[i]).is_valid()) + << "Case: " << invalid_cases[i]; + } +} + +TEST(GURLTest, ExtraSlashesBeforeAuthority) { + // According to RFC3986, the hierarchical part for URI with an authority + // must use only two slashes; GURL intentionally just ignores extra slashes + // if there are more than 2, and parses the following part as an authority. + GURL url("http:///host"); + EXPECT_EQ("host", url.host()); + EXPECT_EQ("/", url.path()); +} + +// Given an invalid URL, we should still get most of the components. +TEST(GURLTest, ComponentGettersWorkEvenForInvalidURL) { + GURL url("http:google.com:foo"); + EXPECT_FALSE(url.is_valid()); + EXPECT_EQ("http://google.com:foo/", url.possibly_invalid_spec()); + + EXPECT_EQ("http", url.scheme()); + EXPECT_EQ("", url.username()); + EXPECT_EQ("", url.password()); + EXPECT_EQ("google.com", url.host()); + EXPECT_EQ("foo", url.port()); + EXPECT_EQ(PORT_INVALID, url.IntPort()); + EXPECT_EQ("/", url.path()); + EXPECT_EQ("", url.query()); + EXPECT_EQ("", url.ref()); +} + +TEST(GURLTest, Resolve) { + // The tricky cases for relative URL resolving are tested in the + // canonicalizer unit test. Here, we just test that the GURL integration + // works properly. 
+ struct ResolveCase { + const char* base; + const char* relative; + bool expected_valid; + const char* expected; + } resolve_cases[] = { + {"http://www.google.com/", "foo.html", true, "http://www.google.com/foo.html"}, + {"http://www.google.com/foo/", "bar", true, "http://www.google.com/foo/bar"}, + {"http://www.google.com/foo/", "/bar", true, "http://www.google.com/bar"}, + {"http://www.google.com/foo", "bar", true, "http://www.google.com/bar"}, + {"http://www.google.com/", "http://images.google.com/foo.html", true, "http://images.google.com/foo.html"}, + {"http://www.google.com/", "http://images.\tgoogle.\ncom/\rfoo.html", true, "http://images.google.com/foo.html"}, + {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"}, + {"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"}, + {"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"}, + // A non-standard base can be replaced with a standard absolute URL. + {"data:blahblah", "http://google.com/", true, "http://google.com/"}, + {"data:blahblah", "http:google.com", true, "http://google.com/"}, + // Filesystem URLs have different paths to test. + {"filesystem:http://www.google.com/type/", "foo.html", true, "filesystem:http://www.google.com/type/foo.html"}, + {"filesystem:http://www.google.com/type/", "../foo.html", true, "filesystem:http://www.google.com/type/foo.html"}, + }; + + for (size_t i = 0; i < gurl_base::size(resolve_cases); i++) { + // 8-bit code path. + GURL input(resolve_cases[i].base); + GURL output = input.Resolve(resolve_cases[i].relative); + EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()) << i; + EXPECT_EQ(resolve_cases[i].expected, output.spec()) << i; + EXPECT_EQ(output.SchemeIsFileSystem(), output.inner_url() != NULL); + + // Wide code path. 
+ GURL inputw(gurl_base::UTF8ToUTF16(resolve_cases[i].base)); + /* Resolve from the UTF-16-constructed base so that the wide constructor and the wide Resolve() overload are exercised together. */ + GURL outputw = + inputw.Resolve(gurl_base::UTF8ToUTF16(resolve_cases[i].relative)); + EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()) << i; + EXPECT_EQ(resolve_cases[i].expected, outputw.spec()) << i; + EXPECT_EQ(outputw.SchemeIsFileSystem(), outputw.inner_url() != NULL); + } +} + +TEST(GURLTest, GetOrigin) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"javascript:window.alert(\"hello,world\");", ""}, + {"http://user:pass@www.google.com:21/blah#baz", + "http://www.google.com:21/"}, + {"http://user@www.google.com", "http://www.google.com/"}, + {"http://:pass@www.google.com", "http://www.google.com/"}, + {"http://:@www.google.com", "http://www.google.com/"}, + {"filesystem:http://www.google.com/temp/foo?q#b", + "http://www.google.com/"}, + {"filesystem:http://user:pass@google.com:21/blah#baz", + "http://google.com:21/"}, + {"blob:null/guid-goes-here", ""}, + {"blob:http://origin/guid-goes-here", "" /* should be http://origin/ */}, + }; + for (size_t i = 0; i < gurl_base::size(cases); i++) { + GURL url(cases[i].input); + GURL origin = url.GetOrigin(); + EXPECT_EQ(cases[i].expected, origin.spec()); + } +} + +TEST(GURLTest, GetAsReferrer) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/blah"}, + {"http://user@www.google.com", "http://www.google.com/"}, + {"http://:pass@www.google.com", "http://www.google.com/"}, + {"http://:@www.google.com", "http://www.google.com/"}, + {"http://www.google.com/temp/foo?q#b", "http://www.google.com/temp/foo?q"}, + {"not a url", ""}, + {"unknown-scheme://foo.html", ""}, + {"file:///tmp/test.html", ""}, + {"https://www.google.com", "https://www.google.com/"}, + }; + for (size_t i = 0; i <
gurl_base::size(cases); i++) { + GURL url(cases[i].input); + GURL origin = url.GetAsReferrer(); + EXPECT_EQ(cases[i].expected, origin.spec()); + } +} + +TEST(GURLTest, GetWithEmptyPath) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"javascript:window.alert(\"hello, world\");", ""}, + {"http://www.google.com/foo/bar.html?baz=22", "http://www.google.com/"}, + {"filesystem:http://www.google.com/temporary/bar.html?baz=22", "filesystem:http://www.google.com/temporary/"}, + {"filesystem:file:///temporary/bar.html?baz=22", "filesystem:file:///temporary/"}, + }; + + for (size_t i = 0; i < gurl_base::size(cases); i++) { + GURL url(cases[i].input); + GURL empty_path = url.GetWithEmptyPath(); + EXPECT_EQ(cases[i].expected, empty_path.spec()); + } +} + +TEST(GURLTest, GetWithoutFilename) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + // Common Standard URLs. + {"https://www.google.com", "https://www.google.com/"}, + {"https://www.google.com/", "https://www.google.com/"}, + {"https://www.google.com/maps.htm", "https://www.google.com/"}, + {"https://www.google.com/maps/", "https://www.google.com/maps/"}, + {"https://www.google.com/index.html", "https://www.google.com/"}, + {"https://www.google.com/index.html?q=maps", "https://www.google.com/"}, + {"https://www.google.com/index.html#maps/", "https://www.google.com/"}, + {"https://foo:bar@www.google.com/maps.htm", "https://foo:bar@www.google.com/"}, + {"https://www.google.com/maps/au/index.html", "https://www.google.com/maps/au/"}, + {"https://www.google.com/maps/au/north", "https://www.google.com/maps/au/"}, + {"https://www.google.com/maps/au/north/", "https://www.google.com/maps/au/north/"}, + {"https://www.google.com/maps/au/index.html?q=maps#fragment/", "https://www.google.com/maps/au/"}, + {"http://www.google.com:8000/maps/au/index.html?q=maps#fragment/", "http://www.google.com:8000/maps/au/"}, 
+ {"https://www.google.com/maps/au/north/?q=maps#fragment", "https://www.google.com/maps/au/north/"}, + {"https://www.google.com/maps/au/north?q=maps#fragment", "https://www.google.com/maps/au/"}, + // Less common standard URLs. + {"filesystem:http://www.google.com/temporary/bar.html?baz=22", "filesystem:http://www.google.com/temporary/"}, + {"file:///temporary/bar.html?baz=22","file:///temporary/"}, + {"ftp://foo/test/index.html", "ftp://foo/test/"}, + {"gopher://foo/test/index.html", "gopher://foo/test/"}, + {"ws://foo/test/index.html", "ws://foo/test/"}, + // Non-standard, hierarchical URLs. + {"chrome://foo/bar.html", "chrome://foo/"}, + {"httpa://foo/test/index.html", "httpa://foo/test/"}, + // Non-standard, non-hierarchical URLs. + {"blob:https://foo.bar/test/index.html", ""}, + {"about:blank", ""}, + {"data:foobar", ""}, + {"scheme:opaque_data", ""}, + // Invalid URLs. + {"foobar", ""}, + }; + + for (size_t i = 0; i < gurl_base::size(cases); i++) { + GURL url(cases[i].input); + GURL without_filename = url.GetWithoutFilename(); + EXPECT_EQ(cases[i].expected, without_filename.spec()) << i; + } +} + +TEST(GURLTest, Replacements) { + // The URL canonicalizer replacement test will handle most of these case. + // The most important thing to do here is to check that the proper + // canonicalizer gets called based on the scheme of the input. 
+ struct ReplaceCase { + const char* base; + const char* scheme; + const char* username; + const char* password; + const char* host; + const char* port; + const char* path; + const char* query; + const char* ref; + const char* expected; + } replace_cases[] = { + {"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, + NULL, "/", "", "", "http://www.google.com/"}, + {"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "", + "", "window.open('foo');", "", "", "javascript:window.open('foo');"}, + {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", + "/foo", "search", "ref", "http://www.google.com:99/foo?search#ref"}, +#ifdef WIN32 + {"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "", + "c:\\", "", "", "file:///C:/"}, +#endif + {"filesystem:http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, + NULL, NULL, NULL, "/", "", "", "filesystem:http://www.google.com/foo/"}, + // Lengthen the URL instead of shortening it, to test creation of + // inner_url. 
+ {"filesystem:http://www.google.com/foo/", NULL, NULL, NULL, NULL, NULL, + "bar.html", "foo", "bar", + "filesystem:http://www.google.com/foo/bar.html?foo#bar"}, + }; + + for (size_t i = 0; i < gurl_base::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + GURL url(cur.base); + GURL::Replacements repl; + SetupReplacement(&GURL::Replacements::SetScheme, &repl, cur.scheme); + SetupReplacement(&GURL::Replacements::SetUsername, &repl, cur.username); + SetupReplacement(&GURL::Replacements::SetPassword, &repl, cur.password); + SetupReplacement(&GURL::Replacements::SetHost, &repl, cur.host); + SetupReplacement(&GURL::Replacements::SetPort, &repl, cur.port); + SetupReplacement(&GURL::Replacements::SetPath, &repl, cur.path); + SetupReplacement(&GURL::Replacements::SetQuery, &repl, cur.query); + SetupReplacement(&GURL::Replacements::SetRef, &repl, cur.ref); + GURL output = url.ReplaceComponents(repl); + + EXPECT_EQ(replace_cases[i].expected, output.spec()); + + EXPECT_EQ(output.SchemeIsFileSystem(), output.inner_url() != NULL); + if (output.SchemeIsFileSystem()) { + // TODO(mmenke): inner_url()->spec() is currently the same as the spec() + // for the GURL itself. This should be fixed. + // See https://crbug.com/619596 + EXPECT_EQ(replace_cases[i].expected, output.inner_url()->spec()); + } + } +} + +TEST(GURLTest, ClearFragmentOnDataUrl) { + // http://crbug.com/291747 - a data URL may legitimately have trailing + // whitespace in the spec after the ref is cleared. Test this does not trigger + // the Parsed importing validation GURL_DCHECK in GURL. + GURL url(" data: one ? two # three "); + + // By default the trailing whitespace will have been stripped. + EXPECT_EQ("data: one ? two # three", url.spec()); + GURL::Replacements repl; + repl.ClearRef(); + GURL url_no_ref = url.ReplaceComponents(repl); + + EXPECT_EQ("data: one ? two ", url_no_ref.spec()); + + // Importing a parsed URL via this constructor overload will retain trailing + // whitespace. 
+ GURL import_url(url_no_ref.spec(), + url_no_ref.parsed_for_possibly_invalid_spec(), + url_no_ref.is_valid()); + EXPECT_EQ(url_no_ref, import_url); + EXPECT_EQ(import_url.query(), " two "); +} + +TEST(GURLTest, PathForRequest) { + struct TestCase { + const char* input; + const char* expected; + const char* inner_expected; + } cases[] = { + {"http://www.google.com", "/", NULL}, + {"http://www.google.com/", "/", NULL}, + {"http://www.google.com/foo/bar.html?baz=22", "/foo/bar.html?baz=22", NULL}, + {"http://www.google.com/foo/bar.html#ref", "/foo/bar.html", NULL}, + {"http://www.google.com/foo/bar.html?query#ref", "/foo/bar.html?query", NULL}, + {"filesystem:http://www.google.com/temporary/foo/bar.html?query#ref", "/foo/bar.html?query", "/temporary"}, + {"filesystem:http://www.google.com/temporary/foo/bar.html?query", "/foo/bar.html?query", "/temporary"}, + }; + + for (size_t i = 0; i < gurl_base::size(cases); i++) { + GURL url(cases[i].input); + std::string path_request = url.PathForRequest(); + EXPECT_EQ(cases[i].expected, path_request); + EXPECT_EQ(cases[i].inner_expected == NULL, url.inner_url() == NULL); + if (url.inner_url() && cases[i].inner_expected) + EXPECT_EQ(cases[i].inner_expected, url.inner_url()->PathForRequest()); + } +} + +TEST(GURLTest, EffectiveIntPort) { + struct PortTest { + const char* spec; + int expected_int_port; + } port_tests[] = { + // http + {"http://www.google.com/", 80}, + {"http://www.google.com:80/", 80}, + {"http://www.google.com:443/", 443}, + + // https + {"https://www.google.com/", 443}, + {"https://www.google.com:443/", 443}, + {"https://www.google.com:80/", 80}, + + // ftp + {"ftp://www.google.com/", 21}, + {"ftp://www.google.com:21/", 21}, + {"ftp://www.google.com:80/", 80}, + + // gopher + {"gopher://www.google.com/", 70}, + {"gopher://www.google.com:70/", 70}, + {"gopher://www.google.com:80/", 80}, + + // file - no port + {"file://www.google.com/", PORT_UNSPECIFIED}, + {"file://www.google.com:443/", PORT_UNSPECIFIED}, + + // 
data - no port + {"data:www.google.com:90", PORT_UNSPECIFIED}, + {"data:www.google.com", PORT_UNSPECIFIED}, + + // filesystem - no port + {"filesystem:http://www.google.com:90/t/foo", PORT_UNSPECIFIED}, + {"filesystem:file:///t/foo", PORT_UNSPECIFIED}, + }; + + for (size_t i = 0; i < gurl_base::size(port_tests); i++) { + GURL url(port_tests[i].spec); + EXPECT_EQ(port_tests[i].expected_int_port, url.EffectiveIntPort()); + } +} + +TEST(GURLTest, IPAddress) { + struct IPTest { + const char* spec; + bool expected_ip; + } ip_tests[] = { + {"http://www.google.com/", false}, + {"http://192.168.9.1/", true}, + {"http://192.168.9.1.2/", false}, + {"http://192.168.m.1/", false}, + {"http://2001:db8::1/", false}, + {"http://[2001:db8::1]/", true}, + {"", false}, + {"some random input!", false}, + }; + + for (size_t i = 0; i < gurl_base::size(ip_tests); i++) { + GURL url(ip_tests[i].spec); + EXPECT_EQ(ip_tests[i].expected_ip, url.HostIsIPAddress()); + } +} + +TEST(GURLTest, HostNoBrackets) { + struct TestCase { + const char* input; + const char* expected_host; + const char* expected_plainhost; + } cases[] = { + {"http://www.google.com", "www.google.com", "www.google.com"}, + {"http://[2001:db8::1]/", "[2001:db8::1]", "2001:db8::1"}, + {"http://[::]/", "[::]", "::"}, + + // Don't require a valid URL, but don't crash either. + {"http://[]/", "[]", ""}, + {"http://[x]/", "[x]", "x"}, + {"http://[x/", "[x", "[x"}, + {"http://x]/", "x]", "x]"}, + {"http://[/", "[", "["}, + {"http://]/", "]", "]"}, + {"", "", ""}, + }; + for (size_t i = 0; i < gurl_base::size(cases); i++) { + GURL url(cases[i].input); + EXPECT_EQ(cases[i].expected_host, url.host()); + EXPECT_EQ(cases[i].expected_plainhost, url.HostNoBrackets()); + EXPECT_EQ(cases[i].expected_plainhost, url.HostNoBracketsPiece()); + } +} + +TEST(GURLTest, DomainIs) { + GURL url_1("http://google.com/foo"); + EXPECT_TRUE(url_1.DomainIs("google.com")); + + // Subdomain and port are ignored. 
+ GURL url_2("http://www.google.com:99/foo"); + EXPECT_TRUE(url_2.DomainIs("google.com")); + + // Different top-level domain. + GURL url_3("http://www.google.com.cn/foo"); + EXPECT_FALSE(url_3.DomainIs("google.com")); + + // Different host name. + GURL url_4("http://www.iamnotgoogle.com/foo"); + EXPECT_FALSE(url_4.DomainIs("google.com")); + + // The input must be lower-cased otherwise DomainIs returns false. + GURL url_5("http://www.google.com/foo"); + EXPECT_FALSE(url_5.DomainIs("Google.com")); + + // If the URL is invalid, DomainIs returns false. + GURL invalid_url("google.com"); + EXPECT_FALSE(invalid_url.is_valid()); + EXPECT_FALSE(invalid_url.DomainIs("google.com")); + + GURL url_with_escape_chars("https://www.,.test"); + EXPECT_TRUE(url_with_escape_chars.is_valid()); + EXPECT_EQ(url_with_escape_chars.host(), "www.%2C.test"); + EXPECT_TRUE(url_with_escape_chars.DomainIs("%2C.test")); +} + +TEST(GURLTest, DomainIsTerminatingDotBehavior) { + // If the host part ends with a dot, it matches input domains + // with or without a dot. + GURL url_with_dot("http://www.google.com./foo"); + EXPECT_TRUE(url_with_dot.DomainIs("google.com")); + EXPECT_TRUE(url_with_dot.DomainIs("google.com.")); + EXPECT_TRUE(url_with_dot.DomainIs(".com")); + EXPECT_TRUE(url_with_dot.DomainIs(".com.")); + + // But, if the host name doesn't end with a dot and the input + // domain does, then it's considered to not match. + GURL url_without_dot("http://google.com/foo"); + EXPECT_FALSE(url_without_dot.DomainIs("google.com.")); + + // If the URL ends with two dots, it doesn't match. 
+ GURL url_with_two_dots("http://www.google.com../foo"); + EXPECT_FALSE(url_with_two_dots.DomainIs("google.com")); +} + +TEST(GURLTest, DomainIsWithFilesystemScheme) { + GURL url_1("filesystem:http://www.google.com:99/foo/"); + EXPECT_TRUE(url_1.DomainIs("google.com")); + + GURL url_2("filesystem:http://www.iamnotgoogle.com/foo/"); + EXPECT_FALSE(url_2.DomainIs("google.com")); +} + +// Newlines should be stripped from inputs. +TEST(GURLTest, Newlines) { + // Constructor. + GURL url_1(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n "); + EXPECT_EQ("http://www.google.com/asdf", url_1.spec()); + EXPECT_FALSE( + url_1.parsed_for_possibly_invalid_spec().potentially_dangling_markup); + + // Relative path resolver. + GURL url_2 = url_1.Resolve(" \n /fo\to\r "); + EXPECT_EQ("http://www.google.com/foo", url_2.spec()); + EXPECT_FALSE( + url_2.parsed_for_possibly_invalid_spec().potentially_dangling_markup); + + // Constructor. + GURL url_3(" \t ht\ntp://\twww.goo\rgle.com/as\ndf< \n "); + EXPECT_EQ("http://www.google.com/asdf%3C", url_3.spec()); + EXPECT_TRUE( + url_3.parsed_for_possibly_invalid_spec().potentially_dangling_markup); + + // Relative path resolver. + GURL url_4 = url_1.Resolve(" \n /fo\to<\r "); + EXPECT_EQ("http://www.google.com/foo%3C", url_4.spec()); + EXPECT_TRUE( + url_4.parsed_for_possibly_invalid_spec().potentially_dangling_markup); + + // Note that newlines are NOT stripped from ReplaceComponents. 
+} + +TEST(GURLTest, IsStandard) { + GURL a("http:foo/bar"); + EXPECT_TRUE(a.IsStandard()); + + GURL b("foo:bar/baz"); + EXPECT_FALSE(b.IsStandard()); + + GURL c("foo://bar/baz"); + EXPECT_FALSE(c.IsStandard()); + + GURL d("cid:bar@baz"); + EXPECT_FALSE(d.IsStandard()); +} + +TEST(GURLTest, SchemeIsHTTPOrHTTPS) { + EXPECT_TRUE(GURL("http://bar/").SchemeIsHTTPOrHTTPS()); + EXPECT_TRUE(GURL("HTTPS://BAR").SchemeIsHTTPOrHTTPS()); + EXPECT_FALSE(GURL("ftp://bar/").SchemeIsHTTPOrHTTPS()); +} + +TEST(GURLTest, SchemeIsWSOrWSS) { + EXPECT_TRUE(GURL("WS://BAR/").SchemeIsWSOrWSS()); + EXPECT_TRUE(GURL("wss://bar/").SchemeIsWSOrWSS()); + EXPECT_FALSE(GURL("http://bar/").SchemeIsWSOrWSS()); +} + +TEST(GURLTest, SchemeIsCryptographic) { + EXPECT_TRUE(GURL("https://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_TRUE(GURL("HTTPS://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_TRUE(GURL("HtTpS://foo.bar.com/").SchemeIsCryptographic()); + + EXPECT_TRUE(GURL("wss://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_TRUE(GURL("WSS://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_TRUE(GURL("WsS://foo.bar.com/").SchemeIsCryptographic()); + + EXPECT_FALSE(GURL("http://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_FALSE(GURL("ws://foo.bar.com/").SchemeIsCryptographic()); +} + +TEST(GURLTest, SchemeIsCryptographicStatic) { + EXPECT_TRUE(GURL::SchemeIsCryptographic("https")); + EXPECT_TRUE(GURL::SchemeIsCryptographic("wss")); + EXPECT_FALSE(GURL::SchemeIsCryptographic("http")); + EXPECT_FALSE(GURL::SchemeIsCryptographic("ws")); + EXPECT_FALSE(GURL::SchemeIsCryptographic("ftp")); +} + +TEST(GURLTest, SchemeIsBlob) { + EXPECT_TRUE(GURL("BLOB://BAR/").SchemeIsBlob()); + EXPECT_TRUE(GURL("blob://bar/").SchemeIsBlob()); + EXPECT_FALSE(GURL("http://bar/").SchemeIsBlob()); +} + +// Tests that the 'content' of the URL is properly extracted. This can be +// complex in cases such as multiple schemes (view-source:http:) or for +// javascript URLs. See GURL::GetContent for more details. 
+TEST(GURLTest, ContentForNonStandardURLs) { + struct TestCase { + const char* url; + const char* expected; + } cases[] = { + {"null", ""}, + {"not-a-standard-scheme:this is arbitrary content", + "this is arbitrary content"}, + + // When there are multiple schemes, only the first is excluded from the + // content. Note also that for e.g. 'http://', the '//' is part of the + // content, not the scheme. + {"view-source:http://example.com/path", "http://example.com/path"}, + {"blob:http://example.com/GUID", "http://example.com/GUID"}, + {"blob://http://example.com/GUID", "//http://example.com/GUID"}, + {"blob:http://user:password@example.com/GUID", + "http://user:password@example.com/GUID"}, + + // The octothorpe character ('#') marks the end of the URL content and + // the start of the fragment. Neither the '#' nor anything after it should + // be included in the content (see the truncated data: URL below). + {"http://www.example.com/GUID#ref", "www.example.com/GUID"}, + {"http://me:secret@example.com/GUID/#ref", "me:secret@example.com/GUID/"}, + {"data:text/html,Question?<div style=\"color: #bad\">idea</div>", + "text/html,Question?<div style=\"color: "}, + + // TODO(mkwst): The expected content here is just "/", which drops the + // inner URL entirely. This seems like a bug. https://crbug.com/513600 + {"filesystem:http://example.com/path", "/"}, + + // Javascript URLs are the exception to the '#' rule above: '#' symbols + // stay in their content. + {"javascript:#", "#"}, + {"javascript:alert('#');", "alert('#');"}, + }; + + // Stream the input spec on failure so the offending case can be identified. + for (const auto& test : cases) { + GURL url(test.url); + EXPECT_EQ(test.expected, url.GetContent()) << test.url; + } +} + +// Tests that the URL path is properly extracted for unusual URLs. This can be +// complex in cases such as multiple schemes (view-source:http:) or when +// octothorpes ('#') are involved.
+TEST(GURLTest, PathForNonStandardURLs) { + struct TestCase { + const char* url; + const char* expected; + } cases[] = { + {"null", ""}, + {"not-a-standard-scheme:this is arbitrary content", + "this is arbitrary content"}, + {"view-source:http://example.com/path", "http://example.com/path"}, + {"blob:http://example.com/GUID", "http://example.com/GUID"}, + {"blob://http://example.com/GUID", "//http://example.com/GUID"}, + {"blob:http://user:password@example.com/GUID", + "http://user:password@example.com/GUID"}, + + {"http://www.example.com/GUID#ref", "/GUID"}, + {"http://me:secret@example.com/GUID/#ref", "/GUID/"}, + {"data:text/html,Question?<div style=\"color: #bad\">idea</div>", + "text/html,Question"}, + + // TODO(mkwst): This seems like a bug. https://crbug.com/513600 + {"filesystem:http://example.com/path", "/"}, + }; + + for (const auto& test : cases) { + GURL url(test.url); + EXPECT_EQ(test.expected, url.path()) << test.url; + } +} + +TEST(GURLTest, IsAboutBlank) { + const std::string kAboutBlankUrls[] = {"about:blank", "about:blank?foo", + "about:blank/#foo", + "about:blank?foo#foo"}; + for (const auto& url : kAboutBlankUrls) + EXPECT_TRUE(GURL(url).IsAboutBlank()) << url; + + const std::string kNotAboutBlankUrls[] = { + "http:blank", "about:blan", "about://blank", + "about:blank/foo", "about://:8000/blank", "about://foo:foo@/blank", + "foo@about:blank", "foo:bar@about:blank", "about:blank:8000", + "about:blANk"}; + for (const auto& url : kNotAboutBlankUrls) + EXPECT_FALSE(GURL(url).IsAboutBlank()) << url; +} + +TEST(GURLTest, IsAboutSrcdoc) { + const std::string kAboutSrcdocUrls[] = { + "about:srcdoc", "about:srcdoc/", "about:srcdoc?foo", "about:srcdoc/#foo", + "about:srcdoc?foo#foo"}; + for (const auto& url : kAboutSrcdocUrls) + EXPECT_TRUE(GURL(url).IsAboutSrcdoc()) << url; + + const std::string kNotAboutSrcdocUrls[] = {"http:srcdoc", + "about:srcdo", + "about://srcdoc", + "about://srcdoc\\", + "about:srcdoc/foo", + "about://:8000/srcdoc", + 
"about://foo:foo@/srcdoc", + "foo@about:srcdoc", + "foo:bar@about:srcdoc", + "about:srcdoc:8000", + "about:srCDOc"}; + for (const auto& url : kNotAboutSrcdocUrls) + EXPECT_FALSE(GURL(url).IsAboutSrcdoc()) << url; +} + +TEST(GURLTest, EqualsIgnoringRef) { + const struct { + const char* url_a; + const char* url_b; + bool are_equals; + } kTestCases[] = { + // No ref. + {"http://a.com", "http://a.com", true}, + {"http://a.com", "http://b.com", false}, + + // Same Ref. + {"http://a.com#foo", "http://a.com#foo", true}, + {"http://a.com#foo", "http://b.com#foo", false}, + + // Different Refs. + {"http://a.com#foo", "http://a.com#bar", true}, + {"http://a.com#foo", "http://b.com#bar", false}, + + // One has a ref, the other doesn't. + {"http://a.com#foo", "http://a.com", true}, + {"http://a.com#foo", "http://b.com", false}, + + // Empty refs. + {"http://a.com#", "http://a.com#", true}, + {"http://a.com#", "http://a.com", true}, + + // URLs that differ only by their last character. + {"http://aaa", "http://aab", false}, + {"http://aaa#foo", "http://aab#foo", false}, + + // Different size of the part before the ref. 
+ {"http://123#a", "http://123456#a", false}, + + // Blob URLs + {"blob:http://a.com#foo", "blob:http://a.com#foo", true}, + {"blob:http://a.com#foo", "blob:http://a.com#bar", true}, + {"blob:http://a.com#foo", "blob:http://b.com#bar", false}, + + // Filesystem URLs + {"filesystem:http://a.com#foo", "filesystem:http://a.com#foo", true}, + {"filesystem:http://a.com#foo", "filesystem:http://a.com#bar", true}, + {"filesystem:http://a.com#foo", "filesystem:http://b.com#bar", false}, + + // Data URLs + {"data:text/html,a#foo", "data:text/html,a#bar", true}, + {"data:text/html,a#foo", "data:text/html,a#foo", true}, + {"data:text/html,a#foo", "data:text/html,b#foo", false}, + }; + + for (const auto& test_case : kTestCases) { + SCOPED_TRACE(testing::Message() + << std::endl + << "url_a = " << test_case.url_a << std::endl + << "url_b = " << test_case.url_b << std::endl); + // A versus B. + EXPECT_EQ(test_case.are_equals, + GURL(test_case.url_a).EqualsIgnoringRef(GURL(test_case.url_b))); + // B versus A. + EXPECT_EQ(test_case.are_equals, + GURL(test_case.url_b).EqualsIgnoringRef(GURL(test_case.url_a))); + } +} + +TEST(GURLTest, DebugAlias) { + GURL url("https://foo.com/bar"); + DEBUG_ALIAS_FOR_GURL(url_debug_alias, url); + EXPECT_STREQ("https://foo.com/bar", url_debug_alias); +} + +} // namespace url
diff --git a/url/origin.cc b/url/origin.cc new file mode 100644 index 0000000..6eda15e --- /dev/null +++ b/url/origin.cc
@@ -0,0 +1,354 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/origin.h" + +#include <stdint.h> + +#include <algorithm> + +#include "polyfills/base/logging.h" +#include "base/stl_util.h" +#include "base/strings/strcat.h" +#include "base/strings/string_number_conversions.h" +#include "base/strings/string_util.h" +#include "url/gurl.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" +#include "url/url_constants.h" +#include "url/url_util.h" + +namespace url { + +Origin::Origin() : nonce_(Nonce()) {} + +Origin Origin::Create(const GURL& url) { + if (!url.is_valid()) + return Origin(); + + SchemeHostPort tuple; + + if (url.SchemeIsFileSystem()) { + tuple = SchemeHostPort(*url.inner_url()); + } else if (url.SchemeIsBlob()) { + // If we're dealing with a 'blob:' URL, https://url.spec.whatwg.org/#origin + // defines the origin as the origin of the URL which results from parsing + // the "path", which boils down to everything after the scheme. GURL's + // 'GetContent()' gives us exactly that. + tuple = SchemeHostPort(GURL(url.GetContent())); + } else { + tuple = SchemeHostPort(url); + + // It's SchemeHostPort's responsibility to filter out unrecognized schemes; + // sanity check that this is happening. 
+ GURL_DCHECK(tuple.IsInvalid() || url.IsStandard() || + gurl_base::Contains(GetLocalSchemes(), url.scheme_piece()) || + AllowNonStandardSchemesForAndroidWebView()); + } + + if (tuple.IsInvalid()) + return Origin(); + return Origin(std::move(tuple)); +} + +Origin Origin::Resolve(const GURL& url, const Origin& base_origin) { + if (url.SchemeIs(kAboutScheme)) + return base_origin; + Origin result = Origin::Create(url); + if (!result.opaque()) + return result; + return base_origin.DeriveNewOpaqueOrigin(); +} + +Origin::Origin(const Origin& other) = default; +Origin& Origin::operator=(const Origin& other) = default; +Origin::Origin(Origin&& other) = default; +Origin& Origin::operator=(Origin&& other) = default; +Origin::~Origin() = default; + +// static +gurl_base::Optional<Origin> Origin::UnsafelyCreateTupleOriginWithoutNormalization( + gurl_base::StringPiece scheme, + gurl_base::StringPiece host, + uint16_t port) { + SchemeHostPort tuple(scheme.as_string(), host.as_string(), port, + SchemeHostPort::CHECK_CANONICALIZATION); + if (tuple.IsInvalid()) + return gurl_base::nullopt; + return Origin(std::move(tuple)); +} + +// static +gurl_base::Optional<Origin> Origin::UnsafelyCreateOpaqueOriginWithoutNormalization( + gurl_base::StringPiece precursor_scheme, + gurl_base::StringPiece precursor_host, + uint16_t precursor_port, + const Origin::Nonce& nonce) { + SchemeHostPort precursor(precursor_scheme.as_string(), + precursor_host.as_string(), precursor_port, + SchemeHostPort::CHECK_CANONICALIZATION); + // For opaque origins, it is okay for the SchemeHostPort to be invalid; + // however, this should only arise when the arguments indicate the + // canonical representation of the invalid SchemeHostPort. 
+ if (precursor.IsInvalid() && + !(precursor_scheme.empty() && precursor_host.empty() && + precursor_port == 0)) { + return gurl_base::nullopt; + } + return Origin(std::move(nonce), std::move(precursor)); +} + +// static +Origin Origin::CreateFromNormalizedTuple(std::string scheme, + std::string host, + uint16_t port) { + SchemeHostPort tuple(std::move(scheme), std::move(host), port, + SchemeHostPort::ALREADY_CANONICALIZED); + if (tuple.IsInvalid()) + return Origin(); + return Origin(std::move(tuple)); +} + +// static +Origin Origin::CreateOpaqueFromNormalizedPrecursorTuple( + std::string precursor_scheme, + std::string precursor_host, + uint16_t precursor_port, + const Origin::Nonce& nonce) { + SchemeHostPort precursor(std::move(precursor_scheme), + std::move(precursor_host), precursor_port, + SchemeHostPort::ALREADY_CANONICALIZED); + // For opaque origins, it is okay for the SchemeHostPort to be invalid. + return Origin(std::move(nonce), std::move(precursor)); +} + +std::string Origin::Serialize() const { + if (opaque()) + return "null"; + + if (scheme() == kFileScheme) + return "file://"; + + return tuple_.Serialize(); +} + +GURL Origin::GetURL() const { + if (opaque()) + return GURL(); + + if (scheme() == kFileScheme) + return GURL("file:///"); + + return tuple_.GetURL(); +} + +gurl_base::Optional<gurl_base::UnguessableToken> Origin::GetNonceForSerialization() + const { + // TODO(nasko): Consider not making a copy here, but return a reference to + // the nonce. + return nonce_ ? gurl_base::make_optional(nonce_->token()) : gurl_base::nullopt; +} + +bool Origin::IsSameOriginWith(const Origin& other) const { + // scheme/host/port must match, even for opaque origins where |tuple_| holds + // the precursor origin. 
+ return std::tie(tuple_, nonce_) == std::tie(other.tuple_, other.nonce_); +} + +bool Origin::CanBeDerivedFrom(const GURL& url) const { + GURL_DCHECK(url.is_valid()); + + // For "no access" schemes, blink's SecurityOrigin will always create an + // opaque unique one. However, about: scheme is also registered as such but + // does not behave this way, therefore exclude it from this check. + if (gurl_base::Contains(url::GetNoAccessSchemes(), url.scheme()) && + !url.SchemeIs(kAboutScheme)) { + // If |this| is not opaque, definitely return false as the expectation + // is for opaque origin. + if (!opaque()) + return false; + + // And if it is unique opaque origin, it definitely is fine. But if there + // is a precursor stored, we should fall through to compare the tuples. + if (tuple_.IsInvalid()) + return true; + } + + SchemeHostPort url_tuple; + + // Optimization for the common, success case: Scheme/Host/Port match on the + // precursor, and the URL is standard. Opaqueness does not matter as a tuple + // origin can always create an opaque tuple origin. + if (url.IsStandard()) { + // Note: if extra copies of the scheme and host are undesirable, this check + // can be implemented using StringPiece comparisons, but it has to account + // explicitly checks on port numbers. + if (url.SchemeIsFileSystem()) { + url_tuple = SchemeHostPort(*url.inner_url()); + } else { + url_tuple = SchemeHostPort(url); + } + return url_tuple == tuple_; + + // Blob URLs still contain an inner origin, however it is not accessible + // through inner_url(), therefore it requires specific case to handle it. + } else if (url.SchemeIsBlob()) { + // If |this| doesn't contain any precursor information, it is an unique + // opaque origin. It is valid case, as any browser-initiated navigation + // to about:blank or data: URL will result in a document with such + // origin and it is valid for it to create blob: URLs. 
+ if (tuple_.IsInvalid()) + return true; + + url_tuple = SchemeHostPort(GURL(url.GetContent())); + return url_tuple == tuple_; + } + + // At this point, the URL has non-standard scheme. + GURL_DCHECK(!url.IsStandard()); + + // All about: URLs (about:blank, about:srcdoc) inherit their origin from + // the context which navigated them, which means that they can be in any + // type of origin. + if (url.SchemeIs(kAboutScheme)) + return true; + + // All data: URLs commit in opaque origins, therefore |this| must be opaque + // if |url| has data: scheme. + if (url.SchemeIs(kDataScheme)) + return opaque(); + + // If |this| does not have valid precursor tuple, it is unique opaque origin, + // which is what we expect non-standard schemes to get. + if (tuple_.IsInvalid()) + return true; + + // However, when there is precursor present, the schemes must match. + return url.scheme() == tuple_.scheme(); +} + +bool Origin::DomainIs(gurl_base::StringPiece canonical_domain) const { + return !opaque() && url::DomainIs(tuple_.host(), canonical_domain); +} + +bool Origin::operator<(const Origin& other) const { + return std::tie(tuple_, nonce_) < std::tie(other.tuple_, other.nonce_); +} + +Origin Origin::DeriveNewOpaqueOrigin() const { + return Origin(Nonce(), tuple_); +} + +std::string Origin::GetDebugString() const { + // Handle non-opaque origins first, as they are simpler. + if (!opaque()) { + std::string out = Serialize(); + if (scheme() == kFileScheme) + gurl_base::StrAppend(&out, {" [internally: ", tuple_.Serialize(), "]"}); + return out; + } + + // For opaque origins, log the nonce and precursor as well. Without this, + // EXPECT_EQ failures between opaque origins are nearly impossible to + // understand. + std::string nonce = nonce_->raw_token().is_empty() + ? 
std::string("nonce TBD") + : nonce_->raw_token().ToString(); + + std::string out = gurl_base::StrCat({Serialize(), " [internally: (", nonce, ")"}); + if (tuple_.IsInvalid()) + gurl_base::StrAppend(&out, {" anonymous]"}); + else + gurl_base::StrAppend(&out, {" derived from ", tuple_.Serialize(), "]"}); + return out; +} + +Origin::Origin(SchemeHostPort tuple) : tuple_(std::move(tuple)) { + GURL_DCHECK(!opaque()); + GURL_DCHECK(!tuple_.IsInvalid()); +} + +// Constructs an opaque origin derived from |precursor|. +Origin::Origin(const Nonce& nonce, SchemeHostPort precursor) + : tuple_(std::move(precursor)), nonce_(std::move(nonce)) { + GURL_DCHECK(opaque()); + // |precursor| is retained, but not accessible via scheme()/host()/port(). + GURL_DCHECK_EQ("", scheme()); + GURL_DCHECK_EQ("", host()); + GURL_DCHECK_EQ(0U, port()); +} + +std::ostream& operator<<(std::ostream& out, const url::Origin& origin) { + out << origin.GetDebugString(); + return out; +} + +std::ostream& operator<<(std::ostream& out, const url::Origin::Nonce& nonce) { + // Subtle: don't let logging trigger lazy-generation of the token value. + if (nonce.raw_token().is_empty()) + return (out << "(nonce TBD)"); + else + return (out << nonce.raw_token()); +} + +bool IsSameOriginWith(const GURL& a, const GURL& b) { + return Origin::Create(a).IsSameOriginWith(Origin::Create(b)); +} + +Origin::Nonce::Nonce() {} +Origin::Nonce::Nonce(const gurl_base::UnguessableToken& token) : token_(token) { + GURL_CHECK(!token_.is_empty()); +} + +const gurl_base::UnguessableToken& Origin::Nonce::token() const { + // Inspecting the value of a nonce triggers lazy-generation. + // TODO(dcheng): UnguessableToken::is_empty should go away -- what sentinel + // value to use instead? + if (token_.is_empty()) + token_ = gurl_base::UnguessableToken::Create(); + return token_; +} + +const gurl_base::UnguessableToken& Origin::Nonce::raw_token() const { + return token_; +} + +// Copying a Nonce triggers lazy-generation of the token. 
+Origin::Nonce::Nonce(const Origin::Nonce& other) : token_(other.token()) {} + +Origin::Nonce& Origin::Nonce::operator=(const Origin::Nonce& other) { + // Copying a Nonce triggers lazy-generation of the token. + token_ = other.token(); + return *this; +} + +// Moving a nonce does NOT trigger lazy-generation of the token. +Origin::Nonce::Nonce(Origin::Nonce&& other) : token_(other.token_) { + other.token_ = gurl_base::UnguessableToken(); // Reset |other|. +} + +Origin::Nonce& Origin::Nonce::operator=(Origin::Nonce&& other) { + token_ = other.token_; + other.token_ = gurl_base::UnguessableToken(); // Reset |other|. + return *this; +} + +bool Origin::Nonce::operator<(const Origin::Nonce& other) const { + // When comparing, lazy-generation is required of both tokens, so that an + // ordering is established. + return token() < other.token(); +} + +bool Origin::Nonce::operator==(const Origin::Nonce& other) const { + // Equality testing doesn't actually require that the tokens be generated. + // If the tokens are both zero, equality only holds if they're the same + // object. + return (other.token_ == token_) && !(token_.is_empty() && (&other != this)); +} + +bool Origin::Nonce::operator!=(const Origin::Nonce& other) const { + return !(*this == other); +} + +} // namespace url
diff --git a/url/origin.h b/url/origin.h new file mode 100644 index 0000000..58c9221 --- /dev/null +++ b/url/origin.h
@@ -0,0 +1,393 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_ORIGIN_H_ +#define URL_ORIGIN_H_ + +#include <stdint.h> + +#include <string> + +#include "polyfills/base/component_export.h" +#include "polyfills/base/debug/alias.h" +#include "base/optional.h" +#include "base/strings/string16.h" +#include "base/strings/string_piece.h" +#include "base/strings/string_util.h" +#include "base/unguessable_token.h" +#include "ipc/ipc_param_traits.h" +#include "url/scheme_host_port.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_constants.h" + +class GURL; + +namespace blink { +class SecurityOrigin; +} // namespace blink + +namespace ipc_fuzzer { +template <class T> +struct FuzzTraits; +} // namespace ipc_fuzzer + +namespace mojo { +template <typename DataViewType, typename T> +struct StructTraits; +struct UrlOriginAdapter; +} // namespace mojo + +namespace url { + +namespace mojom { +class OriginDataView; +} // namespace mojom + +// Per https://html.spec.whatwg.org/multipage/origin.html#origin, an origin is +// either: +// - a tuple origin of (scheme, host, port) as described in RFC 6454. +// - an opaque origin with an internal value, and a memory of the tuple origin +// from which it was derived. +// +// TL;DR: If you need to make a security-relevant decision, use 'url::Origin'. +// If you only need to extract the bits of a URL which are relevant for a +// network connection, use 'url::SchemeHostPort'. +// +// STL;SDR: If you aren't making actual network connections, use 'url::Origin'. +// +// This class ought to be used when code needs to determine if two resources +// are "same-origin", and when a canonical serialization of an origin is +// required. Note that the canonical serialization of an origin *must not* be +// used to determine if two resources are same-origin. 
+// +// A tuple origin, like 'SchemeHostPort', is composed of a tuple of (scheme, +// host, port), but contains a number of additional concepts which make it +// appropriate for use as a security boundary and access control mechanism +// between contexts. Two tuple origins are same-origin if the tuples are equal. +// A tuple origin may also be re-created from its serialization. +// +// An opaque origin has an internal globally unique identifier. When creating a +// new opaque origin from a URL, a fresh globally unique identifier is +// generated. However, if an opaque origin is copied or moved, the internal +// globally unique identifier is preserved. Two opaque origins are same-origin +// iff the globally unique identifiers match. Unlike tuple origins, an opaque +// origin cannot be re-created from its serialization, which is always the +// string "null". +// +// IMPORTANT: Since opaque origins always serialize as the string "null", it is +// *never* safe to use the serialization for security checks! +// +// A tuple origin and an opaque origin are never same-origin. +// +// There are a few subtleties to note: +// +// * A default constructed Origin is opaque, with no precursor origin. +// +// * Invalid and non-standard GURLs are parsed as opaque origins. This includes +// non-hierarchical URLs like 'data:text/html,...' and 'javascript:alert(1)'. +// +// * GURLs with schemes of 'filesystem' or 'blob' parse the origin out of the +// internals of the URL. That is, 'filesystem:https://example.com/temporary/f' +// is parsed as ('https', 'example.com', 443). +// +// * GURLs with a 'file' scheme are tricky. They are parsed as ('file', '', 0), +// but their behavior may differ from embedder to embedder. +// TODO(dcheng): This behavior is not consistent with Blink's notion of file +// URLs, which always creates an opaque origin. +// +// * The host component of an IPv6 address includes brackets, just like the URL +// representation. 
+// +// Usage: +// +// * Origins are generally constructed from an already-canonicalized GURL: +// +// GURL url("https://example.com/"); +// url::Origin origin = Origin::Create(url); +// origin.scheme(); // "https" +// origin.host(); // "example.com" +// origin.port(); // 443 +// origin.opaque(); // false +// +// * To answer the question "Are |this| and |that| "same-origin" with each +// other?", use |Origin::IsSameOriginWith|: +// +// if (this.IsSameOriginWith(that)) { +// // Amazingness goes here. +// } +class COMPONENT_EXPORT(URL) Origin { + public: + // Creates an opaque Origin with a nonce that is different from all previously + // existing origins. + Origin(); + + // Creates an Origin from |url|, as described at + // https://url.spec.whatwg.org/#origin, with the following additions: + // + // 1. If |url| is invalid or non-standard, an opaque Origin is constructed. + // 2. 'filesystem' URLs behave as 'blob' URLs (that is, the origin is parsed + // out of everything in the URL which follows the scheme). + // 3. 'file' URLs all parse as ("file", "", 0). + static Origin Create(const GURL& url); + + // Creates an Origin for the resource |url| as if it were requested + // from the context of |base_origin|. If |url| is standard + // (in the sense that it embeds a complete origin, like http/https), + // this returns the same value as would Create(). + // + // If |url| is "about:blank", this returns a copy of |base_origin|. + // + // Otherwise, returns a new opaque origin derived from |base_origin|. + // In this case, the resulting opaque origin will inherit the tuple + // (or precursor tuple) of |base_origin|, but will not be same origin + // with |base_origin|, even if |base_origin| is already opaque. + static Origin Resolve(const GURL& url, const Origin& base_origin); + + // Copyable and movable. 
+ Origin(const Origin&); + Origin& operator=(const Origin&); + Origin(Origin&&); + Origin& operator=(Origin&&); + + // Creates an Origin from a |scheme|, |host|, and |port|. All the parameters + // must be valid and canonicalized. Returns nullopt if any parameter is not + // canonical, or if all the parameters are empty. + // + // This constructor should be used in order to pass 'Origin' objects back and + // forth over IPC (as transitioning through GURL would risk potentially + // dangerous recanonicalization); other potential callers should prefer the + // 'GURL'-based constructor. + static gurl_base::Optional<Origin> UnsafelyCreateTupleOriginWithoutNormalization( + gurl_base::StringPiece scheme, + gurl_base::StringPiece host, + uint16_t port); + + // Creates an origin without sanity checking that the host is canonicalized. + // This should only be used when converting between already normalized types, + // and should NOT be used for IPC. Method takes std::strings for use with move + // operators to avoid copies. + static Origin CreateFromNormalizedTuple(std::string scheme, + std::string host, + uint16_t port); + + ~Origin(); + + // For opaque origins, these return ("", "", 0). + const std::string& scheme() const { + return !opaque() ? tuple_.scheme() : gurl_base::EmptyString(); + } + const std::string& host() const { + return !opaque() ? tuple_.host() : gurl_base::EmptyString(); + } + uint16_t port() const { return !opaque() ? tuple_.port() : 0; } + + bool opaque() const { return nonce_.has_value(); } + + // An ASCII serialization of the Origin as per Section 6.2 of RFC 6454, with + // the addition that all Origins with a 'file' scheme serialize to "file://". + std::string Serialize() const; + + // Two non-opaque Origins are "same-origin" if their schemes, hosts, and ports + // are exact matches. Two opaque origins are same-origin only if their + // internal nonce values match. A non-opaque origin is never same-origin with + // an opaque origin. 
+ bool IsSameOriginWith(const Origin& other) const; + bool operator==(const Origin& other) const { return IsSameOriginWith(other); } + bool operator!=(const Origin& other) const { + return !IsSameOriginWith(other); + } + + // This method returns true for any |url| which if navigated to could result + // in an origin compatible with |this|. + bool CanBeDerivedFrom(const GURL& url) const; + + // Get the scheme, host, and port from which this origin derives. For + // a tuple Origin, this gives the same values as calling scheme(), host() + // and port(). For an opaque Origin that was created by calling + // Origin::DeriveNewOpaqueOrigin() on a precursor or Origin::Resolve(), + // this returns the tuple inherited from the precursor. + // + // If this Origin is opaque and was created via the default constructor or + // Origin::Create(), the precursor origin is unknown. + // + // Use with great caution: opaque origins should generally not inherit + // privileges from the origins they derive from. However, in some cases + // (such as restrictions on process placement, or determining the http lock + // icon) this information may be relevant to ensure that entering an + // opaque origin does not grant privileges initially denied to the original + // non-opaque origin. + // + // This method has a deliberately obnoxious name to prompt caution in its use. + const SchemeHostPort& GetTupleOrPrecursorTupleIfOpaque() const { + return tuple_; + } + + // Efficiently returns what GURL(Serialize()) would without re-parsing the + // URL. This can be used for the (rare) times a GURL representation is needed + // for an Origin. + // Note: The returned URL will not necessarily be serialized to the same value + // as the Origin would. The GURL will have an added "/" path for Origins with + // valid SchemeHostPorts and file Origins. + // + // Try not to use this method under normal circumstances, as it loses type + // information. 
Downstream consumers can mistake the returned GURL with a full + // URL (e.g. with a path component). + GURL GetURL() const; + + // Same as GURL::DomainIs. If |this| origin is opaque, then returns false. + bool DomainIs(gurl_base::StringPiece canonical_domain) const; + + // Allows Origin to be used as a key in STL (for example, a std::set or + // std::map). + bool operator<(const Origin& other) const; + + // Creates a new opaque origin that is guaranteed to be cross-origin to all + // currently existing origins. An origin created by this method retains its + // identity across copies. Copies are guaranteed to be same-origin to each + // other, e.g. + // + // url::Origin page = Origin::Create(GURL("http://example.com")) + // url::Origin a = page.DeriveNewOpaqueOrigin(); + // url::Origin b = page.DeriveNewOpaqueOrigin(); + // url::Origin c = a; + // url::Origin d = b; + // + // |a| and |c| are same-origin, since |c| was copied from |a|. |b| and |d| are + // same-origin as well, since |d| was copied from |b|. All other combinations + // of origins are considered cross-origin, e.g. |a| is cross-origin to |b| and + // |d|, |b| is cross-origin to |a| and |c|, |c| is cross-origin to |b| and + // |d|, and |d| is cross-origin to |a| and |c|. + Origin DeriveNewOpaqueOrigin() const; + + // Creates a string representation of the object that can be used for logging + // and debugging. It serializes the internal state, such as the nonce value + // and precursor information. 
+ std::string GetDebugString() const; + + private: + friend class blink::SecurityOrigin; + friend class OriginTest; + friend struct mojo::UrlOriginAdapter; + friend struct ipc_fuzzer::FuzzTraits<Origin>; + friend struct mojo::StructTraits<url::mojom::OriginDataView, url::Origin>; + friend IPC::ParamTraits<url::Origin>; + friend COMPONENT_EXPORT(URL) std::ostream& operator<<(std::ostream& out, + const Origin& origin); + + // Origin::Nonce is a wrapper around gurl_base::UnguessableToken that generates + // the random value only when the value is first accessed. The lazy generation + // allows Origin to be default-constructed quickly, without spending time + // in random number generation. + // + // TODO(nick): Should this optimization move into UnguessableToken, once it no + // longer treats the Null case specially? + class COMPONENT_EXPORT(URL) Nonce { + public: + // Creates a nonce to hold a newly-generated UnguessableToken. The actual + // token value will be generated lazily. + Nonce(); + + // Creates a nonce to hold an already-generated UnguessableToken value. This + // constructor should only be used for IPC serialization and testing -- + // regular code should never need to touch the UnguessableTokens directly, + // and the default constructor is faster. + explicit Nonce(const gurl_base::UnguessableToken& token); + + // Accessor, which lazily initializes the underlying |token_| member. + const gurl_base::UnguessableToken& token() const; + + // Do not use in cases where lazy initialization is expected! This + // accessor does not initialize the |token_| member. + const gurl_base::UnguessableToken& raw_token() const; + + // Copyable and movable. Copying a Nonce triggers lazy-initialization, + // moving it does not. + Nonce(const Nonce&); + Nonce& operator=(const Nonce&); + Nonce(Nonce&&); + Nonce& operator=(Nonce&&); + + // Note that operator<, used by maps type containers, will trigger |token_| + // lazy-initialization. Equality comparisons do not. 
+ bool operator<(const Nonce& other) const; + bool operator==(const Nonce& other) const; + bool operator!=(const Nonce& other) const; + + private: + friend class OriginTest; + + // mutable to support lazy generation. + mutable gurl_base::UnguessableToken token_; + }; + + // This needs to be friended within Origin as well, since Nonce is a private + // nested class of Origin. + friend COMPONENT_EXPORT(URL) std::ostream& operator<<(std::ostream& out, + const Nonce& nonce); + + // Creates an origin without sanity checking that the host is canonicalized. + // This should only be used when converting between already normalized types, + // and should NOT be used for IPC. Method takes std::strings for use with move + // operators to avoid copies. + static Origin CreateOpaqueFromNormalizedPrecursorTuple( + std::string precursor_scheme, + std::string precursor_host, + uint16_t precursor_port, + const Nonce& nonce); + + // Creates an opaque Origin with the identity given by |nonce|, and an + // optional precursor origin given by |precursor_scheme|, |precursor_host| and + // |precursor_port|. Returns nullopt if any parameter is not canonical. When + // the precursor is unknown, the precursor parameters should be ("", "", 0). + // + // This factory method should be used in order to pass opaque Origin objects + // back and forth over IPC (as transitioning through GURL would risk + // potentially dangerous recanonicalization). + static gurl_base::Optional<Origin> UnsafelyCreateOpaqueOriginWithoutNormalization( + gurl_base::StringPiece precursor_scheme, + gurl_base::StringPiece precursor_host, + uint16_t precursor_port, + const Nonce& nonce); + + // Constructs a non-opaque tuple origin. |tuple| must be valid. + explicit Origin(SchemeHostPort tuple); + + // Constructs an opaque origin derived from the |precursor| tuple, with the + // given |nonce|. + Origin(const Nonce& nonce, SchemeHostPort precursor); + + // Get the nonce associated with this origin, if it is opaque. 
This should be + // used only when trying to send an Origin across an IPC pipe. + gurl_base::Optional<gurl_base::UnguessableToken> GetNonceForSerialization() const; + + // The tuple is used for both tuple origins (e.g. https://example.com:80), as + // well as for opaque origins, where it tracks the tuple origin from which + // the opaque origin was initially derived (we call this the "precursor" + // origin). + SchemeHostPort tuple_; + + // The nonce is used for maintaining identity of an opaque origin. This + // nonce is preserved when an opaque origin is copied or moved. An Origin + // is considered opaque if and only if |nonce_| holds a value. + gurl_base::Optional<Nonce> nonce_; +}; + +// Pretty-printers for logging. These expose the internal state of the nonce. +COMPONENT_EXPORT(URL) +std::ostream& operator<<(std::ostream& out, const Origin& origin); +COMPONENT_EXPORT(URL) +std::ostream& operator<<(std::ostream& out, const Origin::Nonce& origin); + +COMPONENT_EXPORT(URL) bool IsSameOriginWith(const GURL& a, const GURL& b); + +// DEBUG_ALIAS_FOR_ORIGIN(var_name, origin) copies |origin| into a new +// stack-allocated variable named |<var_name>|. This helps ensure that the +// value of |origin| gets preserved in crash dumps. +#define DEBUG_ALIAS_FOR_ORIGIN(var_name, origin) \ + DEBUG_ALIAS_FOR_CSTR(var_name, (origin).Serialize().c_str(), 128) + +} // namespace url + +#endif // URL_ORIGIN_H_
diff --git a/url/origin_unittest.cc b/url/origin_unittest.cc new file mode 100644 index 0000000..2754f23 --- /dev/null +++ b/url/origin_unittest.cc
@@ -0,0 +1,866 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <stddef.h> +#include <stdint.h> + +#include "polyfills/base/logging.h" +#include "base/macros.h" +#include "testing/gmock/include/gmock/gmock.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" +#include "url/origin.h" +#include "url/url_util.h" + +namespace url { + +void ExpectParsedUrlsEqual(const GURL& a, const GURL& b) { + EXPECT_EQ(a, b); + const Parsed& a_parsed = a.parsed_for_possibly_invalid_spec(); + const Parsed& b_parsed = b.parsed_for_possibly_invalid_spec(); + EXPECT_EQ(a_parsed.scheme.begin, b_parsed.scheme.begin); + EXPECT_EQ(a_parsed.scheme.len, b_parsed.scheme.len); + EXPECT_EQ(a_parsed.username.begin, b_parsed.username.begin); + EXPECT_EQ(a_parsed.username.len, b_parsed.username.len); + EXPECT_EQ(a_parsed.password.begin, b_parsed.password.begin); + EXPECT_EQ(a_parsed.password.len, b_parsed.password.len); + EXPECT_EQ(a_parsed.host.begin, b_parsed.host.begin); + EXPECT_EQ(a_parsed.host.len, b_parsed.host.len); + EXPECT_EQ(a_parsed.port.begin, b_parsed.port.begin); + EXPECT_EQ(a_parsed.port.len, b_parsed.port.len); + EXPECT_EQ(a_parsed.path.begin, b_parsed.path.begin); + EXPECT_EQ(a_parsed.path.len, b_parsed.path.len); + EXPECT_EQ(a_parsed.query.begin, b_parsed.query.begin); + EXPECT_EQ(a_parsed.query.len, b_parsed.query.len); + EXPECT_EQ(a_parsed.ref.begin, b_parsed.ref.begin); + EXPECT_EQ(a_parsed.ref.len, b_parsed.ref.len); +} + +class OriginTest : public ::testing::Test { + public: + void SetUp() override { + // Add two schemes which are local but nonstandard. + AddLocalScheme("local-but-nonstandard"); + AddLocalScheme("also-local-but-nonstandard"); + + // Add a scheme that's both local and standard. 
+ AddStandardScheme("local-and-standard", SchemeType::SCHEME_WITH_HOST); + AddLocalScheme("local-and-standard"); + + // Add a scheme that's standard but no-access. We still want these to + // form valid SchemeHostPorts, even though they always commit as opaque + // origins, so that they can represent the source of the resource even if + // it's not committable as a non-opaque origin. + AddStandardScheme("standard-but-noaccess", SchemeType::SCHEME_WITH_HOST); + AddNoAccessScheme("standard-but-noaccess"); + } + void TearDown() override { url::ResetForTests(); } + + ::testing::AssertionResult DoEqualityComparisons(const url::Origin& a, + const url::Origin& b, + bool should_compare_equal) { + ::testing::AssertionResult failure = ::testing::AssertionFailure(); + failure << "DoEqualityComparisons failure. Expecting " + << (should_compare_equal ? "equality" : "inequality") + << " between:\n a\n Which is: " << a + << "\n b\n Which is: " << b << "\nThe following check failed: "; + if (a.IsSameOriginWith(b) != should_compare_equal) + return failure << "a.IsSameOriginWith(b)"; + if (b.IsSameOriginWith(a) != should_compare_equal) + return failure << "b.IsSameOriginWith(a)"; + if ((a == b) != should_compare_equal) + return failure << "(a == b)"; + if ((b == a) != should_compare_equal) + return failure << "(b == a)"; + if ((b != a) != !should_compare_equal) + return failure << "(b != a)"; + if ((a != b) != !should_compare_equal) + return failure << "(a != b)"; + return ::testing::AssertionSuccess(); + } + + bool HasNonceTokenBeenInitialized(const url::Origin& origin) { + EXPECT_TRUE(origin.opaque()); + // Avoid calling nonce_.token() here, to not trigger lazy initialization. 
+ return !origin.nonce_->token_.is_empty(); + } + + Origin::Nonce CreateNonce() { return Origin::Nonce(); } + + Origin::Nonce CreateNonce(gurl_base::UnguessableToken nonce) { + return Origin::Nonce(nonce); + } + + gurl_base::Optional<gurl_base::UnguessableToken> GetNonce(const Origin& origin) { + return origin.GetNonceForSerialization(); + } + + // Wrapper around url::Origin method to expose it to tests. + gurl_base::Optional<Origin> UnsafelyCreateOpaqueOriginWithoutNormalization( + gurl_base::StringPiece precursor_scheme, + gurl_base::StringPiece precursor_host, + uint16_t precursor_port, + const Origin::Nonce& nonce) { + return Origin::UnsafelyCreateOpaqueOriginWithoutNormalization( + precursor_scheme, precursor_host, precursor_port, nonce); + } +}; + +TEST_F(OriginTest, OpaqueOriginComparison) { + // A default-constructed Origin should be cross origin to everything + // but itself. + url::Origin opaque_a, opaque_b; + EXPECT_TRUE(opaque_a.opaque()); + EXPECT_EQ("", opaque_a.scheme()); + EXPECT_EQ("", opaque_a.host()); + EXPECT_EQ(0, opaque_a.port()); + EXPECT_EQ(SchemeHostPort(), opaque_a.GetTupleOrPrecursorTupleIfOpaque()); + EXPECT_TRUE(opaque_a.GetTupleOrPrecursorTupleIfOpaque().IsInvalid()); + + EXPECT_TRUE(opaque_b.opaque()); + EXPECT_EQ("", opaque_b.scheme()); + EXPECT_EQ("", opaque_b.host()); + EXPECT_EQ(0, opaque_b.port()); + EXPECT_EQ(SchemeHostPort(), opaque_b.GetTupleOrPrecursorTupleIfOpaque()); + EXPECT_TRUE(opaque_b.GetTupleOrPrecursorTupleIfOpaque().IsInvalid()); + + // Two default-constructed Origins should always be cross origin to each + // other. + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, false)); + EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true)); + + // The streaming operator should not trigger lazy initialization to the token. 
+ std::ostringstream stream; + stream << opaque_a; + EXPECT_STREQ("null [internally: (nonce TBD) anonymous]", + stream.str().c_str()); + EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a)); + + // None of the operations thus far should have triggered lazy-generation of + // the UnguessableToken. Copying an origin, however, should trigger this. + EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_b)); + opaque_b = opaque_a; + + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true)); + + // Move-initializing to a fresh Origin should restore the lazy initialization. + opaque_a = url::Origin(); + EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, false)); + EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true)); + + // Comparing two opaque Origins with matching SchemeHostPorts should trigger + // lazy initialization. 
+ EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b)); + bool should_swap = opaque_b < opaque_a; + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b)); + + if (should_swap) + std::swap(opaque_a, opaque_b); + EXPECT_LT(opaque_a, opaque_b); + EXPECT_FALSE(opaque_b < opaque_a); + + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, false)); + EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true)); + + EXPECT_LT(opaque_a, url::Origin::Create(GURL("http://www.google.com"))); + EXPECT_LT(opaque_b, url::Origin::Create(GURL("http://www.google.com"))); + + EXPECT_EQ(opaque_b, url::Origin::Resolve(GURL("about:blank"), opaque_b)); + EXPECT_EQ(opaque_b, url::Origin::Resolve(GURL("about:srcdoc"), opaque_b)); + EXPECT_EQ(opaque_b, + url::Origin::Resolve(GURL("about:blank?hello#whee"), opaque_b)); + + const char* const urls[] = { + "data:text/html,Hello!", + "javascript:alert(1)", + "about:blank", + "file://example.com:443/etc/passwd", + "unknown-scheme:foo", + "unknown-scheme://bar", + "http", + "http:", + "http:/", + "http://", + "http://:", + "http://:1", + "yay", + "http::///invalid.example.com/", + "blob:null/foo", // blob:null (actually a valid URL) + "blob:data:foo", // blob + data (which is nonstandard) + "blob:about://blank/", // blob + about (which is nonstandard) + "blob:about:blank/", // blob + about (which is nonstandard) + "filesystem:http://example.com/", // Invalid (missing /type/) + "filesystem:local-but-nonstandard:baz./type/", // fs requires standard + "filesystem:local-but-nonstandard://hostname/type/", + "filesystem:unknown-scheme://hostname/type/", + "local-but-nonstandar:foo", // Prefix of registered scheme. + "but-nonstandard:foo", // Suffix of registered scheme. + "local-and-standard:", // Standard scheme needs a hostname. 
+ "standard-but-noaccess:", // Standard scheme needs a hostname. + "blob:blob:http://www.example.com/guid-goes-here", // Double blob. + }; + + for (auto* test_url : urls) { + SCOPED_TRACE(test_url); + GURL url(test_url); + const url::Origin opaque_origin; + + // Opaque origins returned by Origin::Create(). + { + Origin origin = Origin::Create(url); + EXPECT_EQ("", origin.scheme()); + EXPECT_EQ("", origin.host()); + EXPECT_EQ(0, origin.port()); + EXPECT_TRUE(origin.opaque()); + // An origin is always same-origin with itself. + EXPECT_EQ(origin, origin); + EXPECT_NE(origin, url::Origin()); + EXPECT_EQ(SchemeHostPort(), origin.GetTupleOrPrecursorTupleIfOpaque()); + // A copy of |origin| should be same-origin as well. + Origin origin_copy = origin; + EXPECT_EQ("", origin_copy.scheme()); + EXPECT_EQ("", origin_copy.host()); + EXPECT_EQ(0, origin_copy.port()); + EXPECT_TRUE(origin_copy.opaque()); + EXPECT_EQ(origin, origin_copy); + // And it should always be cross-origin to another opaque Origin. + EXPECT_NE(origin, opaque_origin); + // Re-creating from the URL should also be cross-origin. 
+ EXPECT_NE(origin, Origin::Create(url)); + + ExpectParsedUrlsEqual(GURL(origin.Serialize()), origin.GetURL()); + } + } +} + +TEST_F(OriginTest, ConstructFromTuple) { + struct TestCases { + const char* const scheme; + const char* const host; + const uint16_t port; + } cases[] = { + {"http", "example.com", 80}, + {"http", "example.com", 123}, + {"https", "example.com", 443}, + }; + + for (const auto& test_case : cases) { + testing::Message scope_message; + scope_message << test_case.scheme << "://" << test_case.host << ":" + << test_case.port; + SCOPED_TRACE(scope_message); + Origin origin = Origin::CreateFromNormalizedTuple( + test_case.scheme, test_case.host, test_case.port); + + EXPECT_EQ(test_case.scheme, origin.scheme()); + EXPECT_EQ(test_case.host, origin.host()); + EXPECT_EQ(test_case.port, origin.port()); + } +} + +TEST_F(OriginTest, ConstructFromGURL) { + Origin different_origin = + Origin::Create(GURL("https://not-in-the-list.test/")); + + struct TestCases { + const char* const url; + const char* const expected_scheme; + const char* const expected_host; + const uint16_t expected_port; + } cases[] = { + // IP Addresses + {"http://192.168.9.1/", "http", "192.168.9.1", 80}, + {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 80}, + {"http://1/", "http", "0.0.0.1", 80}, + {"http://1:1/", "http", "0.0.0.1", 1}, + {"http://3232237825/", "http", "192.168.9.1", 80}, + + // Punycode + {"http://☃.net/", "http", "xn--n3h.net", 80}, + {"blob:http://☃.net/", "http", "xn--n3h.net", 80}, + + // Generic URLs + {"http://example.com/", "http", "example.com", 80}, + {"http://example.com:123/", "http", "example.com", 123}, + {"https://example.com/", "https", "example.com", 443}, + {"https://example.com:123/", "https", "example.com", 123}, + {"http://user:pass@example.com/", "http", "example.com", 80}, + {"http://example.com:123/?query", "http", "example.com", 123}, + {"https://example.com/#1234", "https", "example.com", 443}, + {"https://u:p@example.com:123/?query#1234", 
"https", "example.com", 123}, + + // Registered URLs + {"ftp://example.com/", "ftp", "example.com", 21}, + {"gopher://example.com/", "gopher", "example.com", 70}, + {"ws://example.com/", "ws", "example.com", 80}, + {"wss://example.com/", "wss", "example.com", 443}, + {"wss://user:pass@example.com/", "wss", "example.com", 443}, + + // Scheme (registered in SetUp()) that's both local and standard. + // TODO: Is it really appropriate to do network-host canonicalization of + // schemes without ports? + {"local-and-standard:20", "local-and-standard", "0.0.0.20", 0}, + {"local-and-standard:20.", "local-and-standard", "0.0.0.20", 0}, + {"local-and-standard:↑↑↓↓←→←→ba.↑↑↓↓←→←→ba.0.bg", "local-and-standard", + "xn--ba-rzuadaibfa.xn--ba-rzuadaibfa.0.bg", 0}, + {"local-and-standard:foo", "local-and-standard", "foo", 0}, + {"local-and-standard://bar:20", "local-and-standard", "bar", 0}, + {"local-and-standard:baz.", "local-and-standard", "baz.", 0}, + {"local-and-standard:baz..", "local-and-standard", "baz..", 0}, + {"local-and-standard:baz..bar", "local-and-standard", "baz..bar", 0}, + {"local-and-standard:baz...", "local-and-standard", "baz...", 0}, + + // Scheme (registered in SetUp()) that's local but nonstandard. These + // always have empty hostnames, but are allowed to be url::Origins. + {"local-but-nonstandard:", "local-but-nonstandard", "", 0}, + {"local-but-nonstandard:foo", "local-but-nonstandard", "", 0}, + {"local-but-nonstandard://bar", "local-but-nonstandard", "", 0}, + {"also-local-but-nonstandard://bar", "also-local-but-nonstandard", "", 0}, + + // Scheme (registered in SetUp()) that's standard but marked as noaccess. + // url::Origin doesn't currently take the noaccess property into account, + // so these aren't expected to result in opaque origins. 
+ {"standard-but-noaccess:foo", "standard-but-noaccess", "foo", 0}, + {"standard-but-noaccess://bar", "standard-but-noaccess", "bar", 0}, + + // file: URLs + {"file:///etc/passwd", "file", "", 0}, + {"file://example.com/etc/passwd", "file", "example.com", 0}, + + // Filesystem: + {"filesystem:http://example.com/type/", "http", "example.com", 80}, + {"filesystem:http://example.com:123/type/", "http", "example.com", 123}, + {"filesystem:https://example.com/type/", "https", "example.com", 443}, + {"filesystem:https://example.com:123/type/", "https", "example.com", 123}, + {"filesystem:local-and-standard:baz./type/", "local-and-standard", "baz.", + 0}, + + // Blob: + {"blob:http://example.com/guid-goes-here", "http", "example.com", 80}, + {"blob:http://example.com:123/guid-goes-here", "http", "example.com", + 123}, + {"blob:https://example.com/guid-goes-here", "https", "example.com", 443}, + {"blob:http://u:p@example.com/guid-goes-here", "http", "example.com", 80}, + + // Gopher: + {"gopher://8u.9.Vx6", "gopher", "8u.9.vx6", 70}, + }; + + for (const auto& test_case : cases) { + SCOPED_TRACE(test_case.url); + GURL url(test_case.url); + EXPECT_TRUE(url.is_valid()); + Origin origin = Origin::Create(url); + EXPECT_EQ(test_case.expected_scheme, origin.scheme()); + EXPECT_EQ(test_case.expected_host, origin.host()); + EXPECT_EQ(test_case.expected_port, origin.port()); + EXPECT_FALSE(origin.opaque()); + EXPECT_EQ(origin, origin); + EXPECT_NE(different_origin, origin); + EXPECT_NE(origin, different_origin); + EXPECT_EQ(origin, Origin::Resolve(GURL("about:blank"), origin)); + EXPECT_EQ(origin, Origin::Resolve(GURL("about:blank?bar#foo"), origin)); + + ExpectParsedUrlsEqual(GURL(origin.Serialize()), origin.GetURL()); + + url::Origin derived_opaque = + Origin::Resolve(GURL("about:blank?bar#foo"), origin) + .DeriveNewOpaqueOrigin(); + EXPECT_TRUE(derived_opaque.opaque()); + EXPECT_NE(origin, derived_opaque); + 
EXPECT_FALSE(derived_opaque.GetTupleOrPrecursorTupleIfOpaque().IsInvalid()); + EXPECT_EQ(origin.GetTupleOrPrecursorTupleIfOpaque(), + derived_opaque.GetTupleOrPrecursorTupleIfOpaque()); + EXPECT_EQ(derived_opaque, derived_opaque); + + url::Origin derived_opaque_via_data_url = + Origin::Resolve(GURL("data:text/html,baz"), origin); + EXPECT_TRUE(derived_opaque_via_data_url.opaque()); + EXPECT_NE(origin, derived_opaque_via_data_url); + EXPECT_FALSE(derived_opaque_via_data_url.GetTupleOrPrecursorTupleIfOpaque() + .IsInvalid()); + EXPECT_EQ(origin.GetTupleOrPrecursorTupleIfOpaque(), + derived_opaque_via_data_url.GetTupleOrPrecursorTupleIfOpaque()); + EXPECT_NE(derived_opaque, derived_opaque_via_data_url); + EXPECT_NE(derived_opaque_via_data_url, derived_opaque); + EXPECT_NE(derived_opaque.DeriveNewOpaqueOrigin(), derived_opaque); + EXPECT_EQ(derived_opaque_via_data_url, derived_opaque_via_data_url); + } +} + +TEST_F(OriginTest, Serialization) { + struct TestCases { + const char* const url; + const char* const expected; + const char* const expected_log; + } cases[] = { + {"http://192.168.9.1/", "http://192.168.9.1"}, + {"http://[2001:db8::1]/", "http://[2001:db8::1]"}, + {"http://☃.net/", "http://xn--n3h.net"}, + {"http://example.com/", "http://example.com"}, + {"http://example.com:123/", "http://example.com:123"}, + {"https://example.com/", "https://example.com"}, + {"https://example.com:123/", "https://example.com:123"}, + {"file:///etc/passwd", "file://", "file:// [internally: file://]"}, + {"file://example.com/etc/passwd", "file://", + "file:// [internally: file://example.com]"}, + {"data:,", "null", "null [internally: (nonce TBD) anonymous]"}, + }; + + for (const auto& test_case : cases) { + SCOPED_TRACE(test_case.url); + GURL url(test_case.url); + EXPECT_TRUE(url.is_valid()); + Origin origin = Origin::Create(url); + std::string serialized = origin.Serialize(); + ExpectParsedUrlsEqual(GURL(serialized), origin.GetURL()); + + EXPECT_EQ(test_case.expected, serialized); 
+ + // The '<<' operator sometimes produces additional information. + std::stringstream out; + out << origin; + if (test_case.expected_log) + EXPECT_EQ(test_case.expected_log, out.str()); + else + EXPECT_EQ(test_case.expected, out.str()); + } +} + +TEST_F(OriginTest, Comparison) { + // These URLs are arranged in increasing order: + const char* const urls[] = { + "data:uniqueness", "http://a:80", "http://b:80", + "https://a:80", "https://b:80", "http://a:81", + "http://b:81", "https://a:81", "https://b:81", + }; + // Validate the comparison logic still works when creating a canonical origin, + // when any created opaque origins contain a nonce. + { + // Pre-create the origins, as the internal nonce for unique origins changes + // with each freshly-constructed Origin (that's not copied). + std::vector<Origin> origins; + for (const auto* test_url : urls) + origins.push_back(Origin::Create(GURL(test_url))); + for (size_t i = 0; i < origins.size(); i++) { + const Origin& current = origins[i]; + for (size_t j = i; j < origins.size(); j++) { + const Origin& to_compare = origins[j]; + EXPECT_EQ(i < j, current < to_compare) << i << " < " << j; + EXPECT_EQ(j < i, to_compare < current) << j << " < " << i; + } + } + } +} + +TEST_F(OriginTest, UnsafelyCreate) { + struct TestCase { + const char* scheme; + const char* host; + uint16_t port; + } cases[] = { + {"http", "example.com", 80}, + {"http", "example.com", 123}, + {"https", "example.com", 443}, + {"https", "example.com", 123}, + {"file", "", 0}, + {"file", "example.com", 0}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() + << test.scheme << "://" << test.host << ":" << test.port); + gurl_base::Optional<url::Origin> origin = + url::Origin::UnsafelyCreateTupleOriginWithoutNormalization( + test.scheme, test.host, test.port); + ASSERT_TRUE(origin); + EXPECT_EQ(test.scheme, origin->scheme()); + EXPECT_EQ(test.host, origin->host()); + EXPECT_EQ(test.port, origin->port()); + 
EXPECT_FALSE(origin->opaque()); + EXPECT_TRUE(origin->IsSameOriginWith(*origin)); + + ExpectParsedUrlsEqual(GURL(origin->Serialize()), origin->GetURL()); + + gurl_base::UnguessableToken nonce = gurl_base::UnguessableToken::Create(); + gurl_base::Optional<url::Origin> opaque_origin = + UnsafelyCreateOpaqueOriginWithoutNormalization( + test.scheme, test.host, test.port, CreateNonce(nonce)); + ASSERT_TRUE(opaque_origin); + EXPECT_TRUE(opaque_origin->opaque()); + EXPECT_FALSE(*opaque_origin == origin); + EXPECT_EQ(opaque_origin->GetTupleOrPrecursorTupleIfOpaque(), + origin->GetTupleOrPrecursorTupleIfOpaque()); + EXPECT_EQ(opaque_origin, + UnsafelyCreateOpaqueOriginWithoutNormalization( + test.scheme, test.host, test.port, CreateNonce(nonce))); + EXPECT_FALSE(*opaque_origin == origin->DeriveNewOpaqueOrigin()); + } +} + +TEST_F(OriginTest, UnsafelyCreateUniqueOnInvalidInput) { + url::AddStandardScheme("host-only", url::SCHEME_WITH_HOST); + url::AddStandardScheme("host-port-only", url::SCHEME_WITH_HOST_AND_PORT); + struct TestCases { + const char* scheme; + const char* host; + uint16_t port = 80; + } cases[] = {{"", "", 33}, + {"data", "", 0}, + {"blob", "", 0}, + {"filesystem", "", 0}, + {"data", "example.com"}, + {"http", "☃.net"}, + {"http\nmore", "example.com"}, + {"http\rmore", "example.com"}, + {"http\n", "example.com"}, + {"http\r", "example.com"}, + {"http", "example.com\nnot-example.com"}, + {"http", "example.com\rnot-example.com"}, + {"http", "example.com\n"}, + {"http", "example.com\r"}, + {"http", "example.com", 0}, + {"unknown-scheme", "example.com"}, + {"host-only", "\r", 0}, + {"host-only", "example.com", 22}, + {"host-port-only", "example.com", 0}, + {"file", ""}}; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() + << test.scheme << "://" << test.host << ":" << test.port); + EXPECT_FALSE(UnsafelyCreateOpaqueOriginWithoutNormalization( + test.scheme, test.host, test.port, CreateNonce())); + 
EXPECT_FALSE(url::Origin::UnsafelyCreateTupleOriginWithoutNormalization( + test.scheme, test.host, test.port)); + } + + // An empty scheme/host/port tuple is not a valid tuple origin. + EXPECT_FALSE( + url::Origin::UnsafelyCreateTupleOriginWithoutNormalization("", "", 0)); + + // Opaque origins with unknown precursors are allowed. + gurl_base::UnguessableToken token = gurl_base::UnguessableToken::Create(); + gurl_base::Optional<url::Origin> anonymous_opaque = + UnsafelyCreateOpaqueOriginWithoutNormalization("", "", 0, + CreateNonce(token)); + ASSERT_TRUE(anonymous_opaque) + << "An invalid tuple is a valid input to " + << "UnsafelyCreateOpaqueOriginWithoutNormalization, so long as it is " + << "the canonical form of the invalid tuple."; + EXPECT_TRUE(anonymous_opaque->opaque()); + EXPECT_EQ(GetNonce(anonymous_opaque.value()), token); + EXPECT_EQ(anonymous_opaque->GetTupleOrPrecursorTupleIfOpaque(), + url::SchemeHostPort()); +} + +TEST_F(OriginTest, UnsafelyCreateUniqueViaEmbeddedNulls) { + struct TestCases { + gurl_base::StringPiece scheme; + gurl_base::StringPiece host; + uint16_t port = 80; + } cases[] = {{{"http\0more", 9}, {"example.com", 11}}, + {{"http\0", 5}, {"example.com", 11}}, + {{"\0http", 5}, {"example.com", 11}}, + {{"http"}, {"example.com\0not-example.com", 27}}, + {{"http"}, {"example.com\0", 12}}, + {{"http"}, {"\0example.com", 12}}, + {{""}, {"\0", 1}, 0}, + {{"\0", 1}, {""}, 0}}; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() + << test.scheme << "://" << test.host << ":" << test.port); + EXPECT_FALSE(url::Origin::UnsafelyCreateTupleOriginWithoutNormalization( + test.scheme, test.host, test.port)); + EXPECT_FALSE(UnsafelyCreateOpaqueOriginWithoutNormalization( + test.scheme, test.host, test.port, CreateNonce())); + } +} + +TEST_F(OriginTest, DomainIs) { + const struct { + const char* url; + const char* lower_ascii_domain; + bool expected_domain_is; + } kTestCases[] = { + {"http://google.com/foo", "google.com", true}, + 
{"http://www.google.com:99/foo", "google.com", true}, + {"http://www.google.com.cn/foo", "google.com", false}, + {"http://www.google.comm", "google.com", false}, + {"http://www.iamnotgoogle.com/foo", "google.com", false}, + {"http://www.google.com/foo", "Google.com", false}, + + // If the host ends with a dot, it matches domains with or without a dot. + {"http://www.google.com./foo", "google.com", true}, + {"http://www.google.com./foo", "google.com.", true}, + {"http://www.google.com./foo", ".com", true}, + {"http://www.google.com./foo", ".com.", true}, + + // But, if the host doesn't end with a dot and the input domain does, then + // it's considered to not match. + {"http://google.com/foo", "google.com.", false}, + + // If the host ends with two dots, it doesn't match. + {"http://www.google.com../foo", "google.com", false}, + + // Filesystem scheme. + {"filesystem:http://www.google.com:99/foo/", "google.com", true}, + {"filesystem:http://www.iamnotgoogle.com/foo/", "google.com", false}, + + // File scheme. + {"file:///home/user/text.txt", "", false}, + {"file:///home/user/text.txt", "txt", false}, + }; + + for (const auto& test_case : kTestCases) { + SCOPED_TRACE(testing::Message() << "(url, domain): (" << test_case.url + << ", " << test_case.lower_ascii_domain + << ")"); + GURL url(test_case.url); + ASSERT_TRUE(url.is_valid()); + Origin origin = Origin::Create(url); + + EXPECT_EQ(test_case.expected_domain_is, + origin.DomainIs(test_case.lower_ascii_domain)); + EXPECT_FALSE( + origin.DeriveNewOpaqueOrigin().DomainIs(test_case.lower_ascii_domain)); + } + + // If the URL is invalid, DomainIs returns false. + GURL invalid_url("google.com"); + ASSERT_FALSE(invalid_url.is_valid()); + EXPECT_FALSE(Origin::Create(invalid_url).DomainIs("google.com")); + + // Unique origins. 
+ EXPECT_FALSE(Origin().DomainIs("")); + EXPECT_FALSE(Origin().DomainIs("com")); +} + +TEST_F(OriginTest, DebugAlias) { + Origin origin1 = Origin::Create(GURL("https://foo.com/bar")); + DEBUG_ALIAS_FOR_ORIGIN(origin1_debug_alias, origin1); + EXPECT_STREQ("https://foo.com", origin1_debug_alias); +} + +TEST_F(OriginTest, NonStandardScheme) { + Origin origin = Origin::Create(GURL("cow://")); + EXPECT_TRUE(origin.opaque()); +} +TEST_F(OriginTest, NonStandardSchemeWithAndroidWebViewHack) { + EnableNonStandardSchemesForAndroidWebView(); + Origin origin = Origin::Create(GURL("cow://")); + EXPECT_FALSE(origin.opaque()); + EXPECT_EQ("cow", origin.scheme()); + EXPECT_EQ("", origin.host()); + EXPECT_EQ(0, origin.port()); + ResetForTests(); +} + +TEST_F(OriginTest, CanBeDerivedFrom) { + Origin opaque_unique_origin = Origin(); + + Origin regular_origin = Origin::Create(GURL("https://a.com/")); + Origin opaque_precursor_origin = regular_origin.DeriveNewOpaqueOrigin(); + + Origin file_origin = Origin::Create(GURL("file:///foo/bar")); + Origin file_opaque_precursor_origin = file_origin.DeriveNewOpaqueOrigin(); + Origin file_host_origin = Origin::Create(GURL("file://a.com/foo/bar")); + Origin file_host_opaque_precursor_origin = + file_host_origin.DeriveNewOpaqueOrigin(); + + Origin non_standard_scheme_origin = + Origin::Create(GURL("non-standard-scheme:foo")); + Origin non_standard_opaque_precursor_origin = + non_standard_scheme_origin.DeriveNewOpaqueOrigin(); + + // Also, add new standard scheme that is local to the test. + AddStandardScheme("new-standard", SchemeType::SCHEME_WITH_HOST); + Origin new_standard_origin = Origin::Create(GURL("new-standard://host/")); + Origin new_standard_opaque_precursor_origin = + new_standard_origin.DeriveNewOpaqueOrigin(); + + // No access schemes always get unique opaque origins. 
+ Origin no_access_origin = + Origin::Create(GURL("standard-but-noaccess://b.com")); + Origin no_access_opaque_precursor_origin = + no_access_origin.DeriveNewOpaqueOrigin(); + + Origin local_non_standard_origin = + Origin::Create(GURL("local-but-nonstandard://a.com")); + Origin local_non_standard_opaque_precursor_origin = + local_non_standard_origin.DeriveNewOpaqueOrigin(); + + // Call origin.CanBeDerivedFrom(url) for each of the following test cases + // and ensure that it returns |expected_value| + const struct { + const char* url; + Origin* origin; + bool expected_value; + } kTestCases[] = { + {"https://a.com", ®ular_origin, true}, + // Web URL can commit in an opaque origin with precursor information. + // Example: iframe sandbox navigated to a.com. + {"https://a.com", &opaque_precursor_origin, true}, + // URL that comes from the web can never commit in an opaque unique + // origin. It must have precursor information. + {"https://a.com", &opaque_unique_origin, false}, + + // Cross-origin URLs should never work. + {"https://b.com", ®ular_origin, false}, + {"https://b.com", &opaque_precursor_origin, false}, + + // data: URL can never commit in a regular, non-opaque origin. + {"data:text/html,foo", ®ular_origin, false}, + // This is the default case: data: URLs commit in opaque origin carrying + // precursor information for the origin that created them. + {"data:text/html,foo", &opaque_precursor_origin, true}, + // Browser-initiated navigations can result in data: URL committing in + // opaque unique origin. + {"data:text/html,foo", &opaque_unique_origin, true}, + + // about:blank can commit in regular origin (default case for iframes). + {"about:blank", ®ular_origin, true}, + // This can happen if data: URL that originated at a.com creates an + // about:blank iframe. + {"about:blank", &opaque_precursor_origin, true}, + // Browser-initiated navigations can result in about:blank URL committing + // in opaque unique origin. 
+ {"about:blank", &opaque_unique_origin, true}, + + // Default behavior of srcdoc is to inherit the origin of the parent + // document. + {"about:srcdoc", ®ular_origin, true}, + // This happens for sandboxed srcdoc iframe. + {"about:srcdoc", &opaque_precursor_origin, true}, + // This can happen with browser-initiated navigation to about:blank or + // data: URL, which in turn add srcdoc iframe. + {"about:srcdoc", &opaque_unique_origin, true}, + + // Just like srcdoc, blob: URLs can be created in all the cases. + {"blob:https://a.com/foo", ®ular_origin, true}, + {"blob:https://a.com/foo", &opaque_precursor_origin, true}, + {"blob:https://a.com/foo", &opaque_unique_origin, true}, + + {"filesystem:https://a.com/foo", ®ular_origin, true}, + {"filesystem:https://a.com/foo", &opaque_precursor_origin, true}, + // Unlike blob: URLs, filesystem: ones cannot be created in an unique + // opaque origin. + {"filesystem:https://a.com/foo", &opaque_unique_origin, false}, + + // file: URLs cannot result in regular web origins, regardless of + // opaqueness. + {"file:///etc/passwd", ®ular_origin, false}, + {"file:///etc/passwd", &opaque_precursor_origin, false}, + // However, they can result in regular file: origin and an opaque one + // containing another file: origin as precursor. + {"file:///etc/passwd", &file_origin, true}, + {"file:///etc/passwd", &file_opaque_precursor_origin, true}, + // It should not be possible to get an opaque unique origin for file: + // as it is a standard scheme and will always result in a tuple origin + // or will always be derived by other origin. + // Note: file:// URLs should become unique opaque origins at some point. + {"file:///etc/passwd", &opaque_unique_origin, false}, + + // The same set as above, but including a host. 
+ {"file://a.com/etc/passwd", ®ular_origin, false}, + {"file://a.com/etc/passwd", &opaque_precursor_origin, false}, + {"file://a.com/etc/passwd", &file_host_origin, true}, + {"file://a.com/etc/passwd", &file_host_opaque_precursor_origin, true}, + {"file://a.com/etc/passwd", &opaque_unique_origin, false}, + + // Locally registered standard scheme should behave the same way + // as built-in standard schemes. + {"new-standard://host/foo", &new_standard_origin, true}, + {"new-standard://host/foo", &new_standard_opaque_precursor_origin, true}, + {"new-standard://host/foo", &opaque_unique_origin, false}, + {"new-standard://host2/foo", &new_standard_origin, false}, + {"new-standard://host2/foo", &new_standard_opaque_precursor_origin, + false}, + + // A non-standard scheme should never commit in an standard origin or + // opaque origin with standard precursor information. + {"non-standard-scheme://a.com/foo", ®ular_origin, false}, + {"non-standard-scheme://a.com/foo", &opaque_precursor_origin, false}, + // However, it should be fine to commit in unique opaque origins or in its + // own origin. + // Note: since non-standard scheme URLs don't parse out anything + // but the scheme, using a random different hostname here would work. + {"non-standard-scheme://b.com/foo2", &opaque_unique_origin, true}, + {"non-standard-scheme://b.com/foo3", &non_standard_scheme_origin, true}, + {"non-standard-scheme://b.com/foo4", + &non_standard_opaque_precursor_origin, true}, + + // No access scheme can only commit in opaque origin. 
+ {"standard-but-noaccess://a.com/foo", ®ular_origin, false}, + {"standard-but-noaccess://a.com/foo", &opaque_precursor_origin, false}, + {"standard-but-noaccess://a.com/foo", &opaque_unique_origin, true}, + {"standard-but-noaccess://a.com/foo", &no_access_origin, false}, + {"standard-but-noaccess://a.com/foo", &no_access_opaque_precursor_origin, + false}, + {"standard-but-noaccess://b.com/foo", &no_access_origin, false}, + {"standard-but-noaccess://b.com/foo", &no_access_opaque_precursor_origin, + true}, + + // Local schemes can be non-standard, verify they also work as expected. + {"local-but-nonstandard://a.com", ®ular_origin, false}, + {"local-but-nonstandard://a.com", &opaque_precursor_origin, false}, + {"local-but-nonstandard://a.com", &opaque_unique_origin, true}, + {"local-but-nonstandard://a.com", &local_non_standard_origin, true}, + {"local-but-nonstandard://a.com", + &local_non_standard_opaque_precursor_origin, true}, + }; + + for (const auto& test_case : kTestCases) { + SCOPED_TRACE(testing::Message() << "(origin, url): (" << *test_case.origin + << ", " << test_case.url << ")"); + EXPECT_EQ(test_case.expected_value, + test_case.origin->CanBeDerivedFrom(GURL(test_case.url))); + } +} + +TEST_F(OriginTest, GetDebugString) { + Origin http_origin = Origin::Create(GURL("http://192.168.9.1")); + EXPECT_STREQ(http_origin.GetDebugString().c_str(), "http://192.168.9.1"); + + Origin http_opaque_origin = http_origin.DeriveNewOpaqueOrigin(); + EXPECT_THAT( + http_opaque_origin.GetDebugString().c_str(), + ::testing::MatchesRegex( + "null \\[internally: \\(\\w*\\) derived from http://192.168.9.1\\]")); + + Origin data_origin = Origin::Create(GURL("data:")); + EXPECT_STREQ(data_origin.GetDebugString().c_str(), + "null [internally: (nonce TBD) anonymous]"); + + // The nonce of the origin will be initialized if a new opaque origin is + // derived. 
+ Origin data_derived_origin = data_origin.DeriveNewOpaqueOrigin(); + EXPECT_THAT( + data_derived_origin.GetDebugString().c_str(), + ::testing::MatchesRegex("null \\[internally: \\(\\w*\\) anonymous\\]")); + + Origin file_origin = Origin::Create(GURL("file:///etc/passwd")); + EXPECT_STREQ(file_origin.GetDebugString().c_str(), + "file:// [internally: file://]"); + + Origin file_server_origin = + Origin::Create(GURL("file://example.com/etc/passwd")); + EXPECT_STREQ(file_server_origin.GetDebugString().c_str(), + "file:// [internally: file://example.com]"); +} + +} // namespace url
diff --git a/url/run_all_perftests.cc b/url/run_all_perftests.cc new file mode 100644 index 0000000..be7a746 --- /dev/null +++ b/url/run_all_perftests.cc
@@ -0,0 +1,14 @@ +// Copyright 2019 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/bind.h" +#include "base/test/launcher/unit_test_launcher.h" +#include "base/test/perf_test_suite.h" + +int main(int argc, char** argv) { + gurl_base::PerfTestSuite test_suite(argc, argv); + return gurl_base::LaunchUnitTestsSerially( + argc, argv, + gurl_base::BindOnce(&gurl_base::TestSuite::Run, gurl_base::Unretained(&test_suite))); +}
diff --git a/url/run_all_unittests.cc b/url/run_all_unittests.cc new file mode 100644 index 0000000..0f6a431 --- /dev/null +++ b/url/run_all_unittests.cc
@@ -0,0 +1,27 @@ +// Copyright 2016 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <memory> + +#include "base/bind.h" +#include "base/test/launcher/unit_test_launcher.h" +#include "base/test/test_io_thread.h" +#include "base/test/test_suite.h" +#include "build/build_config.h" + +#if !defined(OS_IOS) +#include "mojo/core/embedder/embedder.h" // nogncheck +#endif + +int main(int argc, char** argv) { + gurl_base::TestSuite test_suite(argc, argv); + +#if !defined(OS_IOS) + mojo::core::Init(); +#endif + + return gurl_base::LaunchUnitTests( + argc, argv, + gurl_base::BindOnce(&gurl_base::TestSuite::Run, gurl_base::Unretained(&test_suite))); +}
diff --git a/url/scheme_host_port.cc b/url/scheme_host_port.cc new file mode 100644 index 0000000..290e8a6 --- /dev/null +++ b/url/scheme_host_port.cc
@@ -0,0 +1,268 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/scheme_host_port.h" + +#include <stdint.h> +#include <string.h> + +#include <tuple> + +#include "polyfills/base/logging.h" +#include "base/numerics/safe_conversions.h" +#include "base/stl_util.h" +#include "base/strings/string_number_conversions.h" +#include "url/gurl.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" +#include "url/url_constants.h" +#include "url/url_util.h" + +namespace url { + +namespace { + +bool IsCanonicalHost(const gurl_base::StringPiece& host) { + std::string canon_host; + + // Try to canonicalize the host (copy/pasted from net/base. :( ). + const Component raw_host_component(0, + gurl_base::checked_cast<int>(host.length())); + StdStringCanonOutput canon_host_output(&canon_host); + CanonHostInfo host_info; + CanonicalizeHostVerbose(host.data(), raw_host_component, + &canon_host_output, &host_info); + + if (host_info.out_host.is_nonempty() && + host_info.family != CanonHostInfo::BROKEN) { + // Success! Assert that there's no extra garbage. + canon_host_output.Complete(); + GURL_DCHECK_EQ(host_info.out_host.len, static_cast<int>(canon_host.length())); + } else { + // Empty host, or canonicalization failed. + canon_host.clear(); + } + + return host == canon_host; +} + +bool IsValidInput(const gurl_base::StringPiece& scheme, + const gurl_base::StringPiece& host, + uint16_t port, + SchemeHostPort::ConstructPolicy policy) { + // Empty schemes are never valid. 
+ if (scheme.empty()) + return false; + + SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + bool is_standard = GetStandardSchemeType( + scheme.data(), + Component(0, gurl_base::checked_cast<int>(scheme.length())), + &scheme_type); + if (!is_standard) { + // To be consistent with blink, local non-standard schemes are currently + // allowed to be tuple origins. Nonstandard schemes don't have hostnames, + // so their tuple is just ("protocol", "", 0). + // + // TODO: Migrate "content:" and "externalfile:" to be standard schemes, and + // remove this local scheme exception. + if (gurl_base::Contains(GetLocalSchemes(), scheme) && host.empty() && port == 0) + return true; + + // Otherwise, allow non-standard schemes only if the Android WebView + // workaround is enabled. + return AllowNonStandardSchemesForAndroidWebView(); + } + + switch (scheme_type) { + case SCHEME_WITH_HOST_AND_PORT: + case SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION: + // A URL with |scheme| is required to have the host and port (may be + // omitted in a serialization if it's the same as the default value). + // Return an invalid instance if either of them is not given. + if (host.empty() || port == 0) + return false; + + // Don't do an expensive canonicalization if the host is already + // canonicalized. + GURL_DCHECK(policy == SchemeHostPort::CHECK_CANONICALIZATION || + IsCanonicalHost(host)); + if (policy == SchemeHostPort::CHECK_CANONICALIZATION && + !IsCanonicalHost(host)) { + return false; + } + + return true; + + case SCHEME_WITH_HOST: + if (port != 0) { + // Return an invalid object if a URL with the scheme never represents + // the port data but the given |port| is non-zero. + return false; + } + + // Don't do an expensive canonicalization if the host is already + // canonicalized. 
+ GURL_DCHECK(policy == SchemeHostPort::CHECK_CANONICALIZATION || + IsCanonicalHost(host)); + if (policy == SchemeHostPort::CHECK_CANONICALIZATION && + !IsCanonicalHost(host)) { + return false; + } + + return true; + + case SCHEME_WITHOUT_AUTHORITY: + return false; + + default: + GURL_NOTREACHED(); + return false; + } +} + +} // namespace + +SchemeHostPort::SchemeHostPort() : port_(0) { +} + +SchemeHostPort::SchemeHostPort(std::string scheme, + std::string host, + uint16_t port, + ConstructPolicy policy) + : port_(0) { + if (!IsValidInput(scheme, host, port, policy)) { + GURL_DCHECK(IsInvalid()); + return; + } + + scheme_ = std::move(scheme); + host_ = std::move(host); + port_ = port; + GURL_DCHECK(!IsInvalid()) << "Scheme: " << scheme_ << " Host: " << host_ + << " Port: " << port; +} + +SchemeHostPort::SchemeHostPort(gurl_base::StringPiece scheme, + gurl_base::StringPiece host, + uint16_t port) + : SchemeHostPort(scheme.as_string(), + host.as_string(), + port, + ConstructPolicy::CHECK_CANONICALIZATION) {} + +SchemeHostPort::SchemeHostPort(const GURL& url) : port_(0) { + if (!url.is_valid()) + return; + + gurl_base::StringPiece scheme = url.scheme_piece(); + gurl_base::StringPiece host = url.host_piece(); + + // A valid GURL never returns PORT_INVALID. + int port = url.EffectiveIntPort(); + if (port == PORT_UNSPECIFIED) { + port = 0; + } else { + GURL_DCHECK_GE(port, 0); + GURL_DCHECK_LE(port, 65535); + } + + if (!IsValidInput(scheme, host, port, ALREADY_CANONICALIZED)) + return; + + scheme.CopyToString(&scheme_); + host.CopyToString(&host_); + port_ = port; +} + +SchemeHostPort::~SchemeHostPort() = default; + +bool SchemeHostPort::IsInvalid() const { + // It suffices to just check |scheme_| for emptiness; the other fields are + // never present without it. 
+ GURL_DCHECK(!scheme_.empty() || host_.empty()); + GURL_DCHECK(!scheme_.empty() || port_ == 0); + return scheme_.empty(); +} + +std::string SchemeHostPort::Serialize() const { + // Null checking for |parsed| in SerializeInternal is probably slower than + // just filling it in and discarding it here. + url::Parsed parsed; + return SerializeInternal(&parsed); +} + +GURL SchemeHostPort::GetURL() const { + url::Parsed parsed; + std::string serialized = SerializeInternal(&parsed); + + if (IsInvalid()) + return GURL(std::move(serialized), parsed, false); + + // SchemeHostPort does not have enough information to determine if an empty + // host is valid or not for the given scheme. Force re-parsing. + GURL_DCHECK(!scheme_.empty()); + if (host_.empty()) + return GURL(serialized); + + // If the serialized string is passed to GURL for parsing, it will append an + // empty path "/". Add that here. Note: per RFC 6454 we cannot do this for + // normal Origin serialization. + GURL_DCHECK(!parsed.path.is_valid()); + parsed.path = Component(serialized.length(), 1); + serialized.append("/"); + return GURL(std::move(serialized), parsed, true); +} + +bool SchemeHostPort::operator<(const SchemeHostPort& other) const { + return std::tie(port_, scheme_, host_) < + std::tie(other.port_, other.scheme_, other.host_); +} + +std::string SchemeHostPort::SerializeInternal(url::Parsed* parsed) const { + std::string result; + if (IsInvalid()) + return result; + + // Reserve enough space for the "normal" case of scheme://host/. + result.reserve(scheme_.size() + host_.size() + 4); + + if (!scheme_.empty()) { + parsed->scheme = Component(0, scheme_.length()); + result.append(scheme_); + } + + result.append(kStandardSchemeSeparator); + + if (!host_.empty()) { + parsed->host = Component(result.length(), host_.length()); + result.append(host_); + } + + if (port_ == 0) + return result; + + // Omit the port component if the port matches with the default port + // defined for the scheme, if any. 
+ int default_port = DefaultPortForScheme(scheme_.data(), + static_cast<int>(scheme_.length())); + if (default_port == PORT_UNSPECIFIED) + return result; + if (port_ != default_port) { + result.push_back(':'); + std::string port(gurl_base::NumberToString(port_)); + parsed->port = Component(result.length(), port.length()); + result.append(std::move(port)); + } + + return result; +} + +std::ostream& operator<<(std::ostream& out, + const SchemeHostPort& scheme_host_port) { + return out << scheme_host_port.Serialize(); +} + +} // namespace url
diff --git a/url/scheme_host_port.h b/url/scheme_host_port.h new file mode 100644 index 0000000..a2dded1 --- /dev/null +++ b/url/scheme_host_port.h
@@ -0,0 +1,170 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_SCHEME_HOST_PORT_H_ +#define URL_SCHEME_HOST_PORT_H_ + +#include <stdint.h> + +#include <string> + +#include "polyfills/base/component_export.h" +#include "base/strings/string_piece.h" + +class GURL; + +namespace url { + +struct Parsed; + +// This class represents a (scheme, host, port) tuple extracted from a URL. +// +// The primary purpose of this class is to represent relevant network-authority +// information for a URL. It is _not_ an Origin, as described in RFC 6454. In +// particular, it is generally NOT the right thing to use for security +// decisions. +// +// Instead, this class is a mechanism for simplifying URLs with standard schemes +// (that is, those which follow the generic syntax of RFC 3986) down to the +// uniquely identifying information necessary for network fetches. This makes it +// suitable as a cache key for a collection of active connections, for instance. +// It may, however, be inappropriate to use as a cache key for persistent +// storage associated with a host. +// +// In particular, note that: +// +// * SchemeHostPort can only represent schemes which follow the RFC 3986 syntax +// (e.g. those registered with GURL as "standard schemes"). Non-standard +// schemes such as "blob", "filesystem", "data", and "javascript" can only be +// represented as invalid SchemeHostPort objects. +// +// * For example, the "file" scheme follows the standard syntax, but it is +// important to note that the authority portion (host, port) is optional. +// URLs without an authority portion will be represented with an empty string +// for the host, and a port of 0 (e.g. "file:///etc/hosts" => +// ("file", "", 0)), and URLs with a host-only authority portion will be +// represented with a port of 0 (e.g. 
"file://example.com/etc/hosts" => +// ("file", "example.com", 0)). See Section 3 of RFC 3986 to better understand +// these constructs. +// +// * SchemeHostPort has no notion of the Origin concept (RFC 6454), and in +// particular, it has no notion of an opaque Origin. If you need to take +// opaque origins into account (and, if you're making security-relevant +// decisions then you absolutely do), please use 'url::Origin' instead. +// +// Usage: +// +// * SchemeHostPort objects are commonly created from GURL objects: +// +// GURL url("https://example.com/"); +// url::SchemeHostPort tuple(url); +// tuple.scheme(); // "https" +// tuple.host(); // "example.com" +// tuple.port(); // 443 +// +// * Objects may also be explicitly created and compared: +// +// url::SchemeHostPort tuple(url::kHttpsScheme, "example.com", 443); +// tuple.scheme(); // "https" +// tuple.host(); // "example.com" +// tuple.port(); // 443 +// +// GURL url("https://example.com/"); +// tuple == url::SchemeHostPort(url); // true +class COMPONENT_EXPORT(URL) SchemeHostPort { + public: + // Creates an invalid (scheme, host, port) tuple, which represents an invalid + // or non-standard URL. + SchemeHostPort(); + + // Creates a (scheme, host, port) tuple. |host| must be a canonicalized + // A-label (that is, '☃.net' must be provided as 'xn--n3h.net'). |scheme| + // must be a standard scheme. |port| must not be 0, unless |scheme| does not + // support ports (e.g. 'file'). In that case, |port| must be 0. + // + // Copies the data in |scheme| and |host|. + SchemeHostPort(gurl_base::StringPiece scheme, + gurl_base::StringPiece host, + uint16_t port); + + // Metadata influencing whether or not the constructor should sanity check + // host canonicalization. + enum ConstructPolicy { CHECK_CANONICALIZATION, ALREADY_CANONICALIZED }; + + // Creates a (scheme, host, port) tuple without performing sanity checking + // that the host and port are canonicalized. 
This should only be used when + // converting between already normalized types, and should NOT be used for + // IPC. + SchemeHostPort(std::string scheme, + std::string host, + uint16_t port, + ConstructPolicy policy); + + // Creates a (scheme, host, port) tuple from |url|, as described at + // https://tools.ietf.org/html/rfc6454#section-4 + // + // If |url| is invalid or non-standard, the result will be an invalid + // SchemeHostPort object. + explicit SchemeHostPort(const GURL& url); + + // Copyable and movable. + SchemeHostPort(const SchemeHostPort&) = default; + SchemeHostPort& operator=(const SchemeHostPort&) = default; + SchemeHostPort(SchemeHostPort&&) = default; + SchemeHostPort& operator=(SchemeHostPort&&) = default; + + ~SchemeHostPort(); + + // Returns the host component, in URL form. That is, all IDN domain names will + // be expressed as A-Labels ('☃.net' will be returned as 'xn--n3h.net'), and + // all IPv6 addresses will be enclosed in brackets ("[2001:db8::1]"). + const std::string& host() const { return host_; } + const std::string& scheme() const { return scheme_; } + uint16_t port() const { return port_; } + bool IsInvalid() const; + + // Serializes the SchemeHostPort tuple to a canonical form. + // + // While this string form resembles the Origin serialization specified in + // Section 6.2 of RFC 6454, it is important to note that invalid + // SchemeHostPort tuples serialize to the empty string, rather than being + // serialized as would an opaque Origin. + std::string Serialize() const; + + // Efficiently returns what GURL(Serialize()) would return, without needing to + // re-parse the URL. + GURL GetURL() const; + + // Two SchemeHostPort objects are "equal" iff their schemes, hosts, and ports + // are exact matches. + // + // Note that this comparison is _not_ the same as an origin-based comparison. + // In particular, invalid SchemeHostPort objects match each other (and + // themselves). Opaque origins, on the other hand, would not.
+ bool operator==(const SchemeHostPort& other) const { + return port_ == other.port() && scheme_ == other.scheme() && + host_ == other.host(); + } + bool operator!=(const SchemeHostPort& other) const { + return !(*this == other); + } + // Allows SchemeHostPort to be used as a key in STL (for example, a std::set + // or std::map). + bool operator<(const SchemeHostPort& other) const; + + private: + std::string SerializeInternal(url::Parsed* parsed) const; + + std::string scheme_; + std::string host_; + uint16_t port_; +}; + +COMPONENT_EXPORT(URL) +std::ostream& operator<<(std::ostream& out, + const SchemeHostPort& scheme_host_port); + +} // namespace url + +#endif // URL_SCHEME_HOST_PORT_H_
diff --git a/url/scheme_host_port_unittest.cc b/url/scheme_host_port_unittest.cc new file mode 100644 index 0000000..5270c70 --- /dev/null +++ b/url/scheme_host_port_unittest.cc
@@ -0,0 +1,285 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <stddef.h> +#include <stdint.h> + +#include "base/stl_util.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" +#include "url/scheme_host_port.h" +#include "url/url_util.h" + +namespace { + +class SchemeHostPortTest : public testing::Test { + public: + SchemeHostPortTest() = default; + ~SchemeHostPortTest() override { + // Reset any added schemes. + url::ResetForTests(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(SchemeHostPortTest); +}; + +void ExpectParsedUrlsEqual(const GURL& a, const GURL& b) { + EXPECT_EQ(a, b); + const url::Parsed& a_parsed = a.parsed_for_possibly_invalid_spec(); + const url::Parsed& b_parsed = b.parsed_for_possibly_invalid_spec(); + EXPECT_EQ(a_parsed.scheme.begin, b_parsed.scheme.begin); + EXPECT_EQ(a_parsed.scheme.len, b_parsed.scheme.len); + EXPECT_EQ(a_parsed.username.begin, b_parsed.username.begin); + EXPECT_EQ(a_parsed.username.len, b_parsed.username.len); + EXPECT_EQ(a_parsed.password.begin, b_parsed.password.begin); + EXPECT_EQ(a_parsed.password.len, b_parsed.password.len); + EXPECT_EQ(a_parsed.host.begin, b_parsed.host.begin); + EXPECT_EQ(a_parsed.host.len, b_parsed.host.len); + EXPECT_EQ(a_parsed.port.begin, b_parsed.port.begin); + EXPECT_EQ(a_parsed.port.len, b_parsed.port.len); + EXPECT_EQ(a_parsed.path.begin, b_parsed.path.begin); + EXPECT_EQ(a_parsed.path.len, b_parsed.path.len); + EXPECT_EQ(a_parsed.query.begin, b_parsed.query.begin); + EXPECT_EQ(a_parsed.query.len, b_parsed.query.len); + EXPECT_EQ(a_parsed.ref.begin, b_parsed.ref.begin); + EXPECT_EQ(a_parsed.ref.len, b_parsed.ref.len); +} + +TEST_F(SchemeHostPortTest, Invalid) { + url::SchemeHostPort invalid; + EXPECT_EQ("", invalid.scheme()); + EXPECT_EQ("", invalid.host()); + EXPECT_EQ(0, invalid.port()); + EXPECT_TRUE(invalid.IsInvalid()); + 
EXPECT_EQ(invalid, invalid); + + const char* urls[] = { + "data:text/html,Hello!", "javascript:alert(1)", + "file://example.com:443/etc/passwd", + + // These schemes do not follow the generic URL syntax, so make sure we + // treat them as invalid (scheme, host, port) tuples (even though such + // URLs' _Origin_ might have a (scheme, host, port) tuple, they themselves + // do not). This is only *implicitly* checked in the code, by means of + // blob schemes not being standard, and filesystem schemes having type + // SCHEME_WITHOUT_AUTHORITY. If conditions change such that the implicit + // checks no longer hold, this policy should be made explicit. + "blob:https://example.com/uuid-goes-here", + "filesystem:https://example.com/temporary/yay.png"}; + + for (auto* test : urls) { + SCOPED_TRACE(test); + GURL url(test); + url::SchemeHostPort tuple(url); + EXPECT_EQ("", tuple.scheme()); + EXPECT_EQ("", tuple.host()); + EXPECT_EQ(0, tuple.port()); + EXPECT_TRUE(tuple.IsInvalid()); + EXPECT_EQ(tuple, tuple); + EXPECT_EQ(tuple, invalid); + EXPECT_EQ(invalid, tuple); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, ExplicitConstruction) { + struct TestCases { + const char* scheme; + const char* host; + uint16_t port; + } cases[] = { + {"http", "example.com", 80}, + {"http", "example.com", 123}, + {"https", "example.com", 443}, + {"https", "example.com", 123}, + {"file", "", 0}, + {"file", "example.com", 0}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::SchemeHostPort tuple(test.scheme, test.host, test.port); + EXPECT_EQ(test.scheme, tuple.scheme()); + EXPECT_EQ(test.host, tuple.host()); + EXPECT_EQ(test.port, tuple.port()); + EXPECT_FALSE(tuple.IsInvalid()); + EXPECT_EQ(tuple, tuple); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, InvalidConstruction) { + struct TestCases 
{ + const char* scheme; + const char* host; + uint16_t port; + } cases[] = {{"", "", 0}, + {"data", "", 0}, + {"blob", "", 0}, + {"filesystem", "", 0}, + {"http", "", 80}, + {"data", "example.com", 80}, + {"http", "☃.net", 80}, + {"http\nmore", "example.com", 80}, + {"http\rmore", "example.com", 80}, + {"http\n", "example.com", 80}, + {"http\r", "example.com", 80}, + {"http", "example.com\nnot-example.com", 80}, + {"http", "example.com\rnot-example.com", 80}, + {"http", "example.com\n", 80}, + {"http", "example.com\r", 80}, + {"http", "example.com", 0}, + {"file", "", 80}}; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::SchemeHostPort tuple(test.scheme, test.host, test.port); + EXPECT_EQ("", tuple.scheme()); + EXPECT_EQ("", tuple.host()); + EXPECT_EQ(0, tuple.port()); + EXPECT_TRUE(tuple.IsInvalid()); + EXPECT_EQ(tuple, tuple); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, InvalidConstructionWithEmbeddedNulls) { + struct TestCases { + const char* scheme; + size_t scheme_length; + const char* host; + size_t host_length; + uint16_t port; + } cases[] = {{"http\0more", 9, "example.com", 11, 80}, + {"http\0", 5, "example.com", 11, 80}, + {"\0http", 5, "example.com", 11, 80}, + {"http", 4, "example.com\0not-example.com", 27, 80}, + {"http", 4, "example.com\0", 12, 80}, + {"http", 4, "\0example.com", 12, 80}}; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::SchemeHostPort tuple(std::string(test.scheme, test.scheme_length), + std::string(test.host, test.host_length), + test.port); + EXPECT_EQ("", tuple.scheme()); + EXPECT_EQ("", tuple.host()); + EXPECT_EQ(0, tuple.port()); + EXPECT_TRUE(tuple.IsInvalid()); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, GURLConstruction) { + struct 
TestCases { + const char* url; + const char* scheme; + const char* host; + uint16_t port; + } cases[] = { + {"http://192.168.9.1/", "http", "192.168.9.1", 80}, + {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 80}, + {"http://☃.net/", "http", "xn--n3h.net", 80}, + {"http://example.com/", "http", "example.com", 80}, + {"http://example.com:123/", "http", "example.com", 123}, + {"https://example.com/", "https", "example.com", 443}, + {"https://example.com:123/", "https", "example.com", 123}, + {"file:///etc/passwd", "file", "", 0}, + {"file://example.com/etc/passwd", "file", "example.com", 0}, + {"http://u:p@example.com/", "http", "example.com", 80}, + {"http://u:p@example.com/path", "http", "example.com", 80}, + {"http://u:p@example.com/path?123", "http", "example.com", 80}, + {"http://u:p@example.com/path?123#hash", "http", "example.com", 80}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(test.url); + GURL url(test.url); + EXPECT_TRUE(url.is_valid()); + url::SchemeHostPort tuple(url); + EXPECT_EQ(test.scheme, tuple.scheme()); + EXPECT_EQ(test.host, tuple.host()); + EXPECT_EQ(test.port, tuple.port()); + EXPECT_FALSE(tuple.IsInvalid()); + EXPECT_EQ(tuple, tuple); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, Serialization) { + struct TestCases { + const char* url; + const char* expected; + } cases[] = { + {"http://192.168.9.1/", "http://192.168.9.1"}, + {"http://[2001:db8::1]/", "http://[2001:db8::1]"}, + {"http://☃.net/", "http://xn--n3h.net"}, + {"http://example.com/", "http://example.com"}, + {"http://example.com:123/", "http://example.com:123"}, + {"https://example.com/", "https://example.com"}, + {"https://example.com:123/", "https://example.com:123"}, + {"file:///etc/passwd", "file://"}, + {"file://example.com/etc/passwd", "file://example.com"}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(test.url); + GURL url(test.url); + url::SchemeHostPort tuple(url); + 
EXPECT_EQ(test.expected, tuple.Serialize()); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, Comparison) { + // These tuples are arranged in increasing order: + struct SchemeHostPorts { + const char* scheme; + const char* host; + uint16_t port; + } tuples[] = { + {"http", "a", 80}, + {"http", "b", 80}, + {"https", "a", 80}, + {"https", "b", 80}, + {"http", "a", 81}, + {"http", "b", 81}, + {"https", "a", 81}, + {"https", "b", 81}, + }; + + for (size_t i = 0; i < gurl_base::size(tuples); i++) { + url::SchemeHostPort current(tuples[i].scheme, tuples[i].host, + tuples[i].port); + for (size_t j = i; j < gurl_base::size(tuples); j++) { + url::SchemeHostPort to_compare(tuples[j].scheme, tuples[j].host, + tuples[j].port); + EXPECT_EQ(i < j, current < to_compare) << i << " < " << j; + EXPECT_EQ(j < i, to_compare < current) << j << " < " << i; + } + } +} + +// Some schemes have optional authority. Make sure that GURL conversion from +// SchemeHostPort is not opinionated in that regard. For more info, see +// crbug.com/820194, where we considered all SchemeHostPorts with +// SCHEME_WITH_HOST (i.e., without ports) as valid with empty hosts, even though +// most are not (e.g. chrome URLs). +TEST_F(SchemeHostPortTest, EmptyHostGurlConversion) { + url::AddStandardScheme("chrome", url::SCHEME_WITH_HOST); + + GURL chrome_url("chrome:"); + EXPECT_FALSE(chrome_url.is_valid()); + + url::SchemeHostPort chrome_tuple("chrome", "", 0); + EXPECT_FALSE(chrome_tuple.GetURL().is_valid()); + ExpectParsedUrlsEqual(GURL(chrome_tuple.Serialize()), chrome_tuple.GetURL()); + ExpectParsedUrlsEqual(chrome_url, chrome_tuple.GetURL()); +} + +} // namespace
diff --git a/url/third_party/mozilla/LICENSE.txt b/url/third_party/mozilla/LICENSE.txt new file mode 100644 index 0000000..ac40837 --- /dev/null +++ b/url/third_party/mozilla/LICENSE.txt
@@ -0,0 +1,65 @@ +Copyright 2007, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + +The file url_parse.cc is based on nsURLParsers.cc from Mozilla. This file is +licensed separately as follows: + +The contents of this file are subject to the Mozilla Public License Version +1.1 (the "License"); you may not use this file except in compliance with +the License. 
You may obtain a copy of the License at +http://www.mozilla.org/MPL/ + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +for the specific language governing rights and limitations under the +License. + +The Original Code is mozilla.org code. + +The Initial Developer of the Original Code is +Netscape Communications Corporation. +Portions created by the Initial Developer are Copyright (C) 1998 +the Initial Developer. All Rights Reserved. + +Contributor(s): + Darin Fisher (original author) + +Alternatively, the contents of this file may be used under the terms of +either the GNU General Public License Version 2 or later (the "GPL"), or +the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +in which case the provisions of the GPL or the LGPL are applicable instead +of those above. If you wish to allow use of your version of this file only +under the terms of either the GPL or the LGPL, and not to allow others to +use your version of this file under the terms of the MPL, indicate your +decision by deleting the provisions above and replace them with the notice +and other provisions required by the GPL or the LGPL. If you do not delete +the provisions above, a recipient may use your version of this file under +the terms of any one of the MPL, the GPL or the LGPL.
diff --git a/url/third_party/mozilla/OWNERS b/url/third_party/mozilla/OWNERS new file mode 100644 index 0000000..3605f48 --- /dev/null +++ b/url/third_party/mozilla/OWNERS
@@ -0,0 +1 @@ +# COMPONENT: Internals>Core
diff --git a/url/third_party/mozilla/README.chromium b/url/third_party/mozilla/README.chromium new file mode 100644 index 0000000..ef396d3 --- /dev/null +++ b/url/third_party/mozilla/README.chromium
@@ -0,0 +1,8 @@ +Name: url_parse +URL: http://mxr.mozilla.org/comm-central/source/mozilla/netwerk/base/src/nsURLParsers.cpp +License: BSD and MPL 1.1/GPL 2.0/LGPL 2.1 +License File: LICENSE.txt + +Description: + +The file url_parse.cc is based on nsURLParsers.cc from Mozilla.
diff --git a/url/third_party/mozilla/url_parse.cc b/url/third_party/mozilla/url_parse.cc new file mode 100644 index 0000000..8756cf7 --- /dev/null +++ b/url/third_party/mozilla/url_parse.cc
@@ -0,0 +1,945 @@ +/* Based on nsURLParsers.cc from Mozilla + * ------------------------------------- + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Darin Fisher (original author) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#include "url/third_party/mozilla/url_parse.h" + +#include <stdlib.h> + +#include "polyfills/base/logging.h" +#include "url/url_parse_internal.h" +#include "url/url_util.h" +#include "url/url_util_internal.h" + +namespace url { + +namespace { + +// Returns true if the given character is a valid digit to use in a port. +inline bool IsPortDigit(gurl_base::char16 ch) { + return ch >= '0' && ch <= '9'; +} + +// Returns the offset of the next authority terminator in the input starting +// from start_offset. If no terminator is found, the return value will be equal +// to spec_len. +template<typename CHAR> +int FindNextAuthorityTerminator(const CHAR* spec, + int start_offset, + int spec_len) { + for (int i = start_offset; i < spec_len; i++) { + if (IsAuthorityTerminator(spec[i])) + return i; + } + return spec_len; // Not found. +} + +template<typename CHAR> +void ParseUserInfo(const CHAR* spec, + const Component& user, + Component* username, + Component* password) { + // Find the first colon in the user section, which separates the username and + // password. + int colon_offset = 0; + while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') + colon_offset++; + + if (colon_offset < user.len) { + // Found separator: <username>:<password> + *username = Component(user.begin, colon_offset); + *password = MakeRange(user.begin + colon_offset + 1, + user.begin + user.len); + } else { + // No separator, treat everything as the username + *username = user; + *password = Component(); + } +} + +template<typename CHAR> +void ParseServerInfo(const CHAR* spec, + const Component& serverinfo, + Component* hostname, + Component* port_num) { + if (serverinfo.len == 0) { + // No server info, host name is empty. + hostname->reset(); + port_num->reset(); + return; + } + + // If the host starts with a left-bracket, assume the entire host is an + // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. 
+ // This assumption will be overridden if we find a right-bracket. + // + // Our IPv6 address canonicalization code requires both brackets to exist, + // but the ability to locate an incomplete address can still be useful. + int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; + int colon = -1; + + // Find the last right-bracket, and the last colon. + for (int i = serverinfo.begin; i < serverinfo.end(); i++) { + switch (spec[i]) { + case ']': + ipv6_terminator = i; + break; + case ':': + colon = i; + break; + } + } + + if (colon > ipv6_terminator) { + // Found a port number: <hostname>:<port> + *hostname = MakeRange(serverinfo.begin, colon); + if (hostname->len == 0) + hostname->reset(); + *port_num = MakeRange(colon + 1, serverinfo.end()); + } else { + // No port: <hostname> + *hostname = serverinfo; + port_num->reset(); + } +} + +// Given an already-identified auth section, breaks it into its constituent +// parts. The port number will be parsed and the resulting integer will be +// filled into the given *port variable, or -1 if there is no port number or it +// is invalid. +template<typename CHAR> +void DoParseAuthority(const CHAR* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + GURL_DCHECK(auth.is_valid()) << "We should always get an authority"; + if (auth.len == 0) { + username->reset(); + password->reset(); + hostname->reset(); + port_num->reset(); + return; + } + + // Search backwards for @, which is the separator between the user info and + // the server info. + int i = auth.begin + auth.len - 1; + while (i > auth.begin && spec[i] != '@') + i--; + + if (spec[i] == '@') { + // Found user info: <user-info>@<server-info> + ParseUserInfo(spec, Component(auth.begin, i - auth.begin), + username, password); + ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), + hostname, port_num); + } else { + // No user info, everything is server info. 
+ username->reset(); + password->reset(); + ParseServerInfo(spec, auth, hostname, port_num); + } +} + +template <typename CHAR> +inline void FindQueryAndRefParts(const CHAR* spec, + const Component& path, + int* query_separator, + int* ref_separator) { + int path_end = path.begin + path.len; + for (int i = path.begin; i < path_end; i++) { + switch (spec[i]) { + case '?': + // Only match the query string if it precedes the reference fragment + // and when we haven't found one already. + if (*query_separator < 0) + *query_separator = i; + break; + case '#': + // Record the first # sign only. + if (*ref_separator < 0) { + *ref_separator = i; + return; + } + break; + } + } +} + +template<typename CHAR> +void ParsePath(const CHAR* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref> + + // Special case when there is no path. + if (path.len == -1) { + filepath->reset(); + query->reset(); + ref->reset(); + return; + } + GURL_DCHECK(path.len > 0) << "We should never have 0 length paths"; + + // Search for first occurrence of either ? or #. + int query_separator = -1; // Index of the '?' + int ref_separator = -1; // Index of the '#' + FindQueryAndRefParts(spec, path, &query_separator, &ref_separator); + + // Markers pointing to the character after each of these corresponding + // components. The code below works from the end back to the beginning, + // and will update these indices as it finds components that exist. + int file_end, query_end; + + // Ref fragment: from the # to the end of the path. + int path_end = path.begin + path.len; + if (ref_separator >= 0) { + file_end = query_end = ref_separator; + *ref = MakeRange(ref_separator + 1, path_end); + } else { + file_end = query_end = path_end; + ref->reset(); + } + + // Query fragment: everything from the ? to the next boundary (either the end + // of the path or the ref fragment). 
+ if (query_separator >= 0) { + file_end = query_separator; + *query = MakeRange(query_separator + 1, query_end); + } else { + query->reset(); + } + + // File path: treat an empty file path as no file path. + if (file_end != path.begin) + *filepath = MakeRange(path.begin, file_end); + else + filepath->reset(); +} + +template<typename CHAR> +bool DoExtractScheme(const CHAR* url, + int url_len, + Component* scheme) { + // Skip leading whitespace and control characters. + int begin = 0; + while (begin < url_len && ShouldTrimFromURL(url[begin])) + begin++; + if (begin == url_len) + return false; // Input is empty or all whitespace. + + // Find the first colon character. + for (int i = begin; i < url_len; i++) { + if (url[i] == ':') { + *scheme = MakeRange(begin, i); + return true; + } + } + return false; // No colon found: no scheme +} + +// Fills in all members of the Parsed structure except for the scheme. +// +// |spec| is the full spec being parsed, of length |spec_len|. +// |after_scheme| is the character immediately following the scheme (after the +// colon) where we'll begin parsing. +// +// Compatibility data points. I list "host", "path" extracted: +// Input IE6 Firefox Us +// ----- -------------- -------------- -------------- +// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" +// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" +// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// +// (*) Interestingly, although IE fails to load these URLs, its history +// canonicalizer handles them, meaning if you've been to the corresponding +// "http://foo.com/" link, it will be colored. 
+template <typename CHAR> +void DoParseAfterScheme(const CHAR* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + int after_slashes = after_scheme + num_slashes; + + // First split into two main parts, the authority (username, password, host, + // and port) and the full path (path, query, and reference). + Component authority; + Component full_path; + + // Found "//<some data>", looks like an authority section. Treat everything + // from there to the next slash (or end of spec) to be the authority. Note + // that we ignore the number of slashes and treat it as the authority. + int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); + authority = Component(after_slashes, end_auth - after_slashes); + + if (end_auth == spec_len) // No beginning of path found. + full_path = Component(); + else // Everything starting from the slash to the end is the path. + full_path = Component(end_auth, spec_len - end_auth); + + // Now parse those two sub-parts. + DoParseAuthority(spec, authority, &parsed->username, &parsed->password, + &parsed->host, &parsed->port); + ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); +} + +// The main parsing function for standard URLs. Standard URLs have a scheme, +// host, path, etc. +template<typename CHAR> +void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { + GURL_DCHECK(spec_len >= 0); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + int after_scheme; + if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { + after_scheme = parsed->scheme.end() + 1; // Skip past the colon. + } else { + // Say there's no scheme when there is no colon. We could also say that + // everything is the scheme. Both would produce an invalid URL, but this way + // seems less wrong in more cases. 
+ parsed->scheme.reset(); + after_scheme = begin; + } + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +template<typename CHAR> +void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) { + GURL_DCHECK(spec_len >= 0); + + // Get the unused parts of the URL out of the way. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->path.reset(); // May use this; reset for convenience. + parsed->ref.reset(); // May use this; reset for convenience. + parsed->query.reset(); // May use this; reset for convenience. + parsed->clear_inner_parsed(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + return; + } + + int inner_start = -1; + + // Extract the scheme. We also handle the case where there is no scheme. + if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() == spec_len - 1) + return; + + inner_start = parsed->scheme.end() + 1; + } else { + // No scheme found; that's not valid for filesystem URLs. + parsed->scheme.reset(); + return; + } + + Component inner_scheme; + const CHAR* inner_spec = &spec[inner_start]; + int inner_spec_len = spec_len - inner_start; + + if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) { + // Offset the results since we gave ExtractScheme a substring. + inner_scheme.begin += inner_start; + + if (inner_scheme.end() == spec_len - 1) + return; + } else { + // No scheme found; that's not valid for filesystem URLs. + // The best we can do is return "filesystem://". 
+ return; + } + + Parsed inner_parsed; + + if (CompareSchemeComponent(spec, inner_scheme, kFileScheme)) { + // File URLs are special. + ParseFileURL(inner_spec, inner_spec_len, &inner_parsed); + } else if (CompareSchemeComponent(spec, inner_scheme, kFileSystemScheme)) { + // Filesystem URLs don't nest. + return; + } else if (IsStandard(spec, inner_scheme)) { + // All "normal" URLs. + DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed); + } else { + return; + } + + // All members of inner_parsed need to be offset by inner_start. + // If we had any scheme that supported nesting more than one level deep, + // we'd have to recurse into the inner_parsed's inner_parsed when + // adjusting by inner_start. + inner_parsed.scheme.begin += inner_start; + inner_parsed.username.begin += inner_start; + inner_parsed.password.begin += inner_start; + inner_parsed.host.begin += inner_start; + inner_parsed.port.begin += inner_start; + inner_parsed.query.begin += inner_start; + inner_parsed.ref.begin += inner_start; + inner_parsed.path.begin += inner_start; + + // Query and ref move from inner_parsed to parsed. + parsed->query = inner_parsed.query; + inner_parsed.query.reset(); + parsed->ref = inner_parsed.ref; + inner_parsed.ref.reset(); + + parsed->set_inner_parsed(inner_parsed); + if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() || + inner_parsed.inner_parsed()) { + return; + } + + // The path in inner_parsed should start with a slash, then have a filesystem + // type followed by a slash. From the first slash up to but excluding the + // second should be what it keeps; the rest goes to parsed. If the path ends + // before the second slash, it's still pretty clear what the user meant, so + // we'll let that through. 
+ if (!IsURLSlash(spec[inner_parsed.path.begin])) { + return; + } + int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash + while (inner_path_end < spec_len && + !IsURLSlash(spec[inner_path_end])) + ++inner_path_end; + parsed->path.begin = inner_path_end; + int new_inner_path_length = inner_path_end - inner_parsed.path.begin; + parsed->path.len = inner_parsed.path.len - new_inner_path_length; + parsed->inner_parsed()->path.len = new_inner_path_length; +} + +// Initializes a path URL which is merely a scheme followed by a path. Examples +// include "about:foo" and "javascript:alert('bar');" +template<typename CHAR> +void DoParsePathURL(const CHAR* spec, int spec_len, + bool trim_path_end, + Parsed* parsed) { + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->path.reset(); + parsed->query.reset(); + parsed->ref.reset(); + + // Strip leading & trailing spaces and control characters. + int scheme_begin = 0; + TrimURL(spec, &scheme_begin, &spec_len, trim_path_end); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (scheme_begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + int path_begin; + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin, + &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += scheme_begin; + path_begin = parsed->scheme.end() + 1; + } else { + // No scheme case. 
+ parsed->scheme.reset(); + path_begin = scheme_begin; + } + + if (path_begin == spec_len) + return; + GURL_DCHECK_LT(path_begin, spec_len); + + ParsePath(spec, + MakeRange(path_begin, spec_len), + &parsed->path, + &parsed->query, + &parsed->ref); +} + +template<typename CHAR> +void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { + GURL_DCHECK(spec_len >= 0); + + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->ref.reset(); + parsed->query.reset(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + int path_begin = -1; + int path_end = -1; + + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() != spec_len - 1) { + path_begin = parsed->scheme.end() + 1; + path_end = spec_len; + } + } else { + // No scheme found, just path. + parsed->scheme.reset(); + path_begin = begin; + path_end = spec_len; + } + + // Split [path_begin, path_end) into a path + query. 
+ for (int i = path_begin; i < path_end; ++i) { + if (spec[i] == '?') { + parsed->query = MakeRange(i + 1, path_end); + path_end = i; + break; + } + } + + // For compatibility with the standard URL parser, treat no path as + // -1, rather than having a length of 0 + if (path_begin == path_end) { + parsed->path.reset(); + } else { + parsed->path = MakeRange(path_begin, path_end); + } +} + +// Converts a port number in a string to an integer. We'd like to just call +// sscanf but our input is not NULL-terminated, which sscanf requires. Instead, +// we copy the digits to a small stack buffer (since we know the maximum number +// of digits in a valid port number) that we can NULL terminate. +template<typename CHAR> +int DoParsePort(const CHAR* spec, const Component& component) { + // Easy success case when there is no port. + const int kMaxDigits = 5; + if (!component.is_nonempty()) + return PORT_UNSPECIFIED; + + // Skip over any leading 0s. + Component digits_comp(component.end(), 0); + for (int i = 0; i < component.len; i++) { + if (spec[component.begin + i] != '0') { + digits_comp = MakeRange(component.begin + i, component.end()); + break; + } + } + if (digits_comp.len == 0) + return 0; // All digits were 0. + + // Verify we don't have too many digits (we'll be copying to our buffer so + // we need to double-check). + if (digits_comp.len > kMaxDigits) + return PORT_INVALID; + + // Copy valid digits to the buffer. + char digits[kMaxDigits + 1]; // +1 for null terminator + for (int i = 0; i < digits_comp.len; i++) { + CHAR ch = spec[digits_comp.begin + i]; + if (!IsPortDigit(ch)) { + // Invalid port digit, fail. + return PORT_INVALID; + } + digits[i] = static_cast<char>(ch); + } + + // Null-terminate the string and convert to integer. Since we guarantee + // only digits, atoi's lack of error handling is OK. + digits[digits_comp.len] = 0; + int port = atoi(digits); + if (port > 65535) + return PORT_INVALID; // Out of range. 
+ return port; +} + +template<typename CHAR> +void DoExtractFileName(const CHAR* spec, + const Component& path, + Component* file_name) { + // Handle empty paths: they have no file names. + if (!path.is_nonempty()) { + file_name->reset(); + return; + } + + // Extract the filename range from the path which is between + // the last slash and the following semicolon. + int file_end = path.end(); + for (int i = path.end() - 1; i >= path.begin; i--) { + if (spec[i] == ';') { + file_end = i; + } else if (IsURLSlash(spec[i])) { + // File name is everything following this character to the end + *file_name = MakeRange(i + 1, file_end); + return; + } + } + + // No slash found, this means the input was degenerate (generally paths + // will start with a slash). Let's call everything the file name. + *file_name = MakeRange(path.begin, file_end); + return; +} + +template<typename CHAR> +bool DoExtractQueryKeyValue(const CHAR* spec, + Component* query, + Component* key, + Component* value) { + if (!query->is_nonempty()) + return false; + + int start = query->begin; + int cur = start; + int end = query->end(); + + // We assume the beginning of the input is the beginning of the "key" and we + // skip to the end of it. + key->begin = cur; + while (cur < end && spec[cur] != '&' && spec[cur] != '=') + cur++; + key->len = cur - key->begin; + + // Skip the separator after the key (if any). + if (cur < end && spec[cur] == '=') + cur++; + + // Find the value part. 
+ value->begin = cur; + while (cur < end && spec[cur] != '&') + cur++; + value->len = cur - value->begin; + + // Finally skip the next separator if any + if (cur < end && spec[cur] == '&') + cur++; + + // Save the new query + *query = MakeRange(cur, end); + return true; +} + +} // namespace + +Parsed::Parsed() : potentially_dangling_markup(false), inner_parsed_(NULL) {} + +Parsed::Parsed(const Parsed& other) + : scheme(other.scheme), + username(other.username), + password(other.password), + host(other.host), + port(other.port), + path(other.path), + query(other.query), + ref(other.ref), + potentially_dangling_markup(other.potentially_dangling_markup), + inner_parsed_(NULL) { + if (other.inner_parsed_) + set_inner_parsed(*other.inner_parsed_); +} + +Parsed& Parsed::operator=(const Parsed& other) { + if (this != &other) { + scheme = other.scheme; + username = other.username; + password = other.password; + host = other.host; + port = other.port; + path = other.path; + query = other.query; + ref = other.ref; + potentially_dangling_markup = other.potentially_dangling_markup; + if (other.inner_parsed_) + set_inner_parsed(*other.inner_parsed_); + else + clear_inner_parsed(); + } + return *this; +} + +Parsed::~Parsed() { + delete inner_parsed_; +} + +int Parsed::Length() const { + if (ref.is_valid()) + return ref.end(); + return CountCharactersBefore(REF, false); +} + +int Parsed::CountCharactersBefore(ComponentType type, + bool include_delimiter) const { + if (type == SCHEME) + return scheme.begin; + + // There will be some characters after the scheme like "://" and we don't + // know how many. Search forwards for the next thing until we find one. + int cur = 0; + if (scheme.is_valid()) + cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. + + if (username.is_valid()) { + if (type <= USERNAME) + return username.begin; + cur = username.end() + 1; // Advance over the '@' or ':' at the end. 
+ } + + if (password.is_valid()) { + if (type <= PASSWORD) + return password.begin; + cur = password.end() + 1; // Advance over the '@' at the end. + } + + if (host.is_valid()) { + if (type <= HOST) + return host.begin; + cur = host.end(); + } + + if (port.is_valid()) { + if (type < PORT || (type == PORT && include_delimiter)) + return port.begin - 1; // Back over delimiter. + if (type == PORT) + return port.begin; // Don't want delimiter counted. + cur = port.end(); + } + + if (path.is_valid()) { + if (type <= PATH) + return path.begin; + cur = path.end(); + } + + if (query.is_valid()) { + if (type < QUERY || (type == QUERY && include_delimiter)) + return query.begin - 1; // Back over delimiter. + if (type == QUERY) + return query.begin; // Don't want delimiter counted. + cur = query.end(); + } + + if (ref.is_valid()) { + if (type == REF && !include_delimiter) + return ref.begin; // Don't want delimiter counted. + + // When there is a ref and we get here, the component we wanted was before + // this and not found, so we always know the beginning of the ref is right. + return ref.begin - 1; // Back over delimiter. + } + + return cur; +} + +Component Parsed::GetContent() const { + const int begin = CountCharactersBefore(USERNAME, false); + const int len = Length() - begin; + // For compatibility with the standard URL parser, we treat no content as + // -1, rather than having a length of 0 (we normally wouldn't care so + // much for these non-standard URLs). + return len ? Component(begin, len) : Component(); +} + +bool ExtractScheme(const char* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +bool ExtractScheme(const gurl_base::char16* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +// This handles everything that may be an authority terminator, including +// backslash. For special backslash handling see DoParseAfterScheme. 
+bool IsAuthorityTerminator(gurl_base::char16 ch) { + return IsURLSlash(ch) || ch == '?' || ch == '#'; +} + +void ExtractFileName(const char* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +void ExtractFileName(const gurl_base::char16* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +bool ExtractQueryKeyValue(const gurl_base::char16* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +void ParseAuthority(const gurl_base::char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +int ParsePort(const char* url, const Component& port) { + return DoParsePort(url, port); +} + +int ParsePort(const gurl_base::char16* url, const Component& port) { + return DoParsePort(url, port); +} + +void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParseStandardURL(const gurl_base::char16* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParsePathURL(const char* url, + int url_len, + bool trim_path_end, + Parsed* parsed) { + DoParsePathURL(url, url_len, trim_path_end, parsed); +} + +void ParsePathURL(const gurl_base::char16* url, + int url_len, + bool trim_path_end, + Parsed* parsed) { + DoParsePathURL(url, url_len, trim_path_end, parsed); +} + 
+void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { + DoParseFileSystemURL(url, url_len, parsed); +} + +void ParseFileSystemURL(const gurl_base::char16* url, int url_len, Parsed* parsed) { + DoParseFileSystemURL(url, url_len, parsed); +} + +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParseMailtoURL(const gurl_base::char16* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParsePathInternal(const gurl_base::char16* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +void ParseAfterScheme(const gurl_base::char16* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +} // namespace url
diff --git a/url/third_party/mozilla/url_parse.h b/url/third_party/mozilla/url_parse.h new file mode 100644 index 0000000..54b2af2 --- /dev/null +++ b/url/third_party/mozilla/url_parse.h
@@ -0,0 +1,375 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ +#define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ + +#include "polyfills/base/component_export.h" +#include "base/strings/string16.h" + +namespace url { + +// Component ------------------------------------------------------------------ + +// Represents a substring for URL parsing. +struct Component { + Component() : begin(0), len(-1) {} + + // Normal constructor: takes an offset and a length. + Component(int b, int l) : begin(b), len(l) {} + + int end() const { + return begin + len; + } + + // Returns true if this component is valid, meaning the length is given. Even + // valid components may be empty to record the fact that they exist. + bool is_valid() const { + return (len != -1); + } + + // Returns true if the given component is specified and nonempty; on false, + // the component is either empty or invalid. + bool is_nonempty() const { + return (len > 0); + } + + void reset() { + begin = 0; + len = -1; + } + + bool operator==(const Component& other) const { + return begin == other.begin && len == other.len; + } + + int begin; // Byte offset in the string of this component. + int len; // Will be -1 if the component is unspecified. +}; + +// Helper that returns a component created with the given begin and ending +// points. The ending point is non-inclusive. +inline Component MakeRange(int begin, int end) { + return Component(begin, end - begin); +} + +// Parsed --------------------------------------------------------------------- + +// A structure that holds the identified parts of an input URL. This structure +// does NOT store the URL itself. The caller will have to store the URL text +// and its corresponding Parsed structure separately.
+// +// Typical usage would be: +// +// Parsed parsed; +// Component scheme; +// if (!ExtractScheme(url, url_len, &scheme)) +// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; +// +// if (IsStandardScheme(url, scheme)) // Not provided by this component +// ParseStandardURL(url, url_len, &parsed); +// else if (IsFileURL(url, scheme)) // Not provided by this component +// ParseFileURL(url, url_len, &parsed); +// else +// ParsePathURL(url, url_len, trim_path_end, &parsed); +// +struct COMPONENT_EXPORT(URL) Parsed { + // Identifies different components. + enum ComponentType { + SCHEME, + USERNAME, + PASSWORD, + HOST, + PORT, + PATH, + QUERY, + REF, + }; + + // The default constructor is sufficient for the components, but inner_parsed_ + // requires special handling. + Parsed(); + Parsed(const Parsed&); + Parsed& operator=(const Parsed&); + ~Parsed(); + + // Returns the length of the URL (the end of the last component). + // + // Note that for some invalid, non-canonical URLs, this may not be the length + // of the string. For example "http://": the parsed structure will only + // contain an entry for the four-character scheme, and it doesn't know about + // the "://". For all other last-components, it will return the real length. + int Length() const; + + // Returns the number of characters before the given component if it exists, + // or where the component would be if it did exist. This will return the + // string length if the component would be appended to the end. + // + // Note that this can get a little funny for the port, query, and ref + // components which have a delimiter that is not counted as part of the + // component. The |include_delimiter| flag controls if you want this counted + // as part of the component or not when the component exists. + // + // This example shows the difference between the two flags for two of these + // delimited components that are present (the port and query) and one that + // isn't (the reference).
The components that this flag affects are marked + // with a *. + // 0 1 2 + // 012345678901234567890 + // Example input: http://foo:80/?query + // include_delim=true, ...=false ("<-" indicates different) + // SCHEME: 0 0 + // USERNAME: 5 5 + // PASSWORD: 5 5 + // HOST: 7 7 + // *PORT: 10 11 <- + // PATH: 13 13 + // *QUERY: 14 15 <- + // *REF: 20 20 + // + int CountCharactersBefore(ComponentType type, bool include_delimiter) const; + + // Scheme without the colon: "http://foo/" would have a scheme of "http". + // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there + // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed + // to start at the beginning of the string if there are preceding whitespace + // or control characters. + Component scheme; + + // Username. Specified in URLs with an @ sign before the host. See |password| + Component username; + + // Password. The length will be -1 if unspecified, 0 if specified but empty. + // Not all URLs with a username have a password, as in "http://me@host/". + // The password is separated from the username with a colon, as in + // "http://me:secret@host/" + Component password; + + // Host name. + Component host; + + // Port number. + Component port; + + // Path, this is everything following the host name, stopping at the query or + // ref delimiter (if any). Length will be -1 if unspecified. This includes + // the preceding slash, so the path on "http://www.google.com/asdf" is + // "/asdf". As a result, it is impossible to have a 0 length path, it will + // be -1 in cases like "http://host?foo". + // Note that we treat backslashes the same as slashes. + Component path; + + // Stuff between the ? and the # after the path. This does not include the + // preceding ? character. Length will be -1 if unspecified, 0 if there is + // a question mark but no query string. + Component query; + + // Indicated by a #, this is everything following the hash sign (not + // including it).
If there are multiple hash signs, we'll use the last one. + // Length will be -1 if there is no hash sign, or 0 if there is one but + // nothing follows it. + Component ref; + + // The URL spec from the character after the scheme: until the end of the + // URL, regardless of the scheme. This is mostly useful for 'opaque' non- + // hierarchical schemes like data: and javascript: as a convenient way to get + // the string with the scheme stripped off. + Component GetContent() const; + + // True if the URL's source contained a raw `<` character, and whitespace was + // removed from the URL during parsing. + // + // TODO(mkwst): Link this to something in a spec if + // https://github.com/whatwg/url/pull/284 lands. + bool potentially_dangling_markup; + + // This is used for nested URL types, currently only filesystem. If you + // parse a filesystem URL, the resulting Parsed will have a nested + // inner_parsed_ to hold the parsed inner URL's component information. + // For all other url types [including the inner URL], it will be NULL. + Parsed* inner_parsed() const { + return inner_parsed_; + } + + void set_inner_parsed(const Parsed& inner_parsed) { + if (!inner_parsed_) + inner_parsed_ = new Parsed(inner_parsed); + else + *inner_parsed_ = inner_parsed; + } + + void clear_inner_parsed() { + if (inner_parsed_) { + delete inner_parsed_; + inner_parsed_ = NULL; + } + } + + private: + Parsed* inner_parsed_; // This object is owned and managed by this struct. +}; + +// Initialization functions --------------------------------------------------- +// +// These functions parse the given URL, filling in all of the structure's +// components. These functions can not fail, they will always do their best +// at interpreting the input given. +// +// The string length of the URL MUST be specified, we do not check for NULLs +// at any point in the process, and will actually handle embedded NULLs.
+// +// IMPORTANT: These functions do NOT hang on to the given pointer or copy it +// in any way. See the comment above the struct. +// +// The 8-bit versions require UTF-8 encoding. + +// StandardURL is for when the scheme is known to be one that has an +// authority (host) like "http". This function will not handle weird ones +// like "about:" and "javascript:", or do the right thing for "file:" URLs. +COMPONENT_EXPORT(URL) +void ParseStandardURL(const char* url, int url_len, Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParseStandardURL(const gurl_base::char16* url, int url_len, Parsed* parsed); + +// PathURL is for when the scheme is known not to have an authority (host) +// section but that aren't file URLs either. The scheme is parsed, and +// everything after the scheme is considered as the path. This is used for +// things like "about:" and "javascript:" +COMPONENT_EXPORT(URL) +void ParsePathURL(const char* url, + int url_len, + bool trim_path_end, + Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParsePathURL(const gurl_base::char16* url, + int url_len, + bool trim_path_end, + Parsed* parsed); + +// FileURL is for file URLs. There are some special rules for interpreting +// these. +COMPONENT_EXPORT(URL) +void ParseFileURL(const char* url, int url_len, Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParseFileURL(const gurl_base::char16* url, int url_len, Parsed* parsed); + +// Filesystem URLs are structured differently than other URLs. +COMPONENT_EXPORT(URL) +void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParseFileSystemURL(const gurl_base::char16* url, int url_len, Parsed* parsed); + +// MailtoURL is for mailto: urls. 
They are made up of scheme, path, and query. +COMPONENT_EXPORT(URL) +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParseMailtoURL(const gurl_base::char16* url, int url_len, Parsed* parsed); + +// Helper functions ----------------------------------------------------------- + +// Locates the scheme according to the URL parser's rules. This function is +// designed so the caller can find the scheme and call the correct Init* +// function according to their known scheme types. +// +// It also does not perform any validation on the scheme. +// +// This function will return true if the scheme is found and will put the +// scheme's range into *scheme. False means no scheme could be found. Note +// that a URL beginning with a colon has a scheme, but it is empty, so this +// function will return true but *scheme will = (0,0). +// +// The scheme is found by skipping spaces and control characters at the +// beginning, and taking everything from there to the first colon to be the +// scheme. The character at scheme.end() will be the colon (we may enhance +// this to handle full width colons or something, so don't count on the +// actual character value). The character at scheme.end()+1 will be the +// beginning of the rest of the URL, be it the authority or the path (or the +// end of the string). +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +bool ExtractScheme(const char* url, int url_len, Component* scheme); +COMPONENT_EXPORT(URL) +bool ExtractScheme(const gurl_base::char16* url, int url_len, Component* scheme); + +// Returns true if ch is a character that terminates the authority segment +// of a URL. +COMPONENT_EXPORT(URL) bool IsAuthorityTerminator(gurl_base::char16 ch); + +// Does a best effort parse of input |spec|, in range |auth|. If a particular +// component is not found, it will be set to invalid.
+COMPONENT_EXPORT(URL) +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); +COMPONENT_EXPORT(URL) +void ParseAuthority(const gurl_base::char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); + +// Computes the integer port value from the given port component. The port +// component should have been identified by one of the init functions on +// |Parsed| for the given input url. +// +// The return value will be a positive integer between 0 and 64K, or one of +// the two special values below. +enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; +COMPONENT_EXPORT(URL) int ParsePort(const char* url, const Component& port); +COMPONENT_EXPORT(URL) +int ParsePort(const gurl_base::char16* url, const Component& port); + +// Extracts the range of the file name in the given url. The path must +// already have been computed by the parse function, and the matching URL +// and extracted path are provided to this function. The filename is +// defined as being everything from the last slash/backslash of the path +// to the end of the path. +// +// The file name will be empty if the path is empty or there is nothing +// following the last slash. +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +void ExtractFileName(const char* url, + const Component& path, + Component* file_name); +COMPONENT_EXPORT(URL) +void ExtractFileName(const gurl_base::char16* url, + const Component& path, + Component* file_name); + +// Extract the first key/value from the range defined by |*query|. Updates +// |*query| to start at the end of the extracted key/value pair. This is +// designed for use in a loop: you can keep calling it with the same query +// object and it will iterate over all items in the query. 
+// +// Some key/value pairs may have the key, the value, or both be empty (for +// example, the query string "?&"). These will be returned. Note that an empty +// last parameter "foo.com?" or "foo.com?a&" will not be returned; this case +// is the same as "done." +// +// The initial query component should not include the '?' (this is the default +// for parsed URLs). +// +// If no key/value are found |*key| and |*value| will be unchanged and it will +// return false. +COMPONENT_EXPORT(URL) +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value); +COMPONENT_EXPORT(URL) +bool ExtractQueryKeyValue(const gurl_base::char16* url, + Component* query, + Component* key, + Component* value); + +} // namespace url + +#endif // URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
diff --git a/url/url_canon.cc b/url/url_canon.cc new file mode 100644 index 0000000..1860234 --- /dev/null +++ b/url/url_canon.cc
@@ -0,0 +1,15 @@ +// Copyright 2017 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_canon.h" + +#include "polyfills/base/component_export.h" + +namespace url { + +template class EXPORT_TEMPLATE_DEFINE(COMPONENT_EXPORT(URL)) CanonOutputT<char>; +template class EXPORT_TEMPLATE_DEFINE(COMPONENT_EXPORT(URL)) + CanonOutputT<gurl_base::char16>; + +} // namespace url
diff --git a/url/url_canon.h b/url/url_canon.h new file mode 100644 index 0000000..7e4a0ee --- /dev/null +++ b/url/url_canon.h
@@ -0,0 +1,1013 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_CANON_H_ +#define URL_URL_CANON_H_ + +#include <stdlib.h> +#include <string.h> + +#include "polyfills/base/component_export.h" +#include "polyfills/base/export_template.h" +#include "base/strings/string16.h" +#include "url/third_party/mozilla/url_parse.h" + +namespace url { + +// Canonicalizer output ------------------------------------------------------- + +// Base class for the canonicalizer output, this maintains a buffer and +// supports simple resizing and append operations on it. +// +// It is VERY IMPORTANT that no virtual function calls be made on the common +// code path. We only have two virtual function calls, the destructor and a +// resize function that is called when the existing buffer is not big enough. +// The derived class is then in charge of setting up our buffer which we will +// manage. +template<typename T> +class CanonOutputT { + public: + CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) { + } + virtual ~CanonOutputT() { + } + + // Implemented to resize the buffer. This function should update the buffer + // pointer to point to the new buffer, and any old data up to |cur_len_| in + // the buffer must be copied over. + // + // The new size |sz| must be larger than buffer_len_. + virtual void Resize(int sz) = 0; + + // Accessor for returning a character at a given position. The input offset + // must be in the valid range. + inline T at(int offset) const { + return buffer_[offset]; + } + + // Sets the character at the given position. The given position MUST be less + // than the length(). + inline void set(int offset, T ch) { + buffer_[offset] = ch; + } + + // Returns the number of characters currently in the buffer. + inline int length() const { + return cur_len_; + } + + // Returns the current capacity of the buffer. 
The length() is the number of + // characters that have been declared to be written, but the capacity() is + // the number that can be written without reallocation. If the caller must + // write many characters at once, it can make sure there is enough capacity, + // write the data, then use set_length() to declare the new length(). + int capacity() const { + return buffer_len_; + } + + // Called by the user of this class to get the output. The output will NOT + // be NULL-terminated. Call length() to get the + // length. + const T* data() const { + return buffer_; + } + T* data() { + return buffer_; + } + + // Shortens the URL to the new length. Used for "backing up" when processing + // relative paths. This can also be used if an external function writes a lot + // of data to the buffer (when using the "Raw" version below) beyond the end, + // to declare the new length. + // + // This MUST NOT be used to expand the size of the buffer beyond capacity(). + void set_length(int new_len) { + cur_len_ = new_len; + } + + // This is the most performance critical function, since it is called for + // every character. + void push_back(T ch) { + // In VC2005, putting this common case first speeds up execution + // dramatically because this branch is predicted as taken. + if (cur_len_ < buffer_len_) { + buffer_[cur_len_] = ch; + cur_len_++; + return; + } + + // Grow the buffer to hold at least one more item. Hopefully we won't have + // to do this very often. + if (!Grow(1)) + return; + + // Actually do the insertion. + buffer_[cur_len_] = ch; + cur_len_++; + } + + // Appends the given string to the output. + void Append(const T* str, int str_len) { + if (cur_len_ + str_len > buffer_len_) { + if (!Grow(cur_len_ + str_len - buffer_len_)) + return; + } + for (int i = 0; i < str_len; i++) + buffer_[cur_len_ + i] = str[i]; + cur_len_ += str_len; + } + + void ReserveSizeIfNeeded(int estimated_size) { + // Reserve a bit extra to account for escaped chars.
+ if (estimated_size > buffer_len_) + Resize(estimated_size + 8); + } + + protected: + // Grows the given buffer so that it can fit at least |min_additional| + // characters. Returns true if the buffer could be resized, false on OOM. + bool Grow(int min_additional) { + static const int kMinBufferLen = 16; + int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_; + do { + if (new_len >= (1 << 30)) // Prevent overflow below. + return false; + new_len *= 2; + } while (new_len < buffer_len_ + min_additional); + Resize(new_len); + return true; + } + + T* buffer_; + int buffer_len_; + + // Used characters in the buffer. + int cur_len_; +}; + +// Simple implementation of the CanonOutput using new[]. This class +// also supports a static buffer so if it is allocated on the stack, most +// URLs can be canonicalized with no heap allocations. +template<typename T, int fixed_capacity = 1024> +class RawCanonOutputT : public CanonOutputT<T> { + public: + RawCanonOutputT() : CanonOutputT<T>() { + this->buffer_ = fixed_buffer_; + this->buffer_len_ = fixed_capacity; + } + ~RawCanonOutputT() override { + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + } + + void Resize(int sz) override { + T* new_buf = new T[sz]; + memcpy(new_buf, this->buffer_, + sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz)); + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + this->buffer_ = new_buf; + this->buffer_len_ = sz; + } + + protected: + T fixed_buffer_[fixed_capacity]; +}; + +// Explicitly instantiate commonly used instantiations. +extern template class EXPORT_TEMPLATE_DECLARE(COMPONENT_EXPORT(URL)) + CanonOutputT<char>; +extern template class EXPORT_TEMPLATE_DECLARE(COMPONENT_EXPORT(URL)) + CanonOutputT<gurl_base::char16>; + +// Normally, all canonicalization output is in narrow characters. We support +// the templates so it can also be used internally if a wide buffer is +// required.
+typedef CanonOutputT<char> CanonOutput; +typedef CanonOutputT<gurl_base::char16> CanonOutputW; + +template<int fixed_capacity> +class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {}; +template<int fixed_capacity> +class RawCanonOutputW : public RawCanonOutputT<gurl_base::char16, fixed_capacity> {}; + +// Character set converter ---------------------------------------------------- +// +// Converts query strings into a custom encoding. The embedder can supply an +// implementation of this class to interface with their own character set +// conversion libraries. +// +// Embedders will want to see the unit test for the ICU version. + +class COMPONENT_EXPORT(URL) CharsetConverter { + public: + CharsetConverter() {} + virtual ~CharsetConverter() {} + + // Converts the given input string from UTF-16 to whatever output format the + // converter supports. This is used only for the query encoding conversion, + // which does not fail. Instead, the converter should insert "invalid + // character" characters in the output for invalid sequences, and do the + // best it can. + // + // If the input contains a character not representable in the output + // character set, the converter should append the HTML entity sequence in + // decimal, (such as "&#20320;") with escaping of the ampersand, number + // sign, and semicolon (in the previous example it would be + // "%26%2320320%3B"). This rule is based on what IE does in this situation. + virtual void ConvertFromUTF16(const gurl_base::char16* input, + int input_len, + CanonOutput* output) = 0; +}; + +// Schemes -------------------------------------------------------------------- + +// Types of a scheme representing the requirements on the data represented by +// the authority component of a URL with the scheme. +enum SchemeType { + // The authority component of a URL with the scheme has the form + // "username:password@host:port". The username and password entries are + // optional; the host may not be empty.
The default value of the port can be + // omitted in serialization. This type occurs with network schemes like http, + // https, and ftp. + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, + // The authority component of a URL with the scheme has the form "host:port", + // and does not include username or password. The default value of the port + // can be omitted in serialization. Used by inner URLs of filesystem URLs of + // origins with network hosts, from which the username and password are + // stripped. + SCHEME_WITH_HOST_AND_PORT, + // The authority component of an URL with the scheme has the form "host", and + // does not include port, username, or password. Used when the hosts are not + // network addresses; for example, schemes used internally by the browser. + SCHEME_WITH_HOST, + // A URL with the scheme doesn't have the authority component. + SCHEME_WITHOUT_AUTHORITY, +}; + +// Whitespace ----------------------------------------------------------------- + +// Searches for whitespace that should be removed from the middle of URLs, and +// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces +// are preserved, which is what most browsers do. A pointer to the output will +// be returned, and the length of that output will be in |output_len|. +// +// This should be called before parsing if whitespace removal is desired (which +// it normally is when you are canonicalizing). +// +// If no whitespace is removed, this function will not use the buffer and will +// return a pointer to the input, to avoid the extra copy. If modification is +// required, the given |buffer| will be used and the returned pointer will +// point to the beginning of the buffer. +// +// Therefore, callers should not use the buffer, since it may actually be empty, +// use the computed pointer and |*output_len| instead. +// +// If |input| contained both removable whitespace and a raw `<` character, +// |potentially_dangling_markup| will be set to `true`. 
Otherwise, it will be +// left untouched. +COMPONENT_EXPORT(URL) +const char* RemoveURLWhitespace(const char* input, + int input_len, + CanonOutputT<char>* buffer, + int* output_len, + bool* potentially_dangling_markup); +COMPONENT_EXPORT(URL) +const gurl_base::char16* RemoveURLWhitespace(const gurl_base::char16* input, + int input_len, + CanonOutputT<gurl_base::char16>* buffer, + int* output_len, + bool* potentially_dangling_markup); + +// IDN ------------------------------------------------------------------------ + +// Converts the Unicode input representing a hostname to ASCII using IDN rules. +// The output must fall in the ASCII range, but will be encoded in UTF-16. +// +// On success, the output will be filled with the ASCII host name and it will +// return true. Unlike most other canonicalization functions, this assumes that +// the output is empty. The beginning of the host will be at offset 0, and +// the length of the output will be set to the length of the new host name. +// +// On error, returns false. The output in this case is undefined. +COMPONENT_EXPORT(URL) +bool IDNToASCII(const gurl_base::char16* src, int src_len, CanonOutputW* output); + +// Piece-by-piece canonicalizers ---------------------------------------------- +// +// These individual canonicalizers append the canonicalized versions of the +// corresponding URL component to the given CanonOutput. The spec and the +// previously-identified range of that component are the input. The range of +// the canonicalized component will be written to the output component. +// +// These functions all append to the output so they can be chained. Make sure +// the output is empty when you start. +// +// These functions return boolean values indicating success. On failure, they +// will attempt to write something reasonable to the output so that, if +// displayed to the user, they will recognise it as something that's messed up. +// Nothing more should ever be done with these invalid URLs, however.
+ +// Scheme: Appends the scheme and colon to the URL. The output component will +// indicate the range of characters up to but not including the colon. +// +// Canonical URLs always have a scheme. If the scheme is not present in the +// input, this will just write the colon to indicate an empty scheme. Does not +// append slashes which will be needed before any authority components for most +// URLs. +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +bool CanonicalizeScheme(const char* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme); +COMPONENT_EXPORT(URL) +bool CanonicalizeScheme(const gurl_base::char16* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme); + +// User info: username/password. If present, this will add the delimiters so +// the output will be "<username>:<password>@" or "<username>@". Empty +// username/password pairs, or empty passwords, will get converted to +// nonexistent in the canonical version. +// +// The components for the username and password refer to ranges in the +// respective source strings. Usually, these will be the same string, which +// is legal as long as the two components don't overlap. +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +bool CanonicalizeUserInfo(const char* username_source, + const Component& username, + const char* password_source, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password); +COMPONENT_EXPORT(URL) +bool CanonicalizeUserInfo(const gurl_base::char16* username_source, + const Component& username, + const gurl_base::char16* password_source, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password); + +// This structure holds detailed state exported from the IP/Host canonicalizers. +// Additional fields may be added as callers require them. 
+struct CanonHostInfo { + CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} + + // Convenience function to test if family is an IP address. + bool IsIPAddress() const { return family == IPV4 || family == IPV6; } + + // This field summarizes how the input was classified by the canonicalizer. + enum Family { + NEUTRAL, // - Doesn't resemble an IP address. As far as the IP + // canonicalizer is concerned, it should be treated as a + // hostname. + BROKEN, // - Almost an IP, but was not canonicalized. This could be an + // IPv4 address where truncation occurred, or something + // containing the special characters :[] which did not parse + // as an IPv6 address. Never attempt to connect to this + // address, because it might actually succeed! + IPV4, // - Successfully canonicalized as an IPv4 address. + IPV6, // - Successfully canonicalized as an IPv6 address. + }; + Family family; + + // If |family| is IPV4, then this is the number of nonempty dot-separated + // components in the input text, from 1 to 4. If |family| is not IPV4, + // this value is undefined. + int num_ipv4_components; + + // Location of host within the canonicalized output. + // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6. + // CanonicalizeHostVerbose() always sets it. + Component out_host; + + // |address| contains the parsed IP Address (if any) in its first + // AddressLength() bytes, in network order. If IsIPAddress() is false + // AddressLength() will return zero and the content of |address| is undefined. + unsigned char address[16]; + + // Convenience function to calculate the length of an IP address corresponding + // to the current IP version in |family|, if any. For use with |address|. + int AddressLength() const { + return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); + } +}; + + +// Host. +// +// The 8-bit version requires UTF-8 encoding. Use this version when you only +// need to know whether canonicalization succeeded. 
+COMPONENT_EXPORT(URL) +bool CanonicalizeHost(const char* spec, + const Component& host, + CanonOutput* output, + Component* out_host); +COMPONENT_EXPORT(URL) +bool CanonicalizeHost(const gurl_base::char16* spec, + const Component& host, + CanonOutput* output, + Component* out_host); + +// Extended version of CanonicalizeHost, which returns additional information. +// Use this when you need to know whether the hostname was an IP address. +// A successful return is indicated by host_info->family != BROKEN. See the +// definition of CanonHostInfo above for details. +COMPONENT_EXPORT(URL) +void CanonicalizeHostVerbose(const char* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +COMPONENT_EXPORT(URL) +void CanonicalizeHostVerbose(const gurl_base::char16* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + +// Canonicalizes a string according to the host canonicalization rules. Unlike +// CanonicalizeHost, this will not check for IP addresses which can change the +// meaning (and canonicalization) of the components. This means it is possible +// to call this for sub-components of a host name without corruption. +// +// As an example, "01.02.03.04.com" is a canonical hostname. If you called +// CanonicalizeHost on the substring "01.02.03.04" it will get "fixed" to +// "1.2.3.4" which will produce an invalid host name when reassembled. This +// can happen more than one might think because all numbers by themselves are +// considered IP addresses; so "5" canonicalizes to "0.0.0.5". +// +// Be careful: Because Punycode works on each dot-separated substring as a +// unit, you should only pass this function substrings that represent complete +// dot-separated subcomponents of the original host. Even if you have ASCII +// input, percent-escaped characters will have different meanings if split in +// the middle. +// +// Returns true if the host was valid. 
This function will treat a 0-length +// host as valid (because it's designed to be used for substrings) while the +// full version above will mark empty hosts as broken. +COMPONENT_EXPORT(URL) +bool CanonicalizeHostSubstring(const char* spec, + const Component& host, + CanonOutput* output); +COMPONENT_EXPORT(URL) +bool CanonicalizeHostSubstring(const gurl_base::char16* spec, + const Component& host, + CanonOutput* output); + +// IP addresses. +// +// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is +// an IP address, it will canonicalize it as such, appending it to |output|. +// Additional status information is returned via the |*host_info| parameter. +// See the definition of CanonHostInfo above for details. +// +// This is called AUTOMATICALLY from the host canonicalizer, which ensures that +// the input is unescaped and name-prepped, etc. It should not normally be +// necessary or wise to call this directly. +COMPONENT_EXPORT(URL) +void CanonicalizeIPAddress(const char* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +COMPONENT_EXPORT(URL) +void CanonicalizeIPAddress(const gurl_base::char16* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + +// Port: this function will add the colon for the port if a port is present. +// The caller can pass PORT_UNSPECIFIED as the +// default_port_for_scheme argument if there is no default port. +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +bool CanonicalizePort(const char* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port); +COMPONENT_EXPORT(URL) +bool CanonicalizePort(const gurl_base::char16* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port); + +// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED +// if the scheme is unknown. 
+COMPONENT_EXPORT(URL) +int DefaultPortForScheme(const char* scheme, int scheme_len); + +// Path. If the input does not begin in a slash (including if the input is +// empty), we'll prepend a slash to the path to make it canonical. +// +// The 8-bit version assumes UTF-8 encoding, but does not verify the validity +// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid +// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't +// an issue. Somebody giving us an 8-bit path is responsible for generating +// the path that the server expects (we'll escape high-bit characters), so +// if something is invalid, it's their problem. +COMPONENT_EXPORT(URL) +bool CanonicalizePath(const char* spec, + const Component& path, + CanonOutput* output, + Component* out_path); +COMPONENT_EXPORT(URL) +bool CanonicalizePath(const gurl_base::char16* spec, + const Component& path, + CanonOutput* output, + Component* out_path); + +// Canonicalizes the input as a file path. This is like CanonicalizePath except +// that it also handles Windows drive specs. For example, the path can begin +// with "c|\" and it will get properly canonicalized to "C:/". +// The string will be appended to |*output| and |*out_path| will be updated. +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +bool FileCanonicalizePath(const char* spec, + const Component& path, + CanonOutput* output, + Component* out_path); +COMPONENT_EXPORT(URL) +bool FileCanonicalizePath(const gurl_base::char16* spec, + const Component& path, + CanonOutput* output, + Component* out_path); + +// Query: Prepends the ? if needed. +// +// The 8-bit version requires the input to be UTF-8 encoding. Incorrectly +// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode +// "invalid character." This function can not fail, we always just try to do +// our best for crazy input here since web pages can set it themselves. 
+// +// This will convert the given input into the output encoding that the given +// character set converter object provides. The converter will only be called +// if necessary, for ASCII input, no conversions are necessary. +// +// The converter can be NULL. In this case, the output encoding will be UTF-8. +COMPONENT_EXPORT(URL) +void CanonicalizeQuery(const char* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output, + Component* out_query); +COMPONENT_EXPORT(URL) +void CanonicalizeQuery(const gurl_base::char16* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output, + Component* out_query); + +// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only +// canonicalizer that does not produce ASCII output). The output is +// guaranteed to be valid UTF-8. +// +// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use +// the "Unicode replacement character" for the confusing bits and copy the rest. +COMPONENT_EXPORT(URL) +void CanonicalizeRef(const char* spec, + const Component& path, + CanonOutput* output, + Component* out_path); +COMPONENT_EXPORT(URL) +void CanonicalizeRef(const gurl_base::char16* spec, + const Component& path, + CanonOutput* output, + Component* out_path); + +// Full canonicalizer --------------------------------------------------------- +// +// These functions replace any string contents, rather than append as above. +// See the above piece-by-piece functions for information specific to +// canonicalizing individual components. +// +// The output will be ASCII except the reference fragment, which may be UTF-8. +// +// The 8-bit versions require UTF-8 encoding. + +// Use for standard URLs with authorities and paths. 
+COMPONENT_EXPORT(URL) +bool CanonicalizeStandardURL(const char* spec, + int spec_len, + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizeStandardURL(const gurl_base::char16* spec, + int spec_len, + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Use for file URLs. +COMPONENT_EXPORT(URL) +bool CanonicalizeFileURL(const char* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizeFileURL(const gurl_base::char16* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Use for filesystem URLs. +COMPONENT_EXPORT(URL) +bool CanonicalizeFileSystemURL(const char* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizeFileSystemURL(const gurl_base::char16* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Use for path URLs such as javascript. This does not modify the path in any +// way, for example, by escaping it. +COMPONENT_EXPORT(URL) +bool CanonicalizePathURL(const char* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizePathURL(const gurl_base::char16* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed); + +// Use for mailto URLs. This "canonicalizes" the URL into a path and query +// component. It does not attempt to merge "to" fields. It uses UTF-8 for +// the query encoding if there is a query. 
This is because a mailto URL is +// really intended for an external mail program, and the encoding of a page, +// etc. which would influence a query encoding normally are irrelevant. +COMPONENT_EXPORT(URL) +bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizeMailtoURL(const gurl_base::char16* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed); + +// Part replacer -------------------------------------------------------------- + +// Internal structure used for storing separate strings for each component. +// The basic canonicalization functions use this structure internally so that +// component replacement (different strings for different components) can be +// treated on the same code path as regular canonicalization (the same string +// for each component). +// +// A Parsed structure usually goes along with this. Those components identify +// offsets within these strings, so that they can all be in the same string, +// or spread arbitrarily across different ones. +// +// This structures does not own any data. It is the caller's responsibility to +// ensure that the data the pointers point to stays in scope and is not +// modified. +template<typename CHAR> +struct URLComponentSource { + // Constructor normally used by callers wishing to replace components. This + // will make them all NULL, which is no replacement. The caller would then + // override the components they want to replace. + URLComponentSource() + : scheme(NULL), + username(NULL), + password(NULL), + host(NULL), + port(NULL), + path(NULL), + query(NULL), + ref(NULL) { + } + + // Constructor normally used internally to initialize all the components to + // point to the same spec. 
+ explicit URLComponentSource(const CHAR* default_value) + : scheme(default_value), + username(default_value), + password(default_value), + host(default_value), + port(default_value), + path(default_value), + query(default_value), + ref(default_value) { + } + + const CHAR* scheme; + const CHAR* username; + const CHAR* password; + const CHAR* host; + const CHAR* port; + const CHAR* path; + const CHAR* query; + const CHAR* ref; +}; + +// This structure encapsulates information on modifying a URL. Each component +// may either be left unchanged, replaced, or deleted. +// +// By default, each component is unchanged. For those components that should be +// modified, call either Set* or Clear* to modify it. +// +// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT +// IN SCOPE BY THE CALLER for as long as this object exists! +// +// Prefer the 8-bit replacement version if possible since it is more efficient. +template<typename CHAR> +class Replacements { + public: + Replacements() { + } + + // Scheme + void SetScheme(const CHAR* s, const Component& comp) { + sources_.scheme = s; + components_.scheme = comp; + } + // Note: we don't have a ClearScheme since this doesn't make any sense. 
+ bool IsSchemeOverridden() const { return sources_.scheme != NULL; } + + // Username + void SetUsername(const CHAR* s, const Component& comp) { + sources_.username = s; + components_.username = comp; + } + void ClearUsername() { + sources_.username = Placeholder(); + components_.username = Component(); + } + bool IsUsernameOverridden() const { return sources_.username != NULL; } + + // Password + void SetPassword(const CHAR* s, const Component& comp) { + sources_.password = s; + components_.password = comp; + } + void ClearPassword() { + sources_.password = Placeholder(); + components_.password = Component(); + } + bool IsPasswordOverridden() const { return sources_.password != NULL; } + + // Host + void SetHost(const CHAR* s, const Component& comp) { + sources_.host = s; + components_.host = comp; + } + void ClearHost() { + sources_.host = Placeholder(); + components_.host = Component(); + } + bool IsHostOverridden() const { return sources_.host != NULL; } + + // Port + void SetPort(const CHAR* s, const Component& comp) { + sources_.port = s; + components_.port = comp; + } + void ClearPort() { + sources_.port = Placeholder(); + components_.port = Component(); + } + bool IsPortOverridden() const { return sources_.port != NULL; } + + // Path + void SetPath(const CHAR* s, const Component& comp) { + sources_.path = s; + components_.path = comp; + } + void ClearPath() { + sources_.path = Placeholder(); + components_.path = Component(); + } + bool IsPathOverridden() const { return sources_.path != NULL; } + + // Query + void SetQuery(const CHAR* s, const Component& comp) { + sources_.query = s; + components_.query = comp; + } + void ClearQuery() { + sources_.query = Placeholder(); + components_.query = Component(); + } + bool IsQueryOverridden() const { return sources_.query != NULL; } + + // Ref + void SetRef(const CHAR* s, const Component& comp) { + sources_.ref = s; + components_.ref = comp; + } + void ClearRef() { + sources_.ref = Placeholder(); + components_.ref = 
Component(); + } + bool IsRefOverridden() const { return sources_.ref != NULL; } + + // Getters for the internal data. See the variables below for how the + // information is encoded. + const URLComponentSource<CHAR>& sources() const { return sources_; } + const Parsed& components() const { return components_; } + + private: + // Returns a pointer to a static empty string that is used as a placeholder + // to indicate a component should be deleted (see below). + const CHAR* Placeholder() { + static const CHAR empty_cstr = 0; + return &empty_cstr; + } + + // We support three states: + // + // Action | Source Component + // -----------------------+-------------------------------------------------- + // Don't change component | NULL (unused) + // Replace component | (replacement string) (replacement component) + // Delete component | (non-NULL) (invalid component: (0,-1)) + // + // We use a pointer to the empty string for the source when the component + // should be deleted. + URLComponentSource<CHAR> sources_; + Parsed components_; +}; + +// The base must be an 8-bit canonical URL. +COMPONENT_EXPORT(URL) +bool ReplaceStandardURL(const char* base, + const Parsed& base_parsed, + const Replacements<char>& replacements, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceStandardURL(const char* base, + const Parsed& base_parsed, + const Replacements<gurl_base::char16>& replacements, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Filesystem URLs can only have the path, query, or ref replaced. +// All other components will be ignored. 
+COMPONENT_EXPORT(URL) +bool ReplaceFileSystemURL(const char* base, + const Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceFileSystemURL(const char* base, + const Parsed& base_parsed, + const Replacements<gurl_base::char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Replacing some parts of a file URL is not permitted. Everything except +// the host, path, query, and ref will be ignored. +COMPONENT_EXPORT(URL) +bool ReplaceFileURL(const char* base, + const Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceFileURL(const char* base, + const Parsed& base_parsed, + const Replacements<gurl_base::char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Path URLs can only have the scheme and path replaced. All other components +// will be ignored. +COMPONENT_EXPORT(URL) +bool ReplacePathURL(const char* base, + const Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplacePathURL(const char* base, + const Parsed& base_parsed, + const Replacements<gurl_base::char16>& replacements, + CanonOutput* output, + Parsed* new_parsed); + +// Mailto URLs can only have the scheme, path, and query replaced. +// All other components will be ignored. 
+COMPONENT_EXPORT(URL) +bool ReplaceMailtoURL(const char* base, + const Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceMailtoURL(const char* base, + const Parsed& base_parsed, + const Replacements<gurl_base::char16>& replacements, + CanonOutput* output, + Parsed* new_parsed); + +// Relative URL --------------------------------------------------------------- + +// Given an input URL or URL fragment |fragment|, determines if it is a +// relative or absolute URL and places the result into |*is_relative|. If it is +// relative, the relevant portion of the URL will be placed into +// |*relative_component| (there may have been trimmed whitespace, for example). +// This value is passed to ResolveRelativeURL. If the input is not relative, +// this value is UNDEFINED (it may be changed by the function). +// +// Returns true on success (we successfully determined the URL is relative or +// not). Failure means that the combination of URLs doesn't make any sense. +// +// The base URL should always be canonical, therefore is ASCII. +COMPONENT_EXPORT(URL) +bool IsRelativeURL(const char* base, + const Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + Component* relative_component); +COMPONENT_EXPORT(URL) +bool IsRelativeURL(const char* base, + const Parsed& base_parsed, + const gurl_base::char16* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + Component* relative_component); + +// Given a canonical parsed source URL, a URL fragment known to be relative, +// and the identified relevant portion of the relative URL (computed by +// IsRelativeURL), this produces a new parsed canonical URL in |output| and +// |out_parsed|. +// +// It also requires a flag indicating whether the base URL is a file: URL +// which triggers additional logic. 
+// +// The base URL should be canonical and have a host (may be empty for file +// URLs) and a path. If it doesn't have these, we can't resolve relative +// URLs off of it and will return the base as the output with an error flag. +// Because it is canonical it should also be ASCII. +// +// The query charset converter follows the same rules as CanonicalizeQuery. +// +// Returns true on success. On failure, the output will be "something +// reasonable" that will be consistent and valid, just probably not what +// was intended by the web page author or caller. +COMPONENT_EXPORT(URL) +bool ResolveRelativeURL(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed); +COMPONENT_EXPORT(URL) +bool ResolveRelativeURL(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const gurl_base::char16* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed); + +} // namespace url + +#endif // URL_URL_CANON_H_
diff --git a/url/url_canon_etc.cc b/url/url_canon_etc.cc new file mode 100644 index 0000000..23d1235 --- /dev/null +++ b/url/url_canon_etc.cc
@@ -0,0 +1,419 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Canonicalizers for random bits that aren't big enough for their own files. + +#include <string.h> + +#include "url/url_canon.h" +#include "url/url_canon_internal.h" + +namespace url { + +namespace { + +// Returns true if the given character should be removed from the middle of a +// URL. +inline bool IsRemovableURLWhitespace(int ch) { + return ch == '\r' || ch == '\n' || ch == '\t'; +} + +// Backend for RemoveURLWhitespace (see declaration in url_canon.h). +// It sucks that we have to do this, since this takes about 13% of the total URL +// canonicalization time. +template <typename CHAR> +const CHAR* DoRemoveURLWhitespace(const CHAR* input, + int input_len, + CanonOutputT<CHAR>* buffer, + int* output_len, + bool* potentially_dangling_markup) { + // Fast verification that there's nothing that needs removal. This is the 99% + // case, so we want it to be fast and don't care about impacting the speed + // when we do find whitespace. + int found_whitespace = false; + for (int i = 0; i < input_len; i++) { + if (!IsRemovableURLWhitespace(input[i])) + continue; + found_whitespace = true; + break; + } + + if (!found_whitespace) { + // Didn't find any whitespace, we don't need to do anything. We can just + // return the input as the output. + *output_len = input_len; + return input; + } + + // Skip whitespace removal for `data:` URLs. + // + // TODO(mkwst): Ideally, this would use something like `gurl_base::StartsWith`, but + // that turns out to be difficult to do correctly given this function's + // character type templating. + if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' && + input[3] == 'a' && input[4] == ':') { + *output_len = input_len; + return input; + } + + // Remove the whitespace into the new buffer and return it. 
+ for (int i = 0; i < input_len; i++) { + if (!IsRemovableURLWhitespace(input[i])) { + if (potentially_dangling_markup && input[i] == 0x3C) + *potentially_dangling_markup = true; + buffer->push_back(input[i]); + } + } + *output_len = buffer->length(); + return buffer->data(); +} + +// Contains the canonical version of each possible input letter in the scheme +// (basically, lower-cased). The corresponding entry will be 0 if the letter +// is not allowed in a scheme. +const char kSchemeCanonical[0x80] = { +// 00-1f: all are invalid + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// ' ' ! " # $ % & ' ( ) * + , - . / + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 , +// @ A B C D E F G H I J K L M N O + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// P Q R S T U V W X Y Z [ \ ] ^ _ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0, +// ` a b c d e f g h i j k l m n o + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// p q r s t u v w x y z { | } ~ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 }; + +// This could be a table lookup as well by setting the high bit for each +// valid character, but it's only called once per URL, and it makes the lookup +// table easier to read not having extra stuff in it. +inline bool IsSchemeFirstChar(unsigned char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +template<typename CHAR, typename UCHAR> +bool DoScheme(const CHAR* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme) { + if (scheme.len <= 0) { + // Scheme is unspecified or empty, convert to empty by appending a colon. 
+ *out_scheme = Component(output->length(), 0); + output->push_back(':'); + return false; + } + + // The output scheme starts from the current position. + out_scheme->begin = output->length(); + + // Danger: it's important that this code does not strip any characters; + // it only emits the canonical version (be it valid or escaped) for each + // of the input characters. Stripping would put it out of sync with + // FindAndCompareScheme, which could cause some security checks on + // schemes to be incorrect. + bool success = true; + int end = scheme.end(); + for (int i = scheme.begin; i < end; i++) { + UCHAR ch = static_cast<UCHAR>(spec[i]); + char replacement = 0; + if (ch < 0x80) { + if (i == scheme.begin) { + // Need to do a special check for the first letter of the scheme. + if (IsSchemeFirstChar(static_cast<unsigned char>(ch))) + replacement = kSchemeCanonical[ch]; + } else { + replacement = kSchemeCanonical[ch]; + } + } + + if (replacement) { + output->push_back(replacement); + } else if (ch == '%') { + // Canonicalizing the scheme multiple times should lead to the same + // result. Since invalid characters will be escaped, we need to preserve + // the percent to avoid multiple escaping. The scheme will be invalid. + success = false; + output->push_back('%'); + } else { + // Invalid character, store it but mark this scheme as invalid. + success = false; + + // This will escape the output and also handle encoding issues. + // Ignore the return value since we already failed. + AppendUTF8EscapedChar(spec, &i, end, output); + } + } + + // The output scheme ends with the the current position, before appending + // the colon. + out_scheme->len = output->length() - out_scheme->begin; + output->push_back(':'); + return success; +} + +// The username and password components reference ranges in the corresponding +// *_spec strings. 
Typically, these specs will be the same (we're +// canonicalizing a single source string), but may be different when +// replacing components. +template<typename CHAR, typename UCHAR> +bool DoUserInfo(const CHAR* username_spec, + const Component& username, + const CHAR* password_spec, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password) { + if (username.len <= 0 && password.len <= 0) { + // Common case: no user info. We strip empty username/passwords. + *out_username = Component(); + *out_password = Component(); + return true; + } + + // Write the username. + out_username->begin = output->length(); + if (username.len > 0) { + // This will escape characters not valid for the username. + AppendStringOfType(&username_spec[username.begin], username.len, + CHAR_USERINFO, output); + } + out_username->len = output->length() - out_username->begin; + + // When there is a password, we need the separator. Note that we strip + // empty but specified passwords. + if (password.len > 0) { + output->push_back(':'); + out_password->begin = output->length(); + AppendStringOfType(&password_spec[password.begin], password.len, + CHAR_USERINFO, output); + out_password->len = output->length() - out_password->begin; + } else { + *out_password = Component(); + } + + output->push_back('@'); + return true; +} + +// Helper functions for converting port integers to strings. +inline void WritePortInt(char* output, int output_len, int port) { + _itoa_s(port, output, output_len, 10); +} + +// This function will prepend the colon if there will be a port. +template<typename CHAR, typename UCHAR> +bool DoPort(const CHAR* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port) { + int port_num = ParsePort(spec, port); + if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) { + *out_port = Component(); + return true; // Leave port empty. 
+ } + + if (port_num == PORT_INVALID) { + // Invalid port: We'll copy the text from the input so the user can see + // what the error was, and mark the URL as invalid by returning false. + output->push_back(':'); + out_port->begin = output->length(); + AppendInvalidNarrowString(spec, port.begin, port.end(), output); + out_port->len = output->length() - out_port->begin; + return false; + } + + // Convert port number back to an integer. Max port value is 5 digits, and + // the Parsed::ExtractPort will have made sure the integer is in range. + const int buf_size = 6; + char buf[buf_size]; + WritePortInt(buf, buf_size, port_num); + + // Append the port number to the output, preceded by a colon. + output->push_back(':'); + out_port->begin = output->length(); + for (int i = 0; i < buf_size && buf[i]; i++) + output->push_back(buf[i]); + + out_port->len = output->length() - out_port->begin; + return true; +} + +// clang-format off +// Percent-escape all "C0 controls" (0x00-0x1F) +// https://infra.spec.whatwg.org/#c0-control along with the characters ' ' +// (0x20), '"' (0x22), '<' (0x3C), '>' (0x3E), and '`' (0x60): +const bool kShouldEscapeCharInRef[0x80] = { +// Control characters (0x00-0x1F) + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, +// ' ' ! " # $ % & ' + true, false, true, false, false, false, false, false, +// ( ) * + , - . / + false, false, false, false, false, false, false, false, +// 0 1 2 3 4 5 6 7 + false, false, false, false, false, false, false, false, +// 8 9 : ; < = > ? 
+ false, false, false, false, true, false, true, false, +// @ A B C D E F G + false, false, false, false, false, false, false, false, +// H I J K L M N O + false, false, false, false, false, false, false, false, +// P Q R S T U V W + false, false, false, false, false, false, false, false, +// X Y Z [ \ ] ^ _ + false, false, false, false, false, false, false, false, +// ` a b c d e f g + true, false, false, false, false, false, false, false, +// h i j k l m n o + false, false, false, false, false, false, false, false, +// p q r s t u v w + false, false, false, false, false, false, false, false, +// x y z { | } ~ + false, false, false, false, false, false, false +}; +// clang-format on + +template<typename CHAR, typename UCHAR> +void DoCanonicalizeRef(const CHAR* spec, + const Component& ref, + CanonOutput* output, + Component* out_ref) { + if (ref.len < 0) { + // Common case of no ref. + *out_ref = Component(); + return; + } + + // Append the ref separator. Note that we need to do this even when the ref + // is empty but present. + output->push_back('#'); + out_ref->begin = output->length(); + + // Now iterate through all the characters, converting to UTF-8 and validating. + int end = ref.end(); + for (int i = ref.begin; i < end; i++) { + if (spec[i] == 0) { + // IE just strips NULLs, so we do too. 
+ continue; + } + + UCHAR current_char = static_cast<UCHAR>(spec[i]); + if (current_char < 0x80) { + if (kShouldEscapeCharInRef[current_char]) + AppendEscapedChar(static_cast<unsigned char>(spec[i]), output); + else + output->push_back(static_cast<char>(spec[i])); + } else { + AppendUTF8EscapedChar(spec, &i, end, output); + } + } + + out_ref->len = output->length() - out_ref->begin; +} + +} // namespace + +const char* RemoveURLWhitespace(const char* input, + int input_len, + CanonOutputT<char>* buffer, + int* output_len, + bool* potentially_dangling_markup) { + return DoRemoveURLWhitespace(input, input_len, buffer, output_len, + potentially_dangling_markup); +} + +const gurl_base::char16* RemoveURLWhitespace(const gurl_base::char16* input, + int input_len, + CanonOutputT<gurl_base::char16>* buffer, + int* output_len, + bool* potentially_dangling_markup) { + return DoRemoveURLWhitespace(input, input_len, buffer, output_len, + potentially_dangling_markup); +} + +char CanonicalSchemeChar(gurl_base::char16 ch) { + if (ch >= 0x80) + return 0; // Non-ASCII is not supported by schemes. 
+ return kSchemeCanonical[ch]; +} + +bool CanonicalizeScheme(const char* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme) { + return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme); +} + +bool CanonicalizeScheme(const gurl_base::char16* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme) { + return DoScheme<gurl_base::char16, gurl_base::char16>(spec, scheme, output, out_scheme); +} + +bool CanonicalizeUserInfo(const char* username_source, + const Component& username, + const char* password_source, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password) { + return DoUserInfo<char, unsigned char>( + username_source, username, password_source, password, + output, out_username, out_password); +} + +bool CanonicalizeUserInfo(const gurl_base::char16* username_source, + const Component& username, + const gurl_base::char16* password_source, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password) { + return DoUserInfo<gurl_base::char16, gurl_base::char16>( + username_source, username, password_source, password, + output, out_username, out_password); +} + +bool CanonicalizePort(const char* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port) { + return DoPort<char, unsigned char>(spec, port, + default_port_for_scheme, + output, out_port); +} + +bool CanonicalizePort(const gurl_base::char16* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port) { + return DoPort<gurl_base::char16, gurl_base::char16>(spec, port, default_port_for_scheme, + output, out_port); +} + +void CanonicalizeRef(const char* spec, + const Component& ref, + CanonOutput* output, + Component* out_ref) { + DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref); +} + +void CanonicalizeRef(const gurl_base::char16* spec, 
+ const Component& ref, + CanonOutput* output, + Component* out_ref) { + DoCanonicalizeRef<gurl_base::char16, gurl_base::char16>(spec, ref, output, out_ref); +} + +} // namespace url
diff --git a/url/url_canon_filesystemurl.cc b/url/url_canon_filesystemurl.cc new file mode 100644 index 0000000..9def892 --- /dev/null +++ b/url/url_canon_filesystemurl.cc
@@ -0,0 +1,135 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions for canonicalizing "filesystem:file:" URLs.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+#include "url/url_util.h"
+#include "url/url_util_internal.h"
+
+namespace url {
+
+namespace {
+
+// Outer-URL components are read through |source|, since replacements may have
+// overridden them; the inner URL can't be replaced, so it is read from |spec|.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeFileSystemURL(const CHAR* spec,
+                                 const URLComponentSource<CHAR>& source,
+                                 const Parsed& parsed,
+                                 CharsetConverter* charset_converter,
+                                 CanonOutput* output,
+                                 Parsed* new_parsed) {
+  // filesystem only uses {scheme, path, query, ref} -- clear the rest.
+  new_parsed->username.reset();
+  new_parsed->password.reset();
+  new_parsed->host.reset();
+  new_parsed->port.reset();
+
+  const Parsed* inner_parsed = parsed.inner_parsed();
+  Parsed new_inner_parsed;
+
+  // Scheme (known, so we don't bother running it through the more
+  // complicated scheme canonicalizer). The length excludes the colon.
+  new_parsed->scheme.begin = output->length();
+  output->Append("filesystem:", 11);
+  new_parsed->scheme.len = 10;
+
+  if (!parsed.inner_parsed() || !parsed.inner_parsed()->scheme.is_valid())
+    return false;
+
+  bool success = true;
+  SchemeType inner_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+  if (CompareSchemeComponent(spec, inner_parsed->scheme, url::kFileScheme)) {
+    new_inner_parsed.scheme.begin = output->length();
+    output->Append("file://", 7);
+    new_inner_parsed.scheme.len = 4;
+    success &= CanonicalizePath(spec, inner_parsed->path, output,
+                                &new_inner_parsed.path);
+  } else if (GetStandardSchemeType(spec, inner_parsed->scheme,
+                                   &inner_scheme_type)) {
+    if (inner_scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION) {
+      // Strip out the user information from the inner URL, if any.
+      inner_scheme_type = SCHEME_WITH_HOST_AND_PORT;
+    }
+    success = CanonicalizeStandardURL(
+        spec, parsed.inner_parsed()->Length(), *parsed.inner_parsed(),
+        inner_scheme_type, charset_converter, output, &new_inner_parsed);
+  } else {
+    // TODO(ericu): The URL is wrong, but should we try to output more of what
+    // we were given? Echoing back filesystem:mailto etc. doesn't seem all that
+    // useful.
+    return false;
+  }
+  // The filesystem type must be more than just a leading slash for validity.
+  success &= parsed.inner_parsed()->path.len > 1;
+
+  success &= CanonicalizePath(source.path, parsed.path, output,
+                              &new_parsed->path);
+
+  // Ignore failures for query/ref since the URL can probably still be loaded.
+  CanonicalizeQuery(source.query, parsed.query, charset_converter,
+                    output, &new_parsed->query);
+  CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+  if (success)
+    new_parsed->set_inner_parsed(new_inner_parsed);
+
+  return success;
+}
+
+}  // namespace
+
+bool CanonicalizeFileSystemURL(const char* spec,
+                               int spec_len,
+                               const Parsed& parsed,
+                               CharsetConverter* charset_converter,
+                               CanonOutput* output,
+                               Parsed* new_parsed) {
+  return DoCanonicalizeFileSystemURL<char, unsigned char>(
+      spec, URLComponentSource<char>(spec), parsed, charset_converter, output,
+      new_parsed);
+}
+
+bool CanonicalizeFileSystemURL(const gurl_base::char16* spec,
+                               int spec_len,
+                               const Parsed& parsed,
+                               CharsetConverter* charset_converter,
+                               CanonOutput* output,
+                               Parsed* new_parsed) {
+  return DoCanonicalizeFileSystemURL<gurl_base::char16, gurl_base::char16>(
+      spec, URLComponentSource<gurl_base::char16>(spec), parsed, charset_converter,
+      output, new_parsed);
+}
+
+bool ReplaceFileSystemURL(const char* base,
+                          const Parsed& base_parsed,
+                          const Replacements<char>& replacements,
+                          CharsetConverter* charset_converter,
+                          CanonOutput* output,
+                          Parsed* new_parsed) {
+  URLComponentSource<char> source(base);
+  Parsed parsed(base_parsed);
+  SetupOverrideComponents(base, replacements, &source, &parsed);
+  return DoCanonicalizeFileSystemURL<char, unsigned char>(
+      base, source, parsed, charset_converter, output, new_parsed);
+}
+
+bool ReplaceFileSystemURL(const char* base,
+                          const Parsed& base_parsed,
+                          const Replacements<gurl_base::char16>& replacements,
+                          CharsetConverter* charset_converter,
+                          CanonOutput* output,
+                          Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8;
+  URLComponentSource<char> source(base);
+  Parsed parsed(base_parsed);
+  SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+  return DoCanonicalizeFileSystemURL<char, unsigned char>(
+      base, source, parsed, charset_converter, output, new_parsed);
+}
+
+}  // namespace url
diff --git a/url/url_canon_fileurl.cc b/url/url_canon_fileurl.cc new file mode 100644 index 0000000..ef654c7 --- /dev/null +++ b/url/url_canon_fileurl.cc
@@ -0,0 +1,190 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions for canonicalizing "file:" URLs.
+
+#include "base/strings/string_util.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+
+namespace url {
+
+namespace {
+
+#ifdef WIN32
+
+// Given a pointer into the spec, this copies and canonicalizes the drive
+// letter and colon to the output, if one is found. If there is not a drive
+// spec, it won't do anything. The index of the next character in the input
+// spec is returned (after the colon when a drive spec is found, the begin
+// offset if one is not).
+template<typename CHAR>
+int FileDoDriveSpec(const CHAR* spec, int begin, int end,
+                    CanonOutput* output) {
+  // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
+  // (with backslashes instead of slashes as well).
+  int num_slashes = CountConsecutiveSlashes(spec, begin, end);
+  int after_slashes = begin + num_slashes;
+
+  if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end))
+    return begin;  // Haven't consumed any characters.
+
+  // A drive spec is the start of a path, so we need to add a slash for the
+  // authority terminator (typically the third slash).
+  output->push_back('/');
+
+  // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
+  // and that it is followed by a colon/pipe.
+
+  // Normalize Windows drive letters to uppercase.
+  if (gurl_base::IsAsciiLower(spec[after_slashes]))
+    output->push_back(static_cast<char>(spec[after_slashes] - 'a' + 'A'));
+  else
+    output->push_back(static_cast<char>(spec[after_slashes]));
+
+  // Normalize the character following it to a colon rather than pipe.
+  output->push_back(':');
+  return after_slashes + 2;
+}
+
+#endif  // WIN32
+
+template<typename CHAR, typename UCHAR>
+bool DoFileCanonicalizePath(const CHAR* spec,
+                            const Component& path,
+                            CanonOutput* output,
+                            Component* out_path) {
+  // Copies and normalizes the "c:" at the beginning, if present (WIN32 only).
+  out_path->begin = output->length();
+  int after_drive;
+#ifdef WIN32
+  after_drive = FileDoDriveSpec(spec, path.begin, path.end(), output);
+#else
+  after_drive = path.begin;
+#endif
+
+  // Copies the rest of the path, starting from the slash following the
+  // drive colon (if any, Windows only), or the first slash of the path.
+  bool success = true;
+  if (after_drive < path.end()) {
+    // Use the regular path canonicalizer to canonicalize the rest of the
+    // path. Give it a throwaway output component to write into; the full path
+    // component (including any drive spec) is computed below from |output|.
+    Component sub_path = MakeRange(after_drive, path.end());
+    Component fake_output_path;
+    success = CanonicalizePath(spec, sub_path, output, &fake_output_path);
+  } else {
+    // No input path, canonicalize to a slash.
+    output->push_back('/');
+  }
+
+  out_path->len = output->length() - out_path->begin;
+  return success;
+}
+
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
+                           const Parsed& parsed,
+                           CharsetConverter* query_converter,
+                           CanonOutput* output,
+                           Parsed* new_parsed) {
+  // Things we don't set in file: URLs.
+  new_parsed->username = Component();
+  new_parsed->password = Component();
+  new_parsed->port = Component();
+
+  // Scheme (known, so we don't bother running it through the more
+  // complicated scheme canonicalizer).
+  new_parsed->scheme.begin = output->length();
+  output->Append("file://", 7);
+  new_parsed->scheme.len = 4;
+
+  // Append the host. For many file URLs, this will be empty. For UNC, this
+  // will be present.
+  // TODO(brettw) This doesn't do any checking for host name validity. We
+  // should probably handle validity checking of UNC hosts differently than
+  // for regular IP hosts.
+  bool success = CanonicalizeHost(source.host, parsed.host,
+                                  output, &new_parsed->host);
+  success &= DoFileCanonicalizePath<CHAR, UCHAR>(source.path, parsed.path,
+                                    output, &new_parsed->path);
+  CanonicalizeQuery(source.query, parsed.query, query_converter,
+                    output, &new_parsed->query);
+
+  // Ignore failure for refs since the URL can probably still be loaded.
+  CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+  return success;
+}
+
+}  // namespace
+
+bool CanonicalizeFileURL(const char* spec,
+                         int spec_len,
+                         const Parsed& parsed,
+                         CharsetConverter* query_converter,
+                         CanonOutput* output,
+                         Parsed* new_parsed) {
+  return DoCanonicalizeFileURL<char, unsigned char>(
+      URLComponentSource<char>(spec), parsed, query_converter,
+      output, new_parsed);
+}
+
+bool CanonicalizeFileURL(const gurl_base::char16* spec,
+                         int spec_len,
+                         const Parsed& parsed,
+                         CharsetConverter* query_converter,
+                         CanonOutput* output,
+                         Parsed* new_parsed) {
+  return DoCanonicalizeFileURL<gurl_base::char16, gurl_base::char16>(
+      URLComponentSource<gurl_base::char16>(spec), parsed, query_converter,
+      output, new_parsed);
+}
+
+bool FileCanonicalizePath(const char* spec,
+                          const Component& path,
+                          CanonOutput* output,
+                          Component* out_path) {
+  return DoFileCanonicalizePath<char, unsigned char>(spec, path,
+                                                     output, out_path);
+}
+
+bool FileCanonicalizePath(const gurl_base::char16* spec,
+                          const Component& path,
+                          CanonOutput* output,
+                          Component* out_path) {
+  return DoFileCanonicalizePath<gurl_base::char16, gurl_base::char16>(spec, path,
+                                                     output, out_path);
+}
+
+bool ReplaceFileURL(const char* base,
+                    const Parsed& base_parsed,
+                    const Replacements<char>& replacements,
+                    CharsetConverter* query_converter,
+                    CanonOutput* output,
+                    Parsed* new_parsed) {
+  URLComponentSource<char> source(base);
+  Parsed parsed(base_parsed);
+  SetupOverrideComponents(base, replacements, &source, &parsed);
+  return DoCanonicalizeFileURL<char, unsigned char>(
+      source, parsed, query_converter, output, new_parsed);
+}
+
+bool ReplaceFileURL(const char* base,
+                    const Parsed& base_parsed,
+                    const Replacements<gurl_base::char16>& replacements,
+                    CharsetConverter* query_converter,
+                    CanonOutput* output,
+                    Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8;
+  URLComponentSource<char> source(base);
+  Parsed parsed(base_parsed);
+  SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+  return DoCanonicalizeFileURL<char, unsigned char>(
+      source, parsed, query_converter, output, new_parsed);
+}
+
+}  // namespace url
diff --git a/url/url_canon_host.cc b/url/url_canon_host.cc new file mode 100644 index 0000000..f83dacb --- /dev/null +++ b/url/url_canon_host.cc
@@ -0,0 +1,438 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "polyfills/base/logging.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+
+namespace url {
+
+namespace {
+
+// For reference, here's what IE supports:
+// Key: 0 (disallowed: failure if present in the input)
+//      + (allowed either escaped or unescaped, and unmodified)
+//      U (allowed escaped or unescaped but always unescaped if present in
+//        escaped form)
+//      E (allowed escaped or unescaped but always escaped if present in
+//        unescaped form)
+//      % (only allowed escaped in the input, will be unmodified).
+//      I left blank alpha numeric characters.
+//
+//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+//    -----------------------------------------------
+// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
+// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
+// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
+// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
+// 4   %
+// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
+// 6   E                                               <-- That's  `
+// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
+//
+// NOTE: I didn't actually test all the control characters. Some may be
+// disallowed in the input, but they are all accepted escaped except for 0.
+// I also didn't test if characters affecting HTML parsing are allowed
+// unescaped, e.g. (") or (#), which would indicate the beginning of the path.
+// Surprisingly, space is accepted in the input and always escaped.
+
+// This table lists the canonical version of all characters we allow in the
+// input, with 0 indicating it is disallowed. We use the magic kEsc value to
+// indicate that this character should be escaped. We are a little more
+// restrictive than IE, but less restrictive than Firefox.
+//
+// Note that we disallow the % character. We will allow it when part of an
+// escape sequence, of course, but this disallows "%25". Even though IE allows
+// it, allowing it would put us in a funny state. If there was an invalid
+// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
+// Allowing percents means we'll succeed a second time, so validity would change
+// based on how many times you run the canonicalizer. We prefer to always report
+// the same validity, so reject this.
+const unsigned char kEsc = 0xff;
+const unsigned char kHostCharLookup[0x80] = {
+// 00-1f: all are invalid
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
+    kEsc,kEsc,kEsc,kEsc,kEsc,  0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',  0,
+//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',  0 ,kEsc,kEsc,kEsc,  0 ,
+//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
+    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0 , ']',  0 , '_',
+//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
+    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,  0 ,  0 };
+
+// RFC1034 maximum FQDN length.
+constexpr int kMaxHostLength = 253;
+
+// Generous padding to account for the fact that UTS#46 normalization can cause
+// a long string to actually shrink and fit within the 253 character RFC1034
+// FQDN length limit. Note that this can still be too short for pathological
+// cases: An arbitrary number of characters (e.g. U+00AD SOFT HYPHEN) can be
+// removed from the input by UTS#46 processing. However, this should be
+// sufficient for all normally-encountered, non-abusive hostname strings.
+constexpr int kMaxHostBufferLength = kMaxHostLength*5;
+
+const int kTempHostBufferLen = 1024;
+typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
+typedef RawCanonOutputT<gurl_base::char16, kTempHostBufferLen> StackBufferW;
+
+// Scans a host name and fills in the output flags according to what we find.
+// |has_non_ascii| will be true if there are any non-7-bit characters, and
+// |has_escaped| will be true if there is a percent sign.
+template<typename CHAR, typename UCHAR>
+void ScanHostname(const CHAR* spec,
+                  const Component& host,
+                  bool* has_non_ascii,
+                  bool* has_escaped) {
+  int end = host.end();
+  *has_non_ascii = false;
+  *has_escaped = false;
+  for (int i = host.begin; i < end; i++) {
+    if (static_cast<UCHAR>(spec[i]) >= 0x80)
+      *has_non_ascii = true;
+    else if (spec[i] == '%')
+      *has_escaped = true;
+  }
+}
+
+// Canonicalizes a host name that is entirely 8-bit characters (even though
+// the type holding them may be 16 bits). Escaped characters will be unescaped.
+// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
+//
+// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
+// the output.
+//
+// This function is used in two situations:
+//
+//  * When the caller knows there is no non-ASCII or percent escaped
+//    characters. This is what DoHost does. The result will be a completely
+//    canonicalized host since we know nothing weird can happen (escaped
+//    characters could be unescaped to non-7-bit, so they have to be treated
+//    with suspicion at this point). It does not use the |has_non_ascii| flag.
+//
+//  * When the caller has an 8-bit string that may need unescaping.
+//    DoComplexHost calls us in this situation to do unescaping and validation.
+//    After this, it may do other IDN operations depending on the value of the
+//    |*has_non_ascii| flag.
+//
+// The return value indicates if the output is a potentially valid host name.
+template<typename INCHAR, typename OUTCHAR>
+bool DoSimpleHost(const INCHAR* host,
+                  int host_len,
+                  CanonOutputT<OUTCHAR>* output,
+                  bool* has_non_ascii) {
+  *has_non_ascii = false;
+
+  bool success = true;
+  for (int i = 0; i < host_len; ++i) {
+    unsigned int source = host[i];
+    if (source == '%') {
+      // Unescape first, if possible. Decode into a dedicated byte: writing
+      // through a cast of |&source| would set its low-order byte only on
+      // little-endian targets and corrupt the value on big-endian ones.
+      unsigned char unescaped = 0;
+      if (!DecodeEscaped(host, &i, host_len, &unescaped)) {
+        // Invalid escaped character. There is nothing that can make this
+        // host valid. We append an escaped percent so the URL looks reasonable
+        // and mark as failed.
+        AppendEscapedChar('%', output);
+        success = false;
+        continue;
+      }
+      // |source| is replaced only when the decode operation succeeded.
+      source = unescaped;
+    }
+
+    if (source < 0x80) {
+      // We have ASCII input, we can use our lookup table.
+      unsigned char replacement = kHostCharLookup[source];
+      if (!replacement) {
+        // Invalid character, add it as percent-escaped and mark as failed.
+        AppendEscapedChar(source, output);
+        success = false;
+      } else if (replacement == kEsc) {
+        // This character is valid but should be escaped.
+        AppendEscapedChar(source, output);
+      } else {
+        // Common case, the given character is valid in a hostname, the lookup
+        // table tells us the canonical representation of that character (lower
+        // cased).
+        output->push_back(replacement);
+      }
+    } else {
+      // It's a non-ascii char. Just push it to the output.
+      // In case where we have char16 input, and char output it's safe to
+      // cast char16->char only if input string was converted to ASCII.
+      output->push_back(static_cast<OUTCHAR>(source));
+      *has_non_ascii = true;
+    }
+  }
+  return success;
+}
+
+// Canonicalizes a host that requires IDN conversion. Returns true on success.
+bool DoIDNHost(const gurl_base::char16* src, int src_len, CanonOutput* output) {
+  int original_output_len = output->length();  // So we can rewind below.
+
+  // We need to escape URL before doing IDN conversion, since punycode strings
+  // cannot be escaped after they are created.
+  RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
+  bool has_non_ascii;
+  DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
+  if (url_escaped_host.length() > kMaxHostBufferLength) {
+    AppendInvalidNarrowString(src, 0, src_len, output);
+    return false;
+  }
+
+  StackBufferW wide_output;
+  if (!IDNToASCII(url_escaped_host.data(),
+                  url_escaped_host.length(),
+                  &wide_output)) {
+    // Some error, give up. This will write some reasonable looking
+    // representation of the string to the output.
+    AppendInvalidNarrowString(src, 0, src_len, output);
+    return false;
+  }
+
+  // Now we check the ASCII output like a normal host. It will also handle
+  // unescaping. Although we unescaped everything before this function call, if
+  // somebody does %00 as fullwidth, ICU will convert this to ASCII.
+  bool success = DoSimpleHost(wide_output.data(),
+                              wide_output.length(),
+                              output, &has_non_ascii);
+  if (has_non_ascii) {
+    // ICU generated something that DoSimpleHost didn't think looked like
+    // ASCII. This is quite rare, but ICU might convert some characters to
+    // percent signs which might generate new escape sequences which might in
+    // turn be invalid. An example is U+FE6A "small percent" which ICU will
+    // name prep into an ASCII percent and then we can interpret the following
+    // characters as escaped characters.
+    //
+    // If DoSimpleHost didn't think the output was ASCII, just escape the
+    // thing we gave ICU and give up. DoSimpleHost will have handled a further
+    // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
+    // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
+    // do more (like handle escaped non-ASCII sequences). Handling the escaped
+    // ASCII isn't strictly necessary, but DoSimpleHost handles this case
+    // anyway so we handle it.
+    output->set_length(original_output_len);
+    AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
+                              output);
+    return false;
+  }
+  return success;
+}
+
+// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
+// UTF-16. The has_escaped flag should be set if the input string requires
+// unescaping.
+bool DoComplexHost(const char* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  // Save the current position in the output. We may write stuff and rewind it
+  // below, so we need to know where to rewind to.
+  int begin_length = output->length();
+
+  // Points to the UTF-8 data we want to convert. This will either be the
+  // input or the unescaped version written to |*output| if necessary.
+  const char* utf8_source;
+  int utf8_source_len;
+  if (has_escaped) {
+    // Unescape before converting to UTF-16 for IDN. We write this into the
+    // output because it most likely does not require IDNization, and we can
+    // save another huge stack buffer. It will be replaced below if it requires
+    // IDN. This will also update our non-ASCII flag so we know whether the
+    // unescaped input requires IDN.
+    if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
+      // Error with some escape sequence. We'll call the current output
+      // complete. DoSimpleHost will have written some "reasonable" output.
+      return false;
+    }
+
+    // Unescaping may have left us with ASCII input, in which case the
+    // unescaped version we wrote to output is complete.
+    if (!has_non_ascii) {
+      return true;
+    }
+
+    // Save the pointer into the data that was just converted (it may be
+    // appended to other data in the output buffer).
+    utf8_source = &output->data()[begin_length];
+    utf8_source_len = output->length() - begin_length;
+  } else {
+    // We don't need to unescape, use input for IDNization later. (We know the
+    // input has non-ASCII, or the simple version would have been called
+    // instead of us.)
+    utf8_source = host;
+    utf8_source_len = host_len;
+  }
+
+  // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
+  // Above, we may have used the output to write the unescaped values to, so
+  // we have to rewind it to where we started after we convert it to UTF-16.
+  StackBufferW utf16;
+  if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
+    // In this error case, the input may or may not be the output.
+    StackBuffer utf8;
+    for (int i = 0; i < utf8_source_len; i++)
+      utf8.push_back(utf8_source[i]);
+    output->set_length(begin_length);
+    AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
+    return false;
+  }
+  output->set_length(begin_length);
+
+  // This will call DoSimpleHost which will do normal ASCII canonicalization
+  // and also check for IP addresses in the output.
+  return DoIDNHost(utf16.data(), utf16.length(), output);
+}
+
+// UTF-16 convert host to its ASCII version. The set up is already ready for
+// the backend, so we just pass through. The has_escaped flag should be set if
+// the input string requires unescaping.
+bool DoComplexHost(const gurl_base::char16* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  if (has_escaped) {
+    // Yikes, we have escaped characters with wide input. The escaped
+    // characters should be interpreted as UTF-8. To solve this problem,
+    // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
+    //
+    // We don't bother to optimize the conversion in the ASCII case (which
+    // *could* just be a copy) and use the UTF-8 path, because it should be
+    // very rare that host names have escaped characters, and it is relatively
+    // fast to do the conversion anyway.
+    StackBuffer utf8;
+    if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
+      AppendInvalidNarrowString(host, 0, host_len, output);
+      return false;
+    }
+
+    // Once we convert to UTF-8, we can use the 8-bit version of the complex
+    // host handling code above.
+    return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
+                         has_escaped, output);
+  }
+
+  // No unescaping necessary, we can safely pass the input to ICU. This
+  // function will only get called if we either have escaped or non-ascii
+  // input, so it's safe to just use ICU now. Even if the input is ASCII,
+  // this function will do the right thing (just slower than we could).
+  return DoIDNHost(host, host_len, output);
+}
+
+// Canonicalizes the |host| component of |spec| into |output|, without the IP
+// address check done by DoHost. Returns whether the host is potentially valid.
+template <typename CHAR, typename UCHAR>
+bool DoHostSubstring(const CHAR* spec,
+                     const Component& host,
+                     CanonOutput* output) {
+  bool has_non_ascii, has_escaped;
+  ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
+
+  if (has_non_ascii || has_escaped) {
+    return DoComplexHost(&spec[host.begin], host.len, has_non_ascii,
+                         has_escaped, output);
+  }
+
+  const bool success =
+      DoSimpleHost(&spec[host.begin], host.len, output, &has_non_ascii);
+  GURL_DCHECK(!has_non_ascii);
+  return success;
+}
+
+// Canonicalizes |host| into |output| and fills in |host_info|. The family is
+// NEUTRAL for an empty host and BROKEN on failure; otherwise it is left as set
+// by CanonicalizeIPAddress. |out_host| is the range written to |output|.
+template <typename CHAR, typename UCHAR>
+void DoHost(const CHAR* spec,
+            const Component& host,
+            CanonOutput* output,
+            CanonHostInfo* host_info) {
+  if (host.len <= 0) {
+    // Empty hosts don't need anything.
+    host_info->family = CanonHostInfo::NEUTRAL;
+    host_info->out_host = Component();
+    return;
+  }
+
+  // Keep track of output's initial length, so we can rewind later.
+  const int output_begin = output->length();
+
+  if (DoHostSubstring<CHAR, UCHAR>(spec, host, output)) {
+    // After all the other canonicalization, check if we ended up with an IP
+    // address. IP addresses are small, so writing into this temporary buffer
+    // should not cause an allocation.
+    RawCanonOutput<64> canon_ip;
+    CanonicalizeIPAddress(output->data(),
+                          MakeRange(output_begin, output->length()),
+                          &canon_ip, host_info);
+
+    // If we got an IPv4/IPv6 address, copy the canonical form back to the
+    // real buffer. Otherwise, it's a hostname or broken IP, in which case
+    // we just leave it in place.
+    if (host_info->IsIPAddress()) {
+      output->set_length(output_begin);
+      output->Append(canon_ip.data(), canon_ip.length());
+    }
+  } else {
+    // Canonicalization failed. Set BROKEN to notify the caller.
+    host_info->family = CanonHostInfo::BROKEN;
+  }
+
+  host_info->out_host = MakeRange(output_begin, output->length());
+}
+
+}  // namespace
+
+bool CanonicalizeHost(const char* spec,
+                      const Component& host,
+                      CanonOutput* output,
+                      Component* out_host) {
+  CanonHostInfo host_info;
+  DoHost<char, unsigned char>(spec, host, output, &host_info);
+  *out_host = host_info.out_host;
+  return (host_info.family != CanonHostInfo::BROKEN);
+}
+
+bool CanonicalizeHost(const gurl_base::char16* spec,
+                      const Component& host,
+                      CanonOutput* output,
+                      Component* out_host) {
+  CanonHostInfo host_info;
+  DoHost<gurl_base::char16, gurl_base::char16>(spec, host, output, &host_info);
+  *out_host = host_info.out_host;
+  return (host_info.family != CanonHostInfo::BROKEN);
+}
+
+void CanonicalizeHostVerbose(const char* spec,
+                             const Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo* host_info) {
+  DoHost<char, unsigned char>(spec, host, output, host_info);
+}
+
+void CanonicalizeHostVerbose(const gurl_base::char16* spec,
+                             const Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo* host_info) {
+  DoHost<gurl_base::char16, gurl_base::char16>(spec, host, output, host_info);
+}
+
+bool CanonicalizeHostSubstring(const char* spec,
+                               const Component& host,
+                               CanonOutput* output) {
+  return DoHostSubstring<char, unsigned char>(spec, host, output);
+}
+
+bool CanonicalizeHostSubstring(const gurl_base::char16* spec,
+                               const Component& host,
+                               CanonOutput* output) {
+  return DoHostSubstring<gurl_base::char16, gurl_base::char16>(spec, host, output);
+}
+
+}  // namespace url
diff --git a/url/url_canon_icu.cc b/url/url_canon_icu.cc new file mode 100644 index 0000000..a9a32fd --- /dev/null +++ b/url/url_canon_icu.cc
@@ -0,0 +1,116 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// ICU-based character set converter.
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "polyfills/base/logging.h"
+#include <unicode/ucnv.h>
+#include <unicode/ucnv_cb.h>
+#include <unicode/utypes.h>
+#include "url/url_canon_icu.h"
+#include "url/url_canon_internal.h"  // for _itoa_s
+
+namespace url {
+
+namespace {
+
+// Called when converting a character that can not be represented, this will
+// append an escaped version of the numerical character reference for that code
+// point. It is of the form "&#1234;" and we will escape the non-digits to
+// "%26%231234%3B". Why? This is what Netscape did back in the olden days.
+void appendURLEscapedChar(const void* context,
+                          UConverterFromUnicodeArgs* from_args,
+                          const UChar* code_units,
+                          int32_t length,
+                          UChar32 code_point,
+                          UConverterCallbackReason reason,
+                          UErrorCode* err) {
+  if (reason == UCNV_UNASSIGNED) {
+    *err = U_ZERO_ERROR;
+
+    const static int prefix_len = 6;
+    const static char prefix[prefix_len + 1] = "%26%23";  // "&#" percent-escaped
+    ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err);
+
+    GURL_DCHECK(code_point < 0x110000);
+    char number[8];  // Max Unicode code point is 7 digits.
+    _itoa_s(code_point, number, 10);
+    int number_len = static_cast<int>(strlen(number));
+    ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err);
+
+    const static int postfix_len = 3;
+    const static char postfix[postfix_len + 1] = "%3B";  // ";" percent-escaped
+    ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err);
+  }
+}
+
+// A class for scoping the installation of the invalid character callback.
+class AppendHandlerInstaller {
+ public:
+  // The owner of this object must ensure that the converter is alive for the
+  // duration of this object's lifetime.
+  explicit AppendHandlerInstaller(UConverter* converter)
+      : converter_(converter) {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
+                          &old_callback_, &old_context_, &err);
+  }
+
+  ~AppendHandlerInstaller() {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
+  }
+
+  // Not copyable: a copy would restore the saved callback a second time.
+  AppendHandlerInstaller(const AppendHandlerInstaller&) = delete;
+  AppendHandlerInstaller& operator=(const AppendHandlerInstaller&) = delete;
+
+ private:
+  UConverter* converter_;
+
+  UConverterFromUCallback old_callback_;
+  const void* old_context_;
+};
+
+}  // namespace
+
+ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
+    : converter_(converter) {
+}
+
+ICUCharsetConverter::~ICUCharsetConverter() = default;
+
+// Converts |input| with the ICU converter, appending to |output| and growing
+// it (then retrying the conversion) whenever ICU reports a buffer overflow.
+void ICUCharsetConverter::ConvertFromUTF16(const gurl_base::char16* input,
+                                           int input_len,
+                                           CanonOutput* output) {
+  // Install our error handler. It will be called for character that can not
+  // be represented in the destination character set.
+  AppendHandlerInstaller handler(converter_);
+
+  int begin_offset = output->length();
+  int dest_capacity = output->capacity() - begin_offset;
+
+  do {
+    UErrorCode err = U_ZERO_ERROR;
+    char* dest = &output->data()[begin_offset];
+    int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity,
+                                            input, input_len, &err);
+    if (err != U_BUFFER_OVERFLOW_ERROR) {
+      output->set_length(begin_offset + required_capacity);
+      return;
+    }
+
+    // Output didn't fit, expand
+    dest_capacity = required_capacity;
+    output->Resize(begin_offset + dest_capacity);
+  } while (true);
+}
+
+}  // namespace url
diff --git a/url/url_canon_icu.h b/url/url_canon_icu.h new file mode 100644 index 0000000..33fc863 --- /dev/null +++ b/url/url_canon_icu.h
@@ -0,0 +1,40 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_ICU_H_
+#define URL_URL_CANON_ICU_H_
+
+// ICU integration functions.
+
+#include "base/compiler_specific.h"
+#include "polyfills/base/component_export.h"
+#include "url/url_canon.h"
+
+typedef struct UConverter UConverter;
+
+namespace url {
+
+// A CharsetConverter implementation that interfaces the canonicalizer with
+// ICU's conversion routines through a caller-supplied UConverter.
+class COMPONENT_EXPORT(URL) ICUCharsetConverter : public CharsetConverter {
+ public:
+  // Constructs a converter using an already-existing ICU character set
+  // converter. This converter is NOT owned by this object; the lifetime must
+  // be managed by the creator such that it is alive as long as this is.
+  ICUCharsetConverter(UConverter* converter);
+
+  ~ICUCharsetConverter() override;
+  // Appends |input| (UTF-16), converted with the ICU converter, to |output|.
+  void ConvertFromUTF16(const gurl_base::char16* input,
+                        int input_len,
+                        CanonOutput* output) override;
+
+ private:
+  // The ICU converter, not owned by this class.
+  UConverter* converter_;
+};
+
+}  // namespace url
+
+#endif  // URL_URL_CANON_ICU_H_
diff --git a/url/url_canon_icu_unittest.cc b/url/url_canon_icu_unittest.cc new file mode 100644 index 0000000..55fd58f --- /dev/null +++ b/url/url_canon_icu_unittest.cc
@@ -0,0 +1,176 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <string.h>
+
+#include <string>
+
+#include "base/stl_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include <unicode/ucnv.h>
+#include "url/url_canon.h"
+#include "url/url_canon_icu.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_test_utils.h"
+
+namespace url {
+
+namespace {
+
+// Wrapper around a UConverter object that manages creation and destruction.
+class UConvScoper {
+ public:
+  // Opens a converter for |charset_name|. The ICU error code is not kept;
+  // callers check converter() for null instead.
+  explicit UConvScoper(const char* charset_name) {
+    UErrorCode err = U_ZERO_ERROR;
+    converter_ = ucnv_open(charset_name, &err);
+  }
+
+  // Not copyable: a copy would close the same converter a second time.
+  UConvScoper(const UConvScoper&) = delete;
+  UConvScoper& operator=(const UConvScoper&) = delete;
+
+  ~UConvScoper() {
+    if (converter_)
+      ucnv_close(converter_);
+  }
+
+  // Returns the converter object, may be null.
+  UConverter* converter() const { return converter_; }
+
+ private:
+  UConverter* converter_;
+};
+
+// Checks ICUCharsetConverter::ConvertFromUTF16 against fixed inputs in several
+// encodings, then with inputs sized around the output's static capacity.
+TEST(URLCanonIcuTest, ICUCharsetConverter) {
+  struct ICUCase {
+    const wchar_t* input;
+    const char* encoding;
+    const char* expected;
+  } icu_cases[] = {
+    // UTF-8.
+    {L"Hello, world", "utf-8", "Hello, world"},
+    {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
+    // Non-BMP UTF-8.
+    {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
+    // Big5
+    {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
+    // Unrepresentable character in the destination set.
+    {L"hello\x4f60\x06de\x597dworld", "big5",
+     "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
+  };
+
+  for (size_t i = 0; i < gurl_base::size(icu_cases); i++) {
+    UConvScoper conv(icu_cases[i].encoding);
+    ASSERT_TRUE(conv.converter() != nullptr);
+    ICUCharsetConverter converter(conv.converter());
+
+    std::string str;
+    StdStringCanonOutput output(&str);
+
+    gurl_base::string16 input_str(
+        test_utils::TruncateWStringToUTF16(icu_cases[i].input));
+    int input_len = static_cast<int>(input_str.length());
+    converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
+    output.Complete();
+
+    EXPECT_STREQ(icu_cases[i].expected, str.c_str());
+  }
+
+  // Test string sizes around the resize boundary for the output to make sure
+  // the converter resizes as needed.
+  const int static_size = 16;
+  UConvScoper conv("utf-8");
+  ASSERT_TRUE(conv.converter());
+  ICUCharsetConverter converter(conv.converter());
+  for (int i = static_size - 2; i <= static_size + 2; i++) {
+    // Make a string with the appropriate length.
+    gurl_base::string16 input;
+    for (int ch = 0; ch < i; ch++)
+      input.push_back('a');
+
+    RawCanonOutput<static_size> output;
+    converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
+                               &output);
+    EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
+  }
+}
+
+// Checks CanonicalizeQuery with an ICUCharsetConverter for both the 8-bit and
+// the 16-bit form of each input.
+TEST(URLCanonIcuTest, QueryWithConverter) {
+  struct QueryCase {
+    const char* input8;
+    const wchar_t* input16;
+    const char* encoding;
+    const char* expected;
+  } query_cases[] = {
+    // Regular ASCII case in some different encodings.
+    {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
+    {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
+    {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
+    // Chinese input/output
+    {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
+     "?q=%C4%E3%BA%C3"},
+    {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
+    // Unencodable character in the destination character set should be
+    // escaped. The escape sequence unescapes to be the entity name:
+    // "?q=Chinese&#65319;"
+    {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
+     "?q=Chinese%26%2365319%3B"},
+  };
+
+  for (size_t i = 0; i < gurl_base::size(query_cases); i++) {
+    Component out_comp;
+
+    UConvScoper conv(query_cases[i].encoding);
+    ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
+    ICUCharsetConverter converter(conv.converter());
+
+    if (query_cases[i].input8) {
+      int len = static_cast<int>(strlen(query_cases[i].input8));
+      Component in_comp(0, len);
+      std::string out_str;
+
+      StdStringCanonOutput output(&out_str);
+      CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output,
+                        &out_comp);
+      output.Complete();
+
+      EXPECT_EQ(query_cases[i].expected, out_str);
+    }
+
+    if (query_cases[i].input16) {
+      gurl_base::string16 input16(
+          test_utils::TruncateWStringToUTF16(query_cases[i].input16));
+      int len = static_cast<int>(input16.length());
+      Component in_comp(0, len);
+      std::string out_str;
+
+      StdStringCanonOutput output(&out_str);
+      CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
+                        &out_comp);
+      output.Complete();
+
+      EXPECT_EQ(query_cases[i].expected, out_str);
+    }
+  }
+
+  // Extra test for input with an embedded NUL character.
+  std::string out_str;
+  StdStringCanonOutput output(&out_str);
+  Component out_comp;
+  CanonicalizeQuery("a \x00z\x01", Component(0, 5), nullptr, &output,
+                    &out_comp);
+  output.Complete();
+  EXPECT_EQ("?a%20%00z%01", out_str);
+}
+
+}  // namespace
+
+}  // namespace url
diff --git a/url/url_canon_internal.cc b/url/url_canon_internal.cc new file mode 100644 index 0000000..961c3b0 --- /dev/null +++ b/url/url_canon_internal.cc
@@ -0,0 +1,433 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_canon_internal.h"
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include <cstdio>
+#include <string>
+
+#include "base/strings/utf_string_conversion_utils.h"
+
+namespace url {
+
+namespace {
+
+template<typename CHAR, typename UCHAR>
+void DoAppendStringOfType(const CHAR* source, int length,
+                          SharedCharTypes type,
+                          CanonOutput* output) {
+  for (int i = 0; i < length; i++) {
+    if (static_cast<UCHAR>(source[i]) >= 0x80) {
+      // ReadUTFChar will fill the code point with kUnicodeReplacementCharacter
+      // when the input is invalid, which is what we want.
+      unsigned code_point;
+      ReadUTFChar(source, &i, length, &code_point);
+      AppendUTF8EscapedValue(code_point, output);
+    } else {
+      // Just append the 7-bit character, possibly escaping it.
+      unsigned char uch = static_cast<unsigned char>(source[i]);
+      if (!IsCharOfType(uch, type))
+        AppendEscapedChar(uch, output);
+      else
+        output->push_back(uch);
+    }
+  }
+}
+
+// This function assumes the input values are all contained in 8-bit,
+// although it allows any type. It appends to |output| and returns nothing.
+template<typename CHAR, typename UCHAR>
+void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end,
+                                 CanonOutput* output) {
+  for (int i = begin; i < end; i++) {
+    UCHAR uch = static_cast<UCHAR>(spec[i]);
+    if (uch >= 0x80) {
+      // Handle UTF-8/16 encodings. This call will correctly handle the error
+      // case by appending the invalid character.
+      AppendUTF8EscapedChar(spec, &i, end, output);
+    } else if (uch <= ' ' || uch == 0x7f) {
+      // This function is for error handling, so we escape all control
+      // characters and spaces, but not anything else since we lack
+      // context to do something more specific.
+      AppendEscapedChar(static_cast<unsigned char>(uch), output);
+    } else {
+      output->push_back(static_cast<char>(uch));
+    }
+  }
+}
+
+// Overrides one component, see the Replacements structure for
+// what the various combinations of source pointer and component mean.
+void DoOverrideComponent(const char* override_source,
+                         const Component& override_component,
+                         const char** dest,
+                         Component* dest_component) {
+  if (override_source) {
+    *dest = override_source;
+    *dest_component = override_component;
+  }
+}
+
+// Similar to DoOverrideComponent except that it takes a UTF-16 input and does
+// not actually set the output character pointer.
+//
+// The input is converted to UTF-8 at the end of the given buffer as a temporary
+// holding place. The component identifying the portion of the buffer used in
+// the |utf8_buffer| will be specified in |*dest_component|.
+//
+// This will not actually set any |dest| pointer like DoOverrideComponent
+// does because all of the pointers will point into the |utf8_buffer|, which
+// may get resized while we're overriding a subsequent component. Instead, the
+// caller should use the beginning of the |utf8_buffer| as the string pointer
+// for all components once all overrides have been prepared.
+bool PrepareUTF16OverrideComponent(const gurl_base::char16* override_source,
+                                   const Component& override_component,
+                                   CanonOutput* utf8_buffer,
+                                   Component* dest_component) {
+  bool success = true;
+  if (override_source) {
+    if (!override_component.is_valid()) {
+      // Non-"valid" component (means delete), so we need to preserve that.
+      *dest_component = Component();
+    } else {
+      // Convert to UTF-8.
+      dest_component->begin = utf8_buffer->length();
+      success = ConvertUTF16ToUTF8(&override_source[override_component.begin],
+                                   override_component.len, utf8_buffer);
+      dest_component->len = utf8_buffer->length() - dest_component->begin;
+    }
+  }
+  return success;
+}
+
+}  // namespace
+
+// See the header file for this array's declaration.
+const unsigned char kSharedCharTypeTable[0x100] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00 - 0x0f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10 - 0x1f
+    0,  // 0x20 ' ' (escape spaces in queries)
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x21 !
+    0,  // 0x22 "
+    0,  // 0x23 # (invalid in query since it marks the ref)
+    CHAR_QUERY | CHAR_USERINFO,  // 0x24 $
+    CHAR_QUERY | CHAR_USERINFO,  // 0x25 %
+    CHAR_QUERY | CHAR_USERINFO,  // 0x26 &
+    0,  // 0x27 ' (Try to prevent XSS.)
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x28 (
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x29 )
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x2a *
+    CHAR_QUERY | CHAR_USERINFO,  // 0x2b +
+    CHAR_QUERY | CHAR_USERINFO,  // 0x2c ,
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x2d -
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT,  // 0x2e .
+    CHAR_QUERY,  // 0x2f /
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x30 0
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x31 1
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x32 2
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x33 3
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x34 4
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x35 5
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x36 6
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT,  // 0x37 7
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT,  // 0x38 8
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT,  // 0x39 9
+    CHAR_QUERY,  // 0x3a :
+    CHAR_QUERY,  // 0x3b ;
+    0,  // 0x3c < (Try to prevent certain types of XSS.)
+    CHAR_QUERY,  // 0x3d =
+    0,  // 0x3e > (Try to prevent certain types of XSS.)
+    CHAR_QUERY,  // 0x3f ?
+    CHAR_QUERY,  // 0x40 @
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x41 A
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x42 B
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x43 C
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x44 D
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x45 E
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x46 F
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x47 G
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x48 H
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x49 I
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4a J
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4b K
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4c L
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4d M
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4e N
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x4f O
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x50 P
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x51 Q
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x52 R
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x53 S
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x54 T
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x55 U
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x56 V
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x57 W
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT,  // 0x58 X
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x59 Y
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x5a Z
+    CHAR_QUERY,  // 0x5b [
+    CHAR_QUERY,  // 0x5c '\'
+    CHAR_QUERY,  // 0x5d ]
+    CHAR_QUERY,  // 0x5e ^
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x5f _
+    CHAR_QUERY,  // 0x60 `
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x61 a
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x62 b
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x63 c
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x64 d
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x65 e
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT,  // 0x66 f
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x67 g
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x68 h
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x69 i
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6a j
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6b k
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6c l
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6d m
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6e n
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x6f o
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x70 p
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x71 q
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x72 r
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x73 s
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x74 t
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x75 u
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x76 v
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x77 w
+    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT,  // 0x78 x
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x79 y
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x7a z
+    CHAR_QUERY,  // 0x7b {
+    CHAR_QUERY,  // 0x7c |
+    CHAR_QUERY,  // 0x7d }
+    CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT,  // 0x7e ~
+    0,  // 0x7f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9f
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xa0 - 0xaf
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xb0 - 0xbf
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xc0 - 0xcf
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xd0 - 0xdf
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xe0 - 0xef
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xf0 - 0xff
+};
+
+const char kHexCharLookup[0x10] = {
+    '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+};
+
+const char kCharToHexLookup[8] = {
+    0,         // 0x00 - 0x1f
+    '0',       // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39
+    'A' - 10,  // 0x40 - 0x5f: letters A - F are 0x41 - 0x46
+    'a' - 10,  // 0x60 - 0x7f: letters a - f are 0x61 - 0x66
+    0,         // 0x80 - 0x9F
+    0,         // 0xA0 - 0xBF
+    0,         // 0xC0 - 0xDF
+    0,         // 0xE0 - 0xFF
+};
+
+const gurl_base::char16 kUnicodeReplacementCharacter = 0xfffd;
+
+void AppendStringOfType(const char* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output) {
+  DoAppendStringOfType<char, unsigned char>(source, length, type, output);
+}
+
+void AppendStringOfType(const gurl_base::char16* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output) {
+  DoAppendStringOfType<gurl_base::char16, gurl_base::char16>(
+      source, length, type, output);
+}
+
+bool ReadUTFChar(const char* str, int* begin, int length,
+                 unsigned* code_point_out) {
+  // This depends on ints and int32s being the same thing. If they're not, it
+  // will fail to compile.
+  // TODO(mmenke): This should probably be fixed.
+  if (!gurl_base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
+      !gurl_base::IsValidCharacter(*code_point_out)) {
+    *code_point_out = kUnicodeReplacementCharacter;
+    return false;
+  }
+  return true;
+}
+
+bool ReadUTFChar(const gurl_base::char16* str, int* begin, int length,
+                 unsigned* code_point_out) {
+  // This depends on ints and int32s being the same thing. If they're not, it
+  // will fail to compile.
+  // TODO(mmenke): This should probably be fixed.
+  if (!gurl_base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
+      !gurl_base::IsValidCharacter(*code_point_out)) {
+    *code_point_out = kUnicodeReplacementCharacter;
+    return false;
+  }
+  return true;
+}
+
+void AppendInvalidNarrowString(const char* spec, int begin, int end,
+                               CanonOutput* output) {
+  DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output);
+}
+
+void AppendInvalidNarrowString(const gurl_base::char16* spec, int begin, int end,
+                               CanonOutput* output) {
+  DoAppendInvalidNarrowString<gurl_base::char16, gurl_base::char16>(
+      spec, begin, end, output);
+}
+
+bool ConvertUTF16ToUTF8(const gurl_base::char16* input, int input_len,
+                        CanonOutput* output) {
+  bool success = true;
+  for (int i = 0; i < input_len; i++) {
+    unsigned code_point;
+    success &= ReadUTFChar(input, &i, input_len, &code_point);
+    AppendUTF8Value(code_point, output);
+  }
+  return success;
+}
+
+bool ConvertUTF8ToUTF16(const char* input, int input_len,
+                        CanonOutputT<gurl_base::char16>* output) {
+  bool success = true;
+  for (int i = 0; i < input_len; i++) {
+    unsigned code_point;
+    success &= ReadUTFChar(input, &i, input_len, &code_point);
+    AppendUTF16Value(code_point, output);
+  }
+  return success;
+}
+
+void SetupOverrideComponents(const char* base,
+                             const Replacements<char>& repl,
+                             URLComponentSource<char>* source,
+                             Parsed* parsed) {
+  // Get the source and parsed structures of the things we are replacing.
+  const URLComponentSource<char>& repl_source = repl.sources();
+  const Parsed& repl_parsed = repl.components();
+
+  DoOverrideComponent(repl_source.scheme, repl_parsed.scheme,
+                      &source->scheme, &parsed->scheme);
+  DoOverrideComponent(repl_source.username, repl_parsed.username,
+                      &source->username, &parsed->username);
+  DoOverrideComponent(repl_source.password, repl_parsed.password,
+                      &source->password, &parsed->password);
+
+  // Our host should be empty if not present, so override the default setup.
+  DoOverrideComponent(repl_source.host, repl_parsed.host,
+                      &source->host, &parsed->host);
+  if (parsed->host.len == -1)
+    parsed->host.len = 0;
+
+  DoOverrideComponent(repl_source.port, repl_parsed.port,
+                      &source->port, &parsed->port);
+  DoOverrideComponent(repl_source.path, repl_parsed.path,
+                      &source->path, &parsed->path);
+  DoOverrideComponent(repl_source.query, repl_parsed.query,
+                      &source->query, &parsed->query);
+  DoOverrideComponent(repl_source.ref, repl_parsed.ref,
+                      &source->ref, &parsed->ref);
+}
+
+bool SetupUTF16OverrideComponents(const char* base,
+                                  const Replacements<gurl_base::char16>& repl,
+                                  CanonOutput* utf8_buffer,
+                                  URLComponentSource<char>* source,
+                                  Parsed* parsed) {
+  bool success = true;
+
+  // Get the source and parsed structures of the things we are replacing.
+  const URLComponentSource<gurl_base::char16>& repl_source = repl.sources();
+  const Parsed& repl_parsed = repl.components();
+
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.scheme, repl_parsed.scheme,
+      utf8_buffer, &parsed->scheme);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.username, repl_parsed.username,
+      utf8_buffer, &parsed->username);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.password, repl_parsed.password,
+      utf8_buffer, &parsed->password);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.host, repl_parsed.host,
+      utf8_buffer, &parsed->host);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.port, repl_parsed.port,
+      utf8_buffer, &parsed->port);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.path, repl_parsed.path,
+      utf8_buffer, &parsed->path);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.query, repl_parsed.query,
+      utf8_buffer, &parsed->query);
+  success &= PrepareUTF16OverrideComponent(
+      repl_source.ref, repl_parsed.ref,
+      utf8_buffer, &parsed->ref);
+
+  // PrepareUTF16OverrideComponent will not have set the data pointer since the
+  // buffer could be resized, invalidating the pointers. We set the data
+  // pointers for affected components now that the buffer is finalized.
+  if (repl_source.scheme) source->scheme = utf8_buffer->data();
+  if (repl_source.username) source->username = utf8_buffer->data();
+  if (repl_source.password) source->password = utf8_buffer->data();
+  if (repl_source.host) source->host = utf8_buffer->data();
+  if (repl_source.port) source->port = utf8_buffer->data();
+  if (repl_source.path) source->path = utf8_buffer->data();
+  if (repl_source.query) source->query = utf8_buffer->data();
+  if (repl_source.ref) source->ref = utf8_buffer->data();
+
+  return success;
+}
+
+#ifndef WIN32
+
+int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) {
+  const char* format_str;
+  if (radix == 10)
+    format_str = "%d";
+  else if (radix == 16)
+    format_str = "%x";
+  else
+    return EINVAL;  // Only radix 10 and 16 are supported.
+
+  int written = snprintf(buffer, size_in_chars, format_str, value);
+  if (static_cast<size_t>(written) >= size_in_chars) {
+    // Output was truncated, or written was negative.
+    return EINVAL;
+  }
+  return 0;
+}
+
+int _itow_s(int value, gurl_base::char16* buffer, size_t size_in_chars, int radix) {
+  if (radix != 10)
+    return EINVAL;  // Only radix 10 is supported.
+
+  // No more than 12 characters will be required for a 32-bit integer.
+  // Add an extra byte for the terminating null.
+  char temp[13];
+  int written = snprintf(temp, sizeof(temp), "%d", value);
+  if (static_cast<size_t>(written) >= size_in_chars) {
+    // Output was truncated, or written was negative.
+    return EINVAL;
+  }
+
+  for (int i = 0; i < written; ++i) {
+    buffer[i] = static_cast<gurl_base::char16>(temp[i]);
+  }
+  buffer[written] = '\0';
+  return 0;
+}
+
+#endif  // !WIN32
+
+}  // namespace url
diff --git a/url/url_canon_internal.h b/url/url_canon_internal.h new file mode 100644 index 0000000..e0c7567 --- /dev/null +++ b/url/url_canon_internal.h
@@ -0,0 +1,445 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_CANON_INTERNAL_H_ +#define URL_URL_CANON_INTERNAL_H_ + +// This file is intended to be included in another C++ file where the character +// types are defined. This allows us to write mostly generic code, but not have +// template bloat because everything is inlined when anybody calls any of our +// functions. + +#include <stddef.h> +#include <stdlib.h> + +#include "polyfills/base/component_export.h" +#include "polyfills/base/logging.h" +#include "url/url_canon.h" + +namespace url { + +// Character type handling ----------------------------------------------------- + +// Bits that identify different character types. These types identify different +// bits that are set for each 8-bit character in the kSharedCharTypeTable. +enum SharedCharTypes { + // Characters that do not require escaping in queries. Characters that do + // not have this flag will be escaped; see url_canon_query.cc + CHAR_QUERY = 1, + + // Valid in the username/password field. + CHAR_USERINFO = 2, + + // Valid in a IPv4 address (digits plus dot and 'x' for hex). + CHAR_IPV4 = 4, + + // Valid in an ASCII-representation of a hex digit (as in %-escaped). + CHAR_HEX = 8, + + // Valid in an ASCII-representation of a decimal digit. + CHAR_DEC = 16, + + // Valid in an ASCII-representation of an octal digit. + CHAR_OCT = 32, + + // Characters that do not require escaping in encodeURIComponent. Characters + // that do not have this flag will be escaped; see url_util.cc. + CHAR_COMPONENT = 64, +}; + +// This table contains the flags in SharedCharTypes for each 8-bit character. +// Some canonicalization functions have their own specialized lookup table. +// For those with simple requirements, we have collected the flags in one +// place so there are fewer lookup tables to load into the CPU cache. 
+// +// Using an unsigned char type has a small but measurable performance benefit +// over using a 32-bit number. +extern const unsigned char kSharedCharTypeTable[0x100]; + +// More readable wrappers around the character type lookup table. +inline bool IsCharOfType(unsigned char c, SharedCharTypes type) { + return !!(kSharedCharTypeTable[c] & type); +} +inline bool IsQueryChar(unsigned char c) { + return IsCharOfType(c, CHAR_QUERY); +} +inline bool IsIPv4Char(unsigned char c) { + return IsCharOfType(c, CHAR_IPV4); +} +inline bool IsHexChar(unsigned char c) { + return IsCharOfType(c, CHAR_HEX); +} +inline bool IsComponentChar(unsigned char c) { + return IsCharOfType(c, CHAR_COMPONENT); +} + +// Appends the given string to the output, escaping characters that do not +// match the given |type| in SharedCharTypes. +void AppendStringOfType(const char* source, int length, + SharedCharTypes type, + CanonOutput* output); +void AppendStringOfType(const gurl_base::char16* source, int length, + SharedCharTypes type, + CanonOutput* output); + +// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit +// that will be used to represent it. +COMPONENT_EXPORT(URL) extern const char kHexCharLookup[0x10]; + +// This lookup table allows fast conversion between ASCII hex letters and their +// corresponding numerical value. The 8-bit range is divided up into 8 +// regions of 0x20 characters each. Each of the three character types (numbers, +// uppercase, lowercase) falls into different regions of this range. The table +// contains the amount to subtract from characters in that range to get at +// the corresponding numerical value. +// +// See HexDigitToValue for the lookup. +extern const char kCharToHexLookup[8]; + +// Assumes the input is a valid hex digit! Call IsHexChar before using this. 
+inline unsigned char HexCharToValue(unsigned char c) { + return c - kCharToHexLookup[c / 0x20]; +} + +// Indicates if the given character is a dot or dot equivalent, returning the +// number of characters taken by it. This will be one for a literal dot, 3 for +// an escaped dot. If the character is not a dot, this will return 0. +template<typename CHAR> +inline int IsDot(const CHAR* spec, int offset, int end) { + if (spec[offset] == '.') { + return 1; + } else if (spec[offset] == '%' && offset + 3 <= end && + spec[offset + 1] == '2' && + (spec[offset + 2] == 'e' || spec[offset + 2] == 'E')) { + // Found "%2e" + return 3; + } + return 0; +} + +// Returns the canonicalized version of the input character according to scheme +// rules. This is implemented alongside the scheme canonicalizer, and is +// required for relative URL resolving to test for scheme equality. +// +// Returns 0 if the input character is not a valid scheme character. +char CanonicalSchemeChar(gurl_base::char16 ch); + +// Write a single character, escaped, to the output. This always escapes: it +// does no checking that thee character requires escaping. +// Escaping makes sense only 8 bit chars, so code works in all cases of +// input parameters (8/16bit). +template<typename UINCHAR, typename OUTCHAR> +inline void AppendEscapedChar(UINCHAR ch, + CanonOutputT<OUTCHAR>* output) { + output->push_back('%'); + output->push_back(kHexCharLookup[(ch >> 4) & 0xf]); + output->push_back(kHexCharLookup[ch & 0xf]); +} + +// The character we'll substitute for undecodable or invalid characters. +extern const gurl_base::char16 kUnicodeReplacementCharacter; + +// UTF-8 functions ------------------------------------------------------------ + +// Reads one character in UTF-8 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. 
+// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-byte ASCII character, it will not be changed). +COMPONENT_EXPORT(URL) +bool ReadUTFChar(const char* str, + int* begin, + int length, + unsigned* code_point_out); + +// Generic To-UTF-8 converter. This will call the given append method for each +// character that should be appended, with the given output method. Wrappers +// are provided below for escaped and non-escaped versions of this. +// +// The char_value must have already been checked that it's a valid Unicode +// character. +template<class Output, void Appender(unsigned char, Output*)> +inline void DoAppendUTF8(unsigned char_value, Output* output) { + if (char_value <= 0x7f) { + Appender(static_cast<unsigned char>(char_value), output); + } else if (char_value <= 0x7ff) { + // 110xxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xC0 | (char_value >> 6)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0xffff) { + // 1110xxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0x10FFFF) { // Max Unicode code point. + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else { + // Invalid UTF-8 character (>20 bits). + GURL_NOTREACHED(); + } +} + +// Helper used by AppendUTF8Value below. 
We use an unsigned parameter so there +// are no funny sign problems with the input, but then have to convert it to +// a regular char for appending. +inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { + output->push_back(static_cast<char>(ch)); +} + +// Writes the given character to the output as UTF-8. This does NO checking +// of the validity of the Unicode characters; the caller should ensure that +// the value it is appending is valid to append. +inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { + DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output); +} + +// Writes the given character to the output as UTF-8, escaping ALL +// characters (even when they are ASCII). This does NO checking of the +// validity of the Unicode characters; the caller should ensure that the value +// it is appending is valid to append. +inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { + DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output); +} + +// UTF-16 functions ----------------------------------------------------------- + +// Reads one character in UTF-16 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. +// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-16-bit-word character, it will not be changed). +COMPONENT_EXPORT(URL) +bool ReadUTFChar(const gurl_base::char16* str, + int* begin, + int length, + unsigned* code_point_out); + +// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. 
+inline void AppendUTF16Value(unsigned code_point, + CanonOutputT<gurl_base::char16>* output) { + if (code_point > 0xffff) { + output->push_back(static_cast<gurl_base::char16>((code_point >> 10) + 0xd7c0)); + output->push_back(static_cast<gurl_base::char16>((code_point & 0x3ff) | 0xdc00)); + } else { + output->push_back(static_cast<gurl_base::char16>(code_point)); + } +} + +// Escaping functions --------------------------------------------------------- + +// Writes the given character to the output as UTF-8, escaped. Call this +// function only when the input is wide. Returns true on success. Failure +// means there was some problem with the encoding, we'll still try to +// update the |*begin| pointer and add a placeholder character to the +// output so processing can continue. +// +// We will append the character starting at ch[begin] with the buffer ch +// being |length|. |*begin| will be updated to point to the last character +// consumed (we may consume more than one for UTF-16) so that if called in +// a loop, incrementing the pointer will move to the next character. +// +// Every single output character will be escaped. This means that if you +// give it an ASCII character as input, it will be escaped. Some code uses +// this when it knows that a character is invalid according to its rules +// for validity. If you don't want escaping for ASCII characters, you will +// have to filter them out prior to calling this function. +// +// Assumes that ch[begin] is within range in the array, but does not assume +// that any following characters are. +inline bool AppendUTF8EscapedChar(const gurl_base::char16* str, int* begin, + int length, CanonOutput* output) { + // UTF-16 input. ReadUTFChar will handle invalid characters for us and give + // us the kUnicodeReplacementCharacter, so we don't have to do special + // checking after failure, just pass through the failure to the caller. 
+ unsigned char_value; + bool success = ReadUTFChar(str, begin, length, &char_value); + AppendUTF8EscapedValue(char_value, output); + return success; +} + +// Handles UTF-8 input. See the wide version above for usage. +inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length, + CanonOutput* output) { + // ReadUTF8Char will handle invalid characters for us and give us the + // kUnicodeReplacementCharacter, so we don't have to do special checking + // after failure, just pass through the failure to the caller. + unsigned ch; + bool success = ReadUTFChar(str, begin, length, &ch); + AppendUTF8EscapedValue(ch, output); + return success; +} + +// Given a '%' character at |*begin| in the string |spec|, this will decode +// the escaped value and put it into |*unescaped_value| on success (returns +// true). On failure, this will return false, and will not write into +// |*unescaped_value|. +// +// |*begin| will be updated to point to the last character of the escape +// sequence so that when called with the index of a for loop, the next time +// through it will point to the next character to be considered. On failure, +// |*begin| will be unchanged. +inline bool Is8BitChar(char c) { + return true; // this case is specialized to avoid a warning +} +inline bool Is8BitChar(gurl_base::char16 c) { + return c <= 255; +} + +template<typename CHAR> +inline bool DecodeEscaped(const CHAR* spec, int* begin, int end, + unsigned char* unescaped_value) { + if (*begin + 3 > end || + !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) { + // Invalid escape sequence because there's not enough room, or the + // digits are not ASCII. + return false; + } + + unsigned char first = static_cast<unsigned char>(spec[*begin + 1]); + unsigned char second = static_cast<unsigned char>(spec[*begin + 2]); + if (!IsHexChar(first) || !IsHexChar(second)) { + // Invalid hex digits, fail. + return false; + } + + // Valid escape sequence. 
+ *unescaped_value = (HexCharToValue(first) << 4) + HexCharToValue(second); + *begin += 2; + return true; +} + +// Appends the given substring to the output, escaping "some" characters that +// it feels may not be safe. It assumes the input values are all contained in +// 8-bit although it allows any type. +// +// This is used in error cases to append invalid output so that it looks +// approximately correct. Non-error cases should not call this function since +// the escaping rules are not guaranteed! +void AppendInvalidNarrowString(const char* spec, int begin, int end, + CanonOutput* output); +void AppendInvalidNarrowString(const gurl_base::char16* spec, int begin, int end, + CanonOutput* output); + +// Misc canonicalization helpers ---------------------------------------------- + +// Converts between UTF-8 and UTF-16, returning true on successful conversion. +// The output will be appended to the given canonicalizer output (so make sure +// it's empty if you want to replace). +// +// On invalid input, this will still write as much output as possible, +// replacing the invalid characters with the "invalid character". It will +// return false in the failure case, and the caller should not continue as +// normal. +COMPONENT_EXPORT(URL) +bool ConvertUTF16ToUTF8(const gurl_base::char16* input, + int input_len, + CanonOutput* output); +COMPONENT_EXPORT(URL) +bool ConvertUTF8ToUTF16(const char* input, + int input_len, + CanonOutputT<gurl_base::char16>* output); + +// Converts from UTF-16 to 8-bit using the character set converter. If the +// converter is NULL, this will use UTF-8. +void ConvertUTF16ToQueryEncoding(const gurl_base::char16* input, + const Component& query, + CharsetConverter* converter, + CanonOutput* output); + +// Applies the replacements to the given component source. The component source +// should be pre-initialized to the "old" base. 
That is, all pointers will +// point to the spec of the old URL, and all of the Parsed components will +// be indices into that string. +// +// The pointers and components in the |source| for all non-NULL strings in the +// |repl| (replacements) will be updated to reference those strings. +// Canonicalizing with the new |source| and |parsed| can then combine URL +// components from many different strings. +void SetupOverrideComponents(const char* base, + const Replacements<char>& repl, + URLComponentSource<char>* source, + Parsed* parsed); + +// Like the above 8-bit version, except that it additionally converts the +// UTF-16 input to UTF-8 before doing the overrides. +// +// The given utf8_buffer is used to store the converted components. They will +// be appended one after another, with the parsed structure identifying the +// appropriate substrings. This buffer is a parameter because the source has +// no storage, so the buffer must have the same lifetime as the source +// parameter owned by the caller. +// +// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of +// |source| will point into this buffer, which could be invalidated if +// additional data is added and the CanonOutput resizes its buffer. +// +// Returns true on success. False means that the input was not valid UTF-16, +// although we will have still done the override with "invalid characters" in +// place of errors. +bool SetupUTF16OverrideComponents(const char* base, + const Replacements<gurl_base::char16>& repl, + CanonOutput* utf8_buffer, + URLComponentSource<char>* source, + Parsed* parsed); + +// Implemented in url_canon_path.cc, these are required by the relative URL +// resolver as well, so we declare them here. 
bool CanonicalizePartialPath(const char* spec,
                             const Component& path,
                             int path_begin_in_output,
                             CanonOutput* output);
bool CanonicalizePartialPath(const gurl_base::char16* spec,
                             const Component& path,
                             int path_begin_in_output,
                             CanonOutput* output);

#ifndef WIN32

// Implementations of Windows' int-to-string conversions, declared here for
// builds where WIN32 is not defined. |size_in_chars| is the capacity of
// |buffer| and |radix| the numeric base.
// NOTE(review): the meaning of the int return value is not visible in this
// header -- presumably 0 on success as with the Windows CRT; confirm against
// the implementation.
COMPONENT_EXPORT(URL)
int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix);
COMPONENT_EXPORT(URL)
int _itow_s(int value, gurl_base::char16* buffer, size_t size_in_chars, int radix);

// Secure template overloads for these functions: when the destination is a
// real array (not a pointer), the element count N is deduced from the
// argument and forwarded as |size_in_chars|, so callers cannot pass a
// mismatched size.
template<size_t N>
inline int _itoa_s(int value, char (&buffer)[N], int radix) {
  return _itoa_s(value, buffer, N, radix);
}

// UTF-16 counterpart of the array overload above.
template<size_t N>
inline int _itow_s(int value, gurl_base::char16 (&buffer)[N], int radix) {
  return _itow_s(value, buffer, N, radix);
}

// _strtoui64 and strtoull behave the same, so the Windows name simply
// forwards all three arguments to strtoull.
inline unsigned long long _strtoui64(const char* nptr,
                                     char** endptr, int base) {
  return strtoull(nptr, endptr, base);
}

#endif  // WIN32

}  // namespace url

#endif  // URL_URL_CANON_INTERNAL_H_
diff --git a/url/url_canon_internal_file.h b/url/url_canon_internal_file.h new file mode 100644 index 0000000..3b0a81e --- /dev/null +++ b/url/url_canon_internal_file.h
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef URL_URL_CANON_INTERNAL_FILE_H_
#define URL_URL_CANON_INTERNAL_FILE_H_

// As with url_canon_internal.h, this file is intended to be included in
// another C++ file where the template types are defined. This allows the
// programmer to use these functions for their own string types, without
// bloating the code by having inline templates used in every call site.
//
// *** This file must be included after url_canon_internal as we depend on some
// functions in it. ***

#include "base/strings/string_util.h"
#include "url/url_file.h"
#include "url/url_parse_internal.h"

namespace url {

// Given a pointer into the spec, this copies and canonicalizes the drive
// letter and colon to the output, if one is found. If there is not a drive
// spec, it won't do anything. The index of the next character in the input
// spec is returned (after the colon when a drive spec is found, the begin
// offset if one is not).
//
// When a drive spec is found, the output receives the drive letter (ASCII
// lowercase letters are uppercased), a ':' and a '/'.
template<typename CHAR>
static int FileDoDriveSpec(const CHAR* spec, int begin, int end,
                           CanonOutput* output) {
  // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
  // (with backslashes instead of slashes as well).
  int num_slashes = CountConsecutiveSlashes(spec, begin, end);
  int after_slashes = begin + num_slashes;

  if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end))
    return begin;  // Haven't consumed any characters

  // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
  // and that it is followed by a colon/pipe.

  // Normalize Windows drive letters to uppercase
  if (gurl_base::IsAsciiLower(spec[after_slashes]))
    output->push_back(spec[after_slashes] - 'a' + 'A');
  else
    output->push_back(static_cast<char>(spec[after_slashes]));

  // Normalize the character following it to a colon rather than pipe.
  output->push_back(':');
  output->push_back('/');
  // Skip the drive letter and the colon/pipe in the input.
  return after_slashes + 2;
}

// Canonicalizes the part of the path that follows the drive spec. A leading
// '/' has already been written to the output (by DoCanonicalizeFileURL, and
// again by FileDoDriveSpec after a drive spec), so we need to write everything
// following the slashes using the path canonicalizer.
template<typename CHAR, typename UCHAR>
static void FileDoPath(const CHAR* spec, int begin, int end,
                       CanonOutput* output) {
  // Normalize the number of slashes after the drive letter. The path
  // canonicalizer expects the input to begin in a slash already so
  // doesn't check. Skipping every leading slash here also covers input that
  // has no slashes at all (num_slashes == 0).
  int num_slashes = CountConsecutiveSlashes(spec, begin, end);
  int after_slashes = begin + num_slashes;

  // Now use the regular path canonicalizer to canonicalize the rest of the
  // path. We supply it with the path following the slashes. It won't prepend
  // a slash because it assumes any nonempty path already starts with one.
  // We explicitly filter out calls with no path here to prevent that case.
  ParsedComponent sub_path(after_slashes, end - after_slashes);
  if (sub_path.len > 0) {
    // Give it a fake output component to write into. DoCanonicalizeFileURL
    // will compute the full path component.
    ParsedComponent fake_output_path;
    URLCanonInternal<CHAR, UCHAR>::DoPath(
        spec, sub_path, output, &fake_output_path);
  }
}

// Canonicalizes a file: URL described by |source| and |parsed|, appending the
// result to |output| and recording the component ranges in |new_parsed|.
// Username, password and port are always left unset. Returns false if the
// host, query or ref canonicalizer reported a failure; output is still
// written in that case.
template<typename CHAR, typename UCHAR>
static bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
                                  const ParsedURL& parsed,
                                  CanonOutput* output,
                                  ParsedURL* new_parsed) {
  // Things we don't set in file: URLs.
  new_parsed->username = ParsedComponent(0, -1);
  new_parsed->password = ParsedComponent(0, -1);
  new_parsed->port = ParsedComponent(0, -1);

  // Scheme (known, so we don't bother running it through the more
  // complicated scheme canonicalizer).
  new_parsed->scheme.begin = output->length();
  output->push_back('f');
  output->push_back('i');
  output->push_back('l');
  output->push_back('e');
  // The scheme component covers "file" only; the ':' is written afterwards.
  new_parsed->scheme.len = output->length() - new_parsed->scheme.begin;
  output->push_back(':');

  // Write the separator for the host.
  output->push_back('/');
  output->push_back('/');

  // Append the host. For many file URLs, this will be empty. For UNC, this
  // will be present.
  // TODO(brettw) This doesn't do any checking for host name validity. We
  // should probably handle validity checking of UNC hosts differently than
  // for regular IP hosts.
  bool success = URLCanonInternal<CHAR, UCHAR>::DoHost(
      source.host, parsed.host, output, &new_parsed->host);

  // Write a separator for the start of the path. We'll ignore any slashes
  // already at the beginning of the path.
  new_parsed->path.begin = output->length();
  output->push_back('/');

  // Copy and normalize the "c:" at the beginning, if present.
  int after_drive = FileDoDriveSpec(source.path, parsed.path.begin,
                                    parsed.path.end(), output);

  // Copy the rest of the path.
  FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output);
  // The path component spans everything written since the leading '/'.
  new_parsed->path.len = output->length() - new_parsed->path.begin;

  // For things following the path, we can use the standard canonicalizers.
  success &= URLCanonInternal<CHAR, UCHAR>::DoQuery(
      source.query, parsed.query, output, &new_parsed->query);
  success &= URLCanonInternal<CHAR, UCHAR>::DoRef(
      source.ref, parsed.ref, output, &new_parsed->ref);

  return success;
}

}  // namespace url

#endif  // URL_URL_CANON_INTERNAL_FILE_H_
diff --git a/url/url_canon_ip.cc b/url/url_canon_ip.cc new file mode 100644 index 0000000..f7c5700 --- /dev/null +++ b/url/url_canon_ip.cc
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "url/url_canon_ip.h"

#include <stdint.h>
#include <stdlib.h>
#include <limits>

#include "polyfills/base/logging.h"
#include "url/url_canon_internal.h"

namespace url {

namespace {

// Converts one of the character types that represent a numerical base to the
// corresponding base. Any other type yields 0.
int BaseForType(SharedCharTypes type) {
  switch (type) {
    case CHAR_HEX:
      return 16;
    case CHAR_DEC:
      return 10;
    case CHAR_OCT:
      return 8;
    default:
      return 0;
  }
}

// Splits |host| (a range within |spec|) at '.' characters and records up to
// four pieces in |components|. Returns false when the host is empty, contains
// a character that is non-ASCII or rejected by IsIPv4Char, has an empty
// component other than a single trailing one, or has more than four
// components. On success, unused trailing entries of |components| are set to
// a default-constructed Component().
template<typename CHAR, typename UCHAR>
bool DoFindIPv4Components(const CHAR* spec,
                          const Component& host,
                          Component components[4]) {
  if (!host.is_nonempty())
    return false;

  int cur_component = 0;  // Index of the component we're working on.
  int cur_component_begin = host.begin;  // Start of the current component.
  int end = host.end();
  // The loop deliberately runs one step past the last character (i == end) so
  // that the final component is closed by the same code path as a dot.
  for (int i = host.begin; /* nothing */; i++) {
    if (i >= end || spec[i] == '.') {
      // Found the end of the current component.
      int component_len = i - cur_component_begin;
      components[cur_component] = Component(cur_component_begin, component_len);

      // The next component starts after the dot.
      cur_component_begin = i + 1;
      cur_component++;

      // Don't allow empty components (two dots in a row), except we may
      // allow an empty component at the end (this would indicate that the
      // input ends in a dot). We also want to error if the component is
      // empty and it's the only component (cur_component == 1).
      if (component_len == 0 && (i < end || cur_component == 1))
        return false;

      if (i >= end)
        break;  // End of the input.

      if (cur_component == 4) {
        // Anything else after the 4th component is an error unless it is a
        // dot that would otherwise be treated as the end of input.
        if (spec[i] == '.' && i + 1 == end)
          break;
        return false;
      }
    } else if (static_cast<UCHAR>(spec[i]) >= 0x80 ||
               !IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
      // Invalid character for an IPv4 address.
      return false;
    }
  }

  // Fill in any unused components.
  while (cur_component < 4)
    components[cur_component++] = Component();
  return true;
}

// Converts an IPv4 component to a 32-bit number, while checking for overflow.
//
// Possible return values:
// - IPV4    - The number was valid, and did not overflow.
// - BROKEN  - The input was numeric, but too large for a 32-bit field.
// - NEUTRAL - Input was not numeric.
//
// |*number| is written only when IPV4 is returned.
//
// The input is assumed to be ASCII. FindIPv4Components should have stripped
// out any input that is greater than 7 bits. The components are assumed
// to be non-empty.
template<typename CHAR>
CanonHostInfo::Family IPv4ComponentToNumber(const CHAR* spec,
                                            const Component& component,
                                            uint32_t* number) {
  // Figure out the base
  SharedCharTypes base;
  int base_prefix_len = 0;  // Size of the prefix for this base.
  if (spec[component.begin] == '0') {
    // Either hex or octal, or a standalone zero (treated as decimal).
    if (component.len == 1) {
      base = CHAR_DEC;
    } else if (spec[component.begin + 1] == 'X' ||
               spec[component.begin + 1] == 'x') {
      base = CHAR_HEX;
      base_prefix_len = 2;
    } else {
      base = CHAR_OCT;
      base_prefix_len = 1;
    }
  } else {
    base = CHAR_DEC;
  }

  // Extend the prefix to consume all leading zeros.
  while (base_prefix_len < component.len &&
         spec[component.begin + base_prefix_len] == '0')
    base_prefix_len++;

  // Put the component, minus any base prefix, into a NULL-terminated buffer so
  // we can call the standard library. Because leading zeros have already been
  // discarded, filling the entire buffer is guaranteed to trigger the 32-bit
  // overflow check.
  const int kMaxComponentLen = 16;
  char buf[kMaxComponentLen + 1];  // digits + '\0'
  int dest_i = 0;
  for (int i = component.begin + base_prefix_len; i < component.end(); i++) {
    // We know the input is 7-bit, so convert to narrow (if this is the wide
    // version of the template) by casting.
    char input = static_cast<char>(spec[i]);

    // Validate that this character is OK for the given base.
    if (!IsCharOfType(input, base))
      return CanonHostInfo::NEUTRAL;

    // Fill the buffer, if there's space remaining. This check allows us to
    // verify that all characters are numeric, even those that don't fit.
    if (dest_i < kMaxComponentLen)
      buf[dest_i++] = input;
  }

  buf[dest_i] = '\0';

  // Use the 64-bit string-to-integer conversion so we get a big number (no
  // hex, decimal, or octal number can overflow a 64-bit number in <= 16
  // characters).
  uint64_t num = _strtoui64(buf, NULL, BaseForType(base));

  // Check for 32-bit overflow.
  if (num > std::numeric_limits<uint32_t>::max())
    return CanonHostInfo::BROKEN;

  // No overflow. Success!
  *number = static_cast<uint32_t>(num);
  return CanonHostInfo::IPV4;
}

// See declaration of IPv4AddressToNumber for documentation.
//
// Summary of the logic below: the host is split into at most four numeric
// components. Every component but the last must fit in 8 bits and fills one
// byte of |address|; the last component fills all remaining bytes, so it may
// be wider when fewer than four components are present.
template<typename CHAR>
CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec,
                                            const Component& host,
                                            unsigned char address[4],
                                            int* num_ipv4_components) {
  // The identified components. Not all may exist.
  Component components[4];
  if (!FindIPv4Components(spec, host, components))
    return CanonHostInfo::NEUTRAL;

  // Convert existing components to digits. Values up to
  // |existing_components| will be valid.
  uint32_t component_values[4];
  int existing_components = 0;

  // Set to true if one or more components are BROKEN. BROKEN is only
  // returned if all components are IPV4 or BROKEN, so, for example,
  // 12345678912345.de returns NEUTRAL rather than broken.
  bool broken = false;
  for (int i = 0; i < 4; i++) {
    // Skip unused slots and the empty component left by a trailing dot.
    if (components[i].len <= 0)
      continue;
    CanonHostInfo::Family family = IPv4ComponentToNumber(
        spec, components[i], &component_values[existing_components]);

    if (family == CanonHostInfo::BROKEN) {
      broken = true;
    } else if (family != CanonHostInfo::IPV4) {
      // Stop if we hit a non-BROKEN invalid non-empty component.
      return family;
    }

    existing_components++;
  }

  if (broken)
    return CanonHostInfo::BROKEN;

  // Use that sequence of numbers to fill out the 4-component IP address.

  // First, process all components but the last, while making sure each fits
  // within an 8-bit field.
  for (int i = 0; i < existing_components - 1; i++) {
    if (component_values[i] > std::numeric_limits<uint8_t>::max())
      return CanonHostInfo::BROKEN;
    address[i] = static_cast<unsigned char>(component_values[i]);
  }

  // Next, consume the last component to fill in the remaining bytes.
  // Work around a gcc 4.9 bug. crbug.com/392872
#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
  uint32_t last_value = component_values[existing_components - 1];
#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4)
#pragma GCC diagnostic pop
#endif
  // Write the low-order byte into address[3] and work backwards, so the last
  // component ends up in network byte order.
  for (int i = 3; i >= existing_components - 1; i--) {
    address[i] = static_cast<unsigned char>(last_value);
    last_value >>= 8;
  }

  // If the last component has residual bits, report overflow.
  if (last_value != 0)
    return CanonHostInfo::BROKEN;

  // Tell the caller how many components we saw.
  *num_ipv4_components = existing_components;

  // Success!
  return CanonHostInfo::IPV4;
}

// Return true if we've made a final IPV4/BROKEN decision, false if the result
// is NEUTRAL, and we could use a second opinion.
template<typename CHAR, typename UCHAR>
bool DoCanonicalizeIPv4Address(const CHAR* spec,
                               const Component& host,
                               CanonOutput* output,
                               CanonHostInfo* host_info) {
  host_info->family = IPv4AddressToNumber(
      spec, host, host_info->address, &host_info->num_ipv4_components);

  switch (host_info->family) {
    case CanonHostInfo::IPV4:
      // Definitely an IPv4 address.
      host_info->out_host.begin = output->length();
      AppendIPv4Address(host_info->address, output);
      host_info->out_host.len = output->length() - host_info->out_host.begin;
      return true;
    case CanonHostInfo::BROKEN:
      // Definitely broken. Nothing is appended to |output|.
      return true;
    default:
      // Could be IPv6 or a hostname.
      return false;
  }
}

// Helper class that describes the main components of an IPv6 input string.
// See the following examples to understand how it breaks up an input string:
//
// [Example 1]: input = "[::aa:bb]"
//  ==> num_hex_components = 2
//  ==> hex_components[0] = Component(3,2) "aa"
//  ==> hex_components[1] = Component(6,2) "bb"
//  ==> index_of_contraction = 0
//  ==> ipv4_component = Component(0, -1)
//
// [Example 2]: input = "[1:2::3:4:5]"
//  ==> num_hex_components = 5
//  ==> hex_components[0] = Component(1,1) "1"
//  ==> hex_components[1] = Component(3,1) "2"
//  ==> hex_components[2] = Component(6,1) "3"
//  ==> hex_components[3] = Component(8,1) "4"
//  ==> hex_components[4] = Component(10,1) "5"
//  ==> index_of_contraction = 2
//  ==> ipv4_component = Component(0, -1)
//
// [Example 3]: input = "[::ffff:192.168.0.1]"
//  ==> num_hex_components = 1
//  ==> hex_components[0] = Component(3,4) "ffff"
//  ==> index_of_contraction = 0
//  ==> ipv4_component = Component(8, 11) "192.168.0.1"
//
// [Example 4]: input = "[1::]"
//  ==> num_hex_components = 1
//  ==> hex_components[0] = Component(1,1) "1"
//  ==> index_of_contraction = 1
//  ==> ipv4_component = Component(0, -1)
//
// [Example 5]: input = "[::192.168.0.1]"
//  ==> num_hex_components = 0
//  ==> index_of_contraction = 0
//  ==> ipv4_component = Component(3, 11) "192.168.0.1"
//
struct IPv6Parsed {
  // Zero-out the parse information. |hex_components| is not cleared; only the
  // first |num_hex_components| entries are meaningful.
  void reset() {
    num_hex_components = 0;
    index_of_contraction = -1;
    ipv4_component.reset();
  }

  // There can be up to 8 hex components (colon separated) in the literal.
  Component hex_components[8];

  // The count of hex components present. Ranges from [0,8].
  int num_hex_components;

  // The index of the hex component that the "::" contraction precedes, or
  // -1 if there is no contraction.
  int index_of_contraction;

  // The range of characters which are an IPv4 literal.
  Component ipv4_component;
};

// Parse the IPv6 input string. If parsing succeeded returns true and fills
// |parsed| with the information. If parsing failed (because the input is
// invalid) returns false.
//
// |host| is the address range without the surrounding brackets. NOTE: the
// scan reads spec[host.end()], so that position must be readable. The caller
// in this file passes the bracket-stripped range, where it is the closing ']'.
template<typename CHAR, typename UCHAR>
bool DoParseIPv6(const CHAR* spec, const Component& host, IPv6Parsed* parsed) {
  // Zero-out the info.
  parsed->reset();

  if (!host.is_nonempty())
    return false;

  // The index for start and end of address range (no brackets).
  int begin = host.begin;
  int end = host.end();

  int cur_component_begin = begin;  // Start of the current component.

  // Scan through the input, searching for hex components, "::" contractions,
  // and IPv4 components.
  for (int i = begin; /* i <= end */; i++) {
    bool is_colon = spec[i] == ':';
    bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':';

    // We reached the end of the current component if we encounter a colon
    // (separator between hex components, or start of a contraction), or end of
    // input.
    if (is_colon || i == end) {
      int component_len = i - cur_component_begin;

      // A component should not have more than 4 hex digits.
      if (component_len > 4)
        return false;

      // Don't allow empty components.
      if (component_len == 0) {
        // The exception is when contractions appear at beginning of the
        // input or at the end of the input.
        if (!((is_contraction && i == begin) || (i == end &&
            parsed->index_of_contraction == parsed->num_hex_components)))
          return false;
      }

      // Add the hex component we just found to running list.
      if (component_len > 0) {
        // Can't have more than 8 components!
        if (parsed->num_hex_components >= 8)
          return false;

        parsed->hex_components[parsed->num_hex_components++] =
            Component(cur_component_begin, component_len);
      }
    }

    if (i == end)
      break;  // Reached the end of the input, DONE.

    // We found a "::" contraction.
    if (is_contraction) {
      // There can be at most one contraction in the literal.
      if (parsed->index_of_contraction != -1)
        return false;
      parsed->index_of_contraction = parsed->num_hex_components;
      ++i;  // Consume the colon we peeked.
    }

    if (is_colon) {
      // Colons are separators between components, keep track of where the
      // current component started (after this colon).
      cur_component_begin = i + 1;
    } else {
      if (static_cast<UCHAR>(spec[i]) >= 0x80)
        return false;  // Not ASCII.

      if (!IsHexChar(static_cast<unsigned char>(spec[i]))) {
        // Regular components are hex numbers. It is also possible for
        // a component to be an IPv4 address in dotted form.
        if (IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
          // Since IPv4 address can only appear at the end, assume the rest
          // of the string is an IPv4 address. (We will parse this separately
          // later).
          parsed->ipv4_component =
              Component(cur_component_begin, end - cur_component_begin);
          break;
        } else {
          // The character was neither a hex digit, nor an IPv4 character.
          return false;
        }
      }
    }
  }

  return true;
}

// Verifies the parsed IPv6 information, checking that the various components
// add up to the right number of bits (hex components are 16 bits, while
// embedded IPv4 formats are 32 bits, and contractions are placeholders for
// 16 or more bits). Returns true if sizes match up, false otherwise. On
// success writes the length of the contraction (if any) to
// |out_num_bytes_of_contraction|.
bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed,
                             int* out_num_bytes_of_contraction) {
  // Each group of four hex digits contributes 16 bits.
  int num_bytes_without_contraction = parsed.num_hex_components * 2;

  // If an IPv4 address was embedded at the end, it contributes 32 bits.
  if (parsed.ipv4_component.is_valid())
    num_bytes_without_contraction += 4;

  // If there was a "::" contraction, its size is going to be:
  // MAX([16bits], [128bits] - num_bytes_without_contraction).
  // Clamping to at least 2 bytes makes an address that is already full (or
  // over-full) fail the sum check below.
  int num_bytes_of_contraction = 0;
  if (parsed.index_of_contraction != -1) {
    num_bytes_of_contraction = 16 - num_bytes_without_contraction;
    if (num_bytes_of_contraction < 2)
      num_bytes_of_contraction = 2;
  }

  // Check that the numbers add up.
  if (num_bytes_without_contraction + num_bytes_of_contraction != 16)
    return false;

  *out_num_bytes_of_contraction = num_bytes_of_contraction;
  return true;
}

// Converts a hex component into a number. This cannot fail since the caller has
// already verified that each character in the string was a hex digit, and
// that there were no more than 4 characters.
template <typename CHAR>
uint16_t IPv6HexComponentToNumber(const CHAR* spec,
                                  const Component& component) {
  // The length bound is what keeps the copy below inside |buf|.
  GURL_DCHECK(component.len <= 4);

  // Copy the hex string into a C-string.
  char buf[5];
  for (int i = 0; i < component.len; ++i)
    buf[i] = static_cast<char>(spec[component.begin + i]);
  buf[component.len] = '\0';

  // Convert it to a number (overflow is not possible, since with 4 hex
  // characters we can at most have a 16 bit number).
  return static_cast<uint16_t>(_strtoui64(buf, NULL, 16));
}

// Converts an IPv6 address to a 128-bit number (network byte order), returning
// true on success. False means that the input was not a valid IPv6 address.
// |address| may have been partially written when false is returned.
template<typename CHAR, typename UCHAR>
bool DoIPv6AddressToNumber(const CHAR* spec,
                           const Component& host,
                           unsigned char address[16]) {
  // Make sure the component is bounded by '[' and ']'.
  int end = host.end();
  if (!host.is_nonempty() || spec[host.begin] != '[' || spec[end - 1] != ']')
    return false;

  // Exclude the square brackets.
  Component ipv6_comp(host.begin + 1, host.len - 2);

  // Parse the IPv6 address -- identify where all the colon separated hex
  // components are, the "::" contraction, and the embedded IPv4 address.
  IPv6Parsed ipv6_parsed;
  if (!DoParseIPv6<CHAR, UCHAR>(spec, ipv6_comp, &ipv6_parsed))
    return false;

  // Do some basic size checks to make sure that the address doesn't
  // specify more than 128 bits or fewer than 128 bits. This also resolves
  // how many zero bytes the "::" contraction represents.
  int num_bytes_of_contraction;
  if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction))
    return false;

  int cur_index_in_address = 0;

  // Loop through each hex components, and contraction in order. The loop runs
  // one step past the last hex component so that a contraction following the
  // final component is still emitted.
  for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) {
    // Append the contraction if it appears before this component.
    if (i == ipv6_parsed.index_of_contraction) {
      for (int j = 0; j < num_bytes_of_contraction; ++j)
        address[cur_index_in_address++] = 0;
    }
    // Append the hex component's value.
    if (i != ipv6_parsed.num_hex_components) {
      // Get the 16-bit value for this hex component.
      uint16_t number = IPv6HexComponentToNumber<CHAR>(
          spec, ipv6_parsed.hex_components[i]);
      // Append to |address|, in network byte order.
      address[cur_index_in_address++] = (number & 0xFF00) >> 8;
      address[cur_index_in_address++] = (number & 0x00FF);
    }
  }

  // If there was an IPv4 section, convert it into a 32-bit number and append
  // it to |address|.
  if (ipv6_parsed.ipv4_component.is_valid()) {
    // Append the 32-bit number to |address|.
    int ignored_num_ipv4_components;
    if (CanonHostInfo::IPV4 !=
        IPv4AddressToNumber(spec,
                            ipv6_parsed.ipv4_component,
                            &address[cur_index_in_address],
                            &ignored_num_ipv4_components))
      return false;
  }

  return true;
}

// Searches for the longest sequence of zeros in |address|, and writes the
// range into |contraction_range|. The run of zeros must be longer than 16 bits
// (i.e. at least two consecutive zero 16-bit groups), and if there is a tie
// the first is chosen. If no run qualifies, a default-constructed Component is
// written.
void ChooseIPv6ContractionRange(const unsigned char address[16],
                                Component* contraction_range) {
  // The longest run of zeros in |address| seen so far.
  Component max_range;

  // The current run of zeros in |address| being iterated over.
  Component cur_range;

  for (int i = 0; i < 16; i += 2) {
    // Test for 16 bits worth of zero.
    bool is_zero = (address[i] == 0 && address[i + 1] == 0);

    if (is_zero) {
      // Add the zero to the current range (or start a new one).
      if (!cur_range.is_valid())
        cur_range = Component(i, 0);
      cur_range.len += 2;
    }

    if (!is_zero || i == 14) {
      // Just completed a run of zeros. If the run is greater than 16 bits,
      // it is a candidate for the contraction. The strict '>' against
      // |max_range| is what makes the first of several equal runs win.
      if (cur_range.len > 2 && cur_range.len > max_range.len) {
        max_range = cur_range;
      }
      cur_range.reset();
    }
  }
  *contraction_range = max_range;
}

// Return true if we've made a final IPV6/BROKEN decision, false if the result
// is NEUTRAL, and we could use a second opinion.
+template<typename CHAR, typename UCHAR> +bool DoCanonicalizeIPv6Address(const CHAR* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + // Turn the IP address into a 128 bit number. + if (!IPv6AddressToNumber(spec, host, host_info->address)) { + // If it's not an IPv6 address, scan for characters that should *only* + // exist in an IPv6 address. + for (int i = host.begin; i < host.end(); i++) { + switch (spec[i]) { + case '[': + case ']': + case ':': + host_info->family = CanonHostInfo::BROKEN; + return true; + } + } + + // No invalid characters. Could still be IPv4 or a hostname. + host_info->family = CanonHostInfo::NEUTRAL; + return false; + } + + host_info->out_host.begin = output->length(); + output->push_back('['); + AppendIPv6Address(host_info->address, output); + output->push_back(']'); + host_info->out_host.len = output->length() - host_info->out_host.begin; + + host_info->family = CanonHostInfo::IPV6; + return true; +} + +} // namespace + +void AppendIPv4Address(const unsigned char address[4], CanonOutput* output) { + for (int i = 0; i < 4; i++) { + char str[16]; + _itoa_s(address[i], str, 10); + + for (int ch = 0; str[ch] != 0; ch++) + output->push_back(str[ch]); + + if (i != 3) + output->push_back('.'); + } +} + +void AppendIPv6Address(const unsigned char address[16], CanonOutput* output) { + // We will output the address according to the rules in: + // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4 + + // Start by finding where to place the "::" contraction (if any). + Component contraction_range; + ChooseIPv6ContractionRange(address, &contraction_range); + + for (int i = 0; i <= 14;) { + // We check 2 bytes at a time, from bytes (0, 1) to (14, 15), inclusive. + GURL_DCHECK(i % 2 == 0); + if (i == contraction_range.begin && contraction_range.len > 0) { + // Jump over the contraction. 
+ if (i == 0) + output->push_back(':'); + output->push_back(':'); + i = contraction_range.end(); + } else { + // Consume the next 16 bits from |address|. + int x = address[i] << 8 | address[i + 1]; + + i += 2; + + // Stringify the 16 bit number (at most requires 4 hex digits). + char str[5]; + _itoa_s(x, str, 16); + for (int ch = 0; str[ch] != 0; ++ch) + output->push_back(str[ch]); + + // Put a colon after each number, except the last. + if (i < 16) + output->push_back(':'); + } + } +} + +bool FindIPv4Components(const char* spec, + const Component& host, + Component components[4]) { + return DoFindIPv4Components<char, unsigned char>(spec, host, components); +} + +bool FindIPv4Components(const gurl_base::char16* spec, + const Component& host, + Component components[4]) { + return DoFindIPv4Components<gurl_base::char16, gurl_base::char16>( + spec, host, components); +} + +void CanonicalizeIPAddress(const char* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (DoCanonicalizeIPv4Address<char, unsigned char>( + spec, host, output, host_info)) + return; + if (DoCanonicalizeIPv6Address<char, unsigned char>( + spec, host, output, host_info)) + return; +} + +void CanonicalizeIPAddress(const gurl_base::char16* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (DoCanonicalizeIPv4Address<gurl_base::char16, gurl_base::char16>( + spec, host, output, host_info)) + return; + if (DoCanonicalizeIPv6Address<gurl_base::char16, gurl_base::char16>( + spec, host, output, host_info)) + return; +} + +CanonHostInfo::Family IPv4AddressToNumber(const char* spec, + const Component& host, + unsigned char address[4], + int* num_ipv4_components) { + return DoIPv4AddressToNumber<char>(spec, host, address, num_ipv4_components); +} + +CanonHostInfo::Family IPv4AddressToNumber(const gurl_base::char16* spec, + const Component& host, + unsigned char address[4], + int* num_ipv4_components) { + return 
DoIPv4AddressToNumber<gurl_base::char16>( + spec, host, address, num_ipv4_components); +} + +bool IPv6AddressToNumber(const char* spec, + const Component& host, + unsigned char address[16]) { + return DoIPv6AddressToNumber<char, unsigned char>(spec, host, address); +} + +bool IPv6AddressToNumber(const gurl_base::char16* spec, + const Component& host, + unsigned char address[16]) { + return DoIPv6AddressToNumber<gurl_base::char16, gurl_base::char16>(spec, host, address); +} + +} // namespace url
diff --git a/url/url_canon_ip.h b/url/url_canon_ip.h new file mode 100644 index 0000000..5d93f28 --- /dev/null +++ b/url/url_canon_ip.h
@@ -0,0 +1,88 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_CANON_IP_H_ +#define URL_URL_CANON_IP_H_ + +#include "polyfills/base/component_export.h" +#include "base/strings/string16.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" + +namespace url { + +// Writes the given IPv4 address to |output|. +COMPONENT_EXPORT(URL) +void AppendIPv4Address(const unsigned char address[4], CanonOutput* output); + +// Writes the given IPv6 address to |output|. +COMPONENT_EXPORT(URL) +void AppendIPv6Address(const unsigned char address[16], CanonOutput* output); + +// Searches the host name for the portions of the IPv4 address. On success, +// each component will be placed into |components| and it will return true. +// It will return false if the host can not be separated as an IPv4 address +// or if there are any non-7-bit characters or other characters that can not +// be in an IP address. (This is important so we fail as early as possible for +// common non-IP hostnames.) +// +// Not all components may exist. If there are only 3 components, for example, +// the last one will have a length of -1 or 0 to indicate it does not exist. +// +// Note that many platforms' inet_addr will ignore everything after a space +// in certain circumstances if the stuff before the space looks like an IP +// address. IE6 is included in this. We do NOT handle this case. In many cases, +// the browser's canonicalization will get run before this which converts +// spaces to %20 (in the case of IE7) or rejects them (in the case of Mozilla), +// so this code path never gets hit. Our host canonicalization will notice +// these spaces and escape them, which will make IP address finding fail. This +// seems like better behavior than stripping after a space. 
+COMPONENT_EXPORT(URL) +bool FindIPv4Components(const char* spec, + const Component& host, + Component components[4]); +COMPONENT_EXPORT(URL) +bool FindIPv4Components(const gurl_base::char16* spec, + const Component& host, + Component components[4]); + +// Converts an IPv4 address to a 32-bit number (network byte order). +// +// Possible return values: +// IPV4 - IPv4 address was successfully parsed. +// BROKEN - Input was formatted like an IPv4 address, but overflow occurred +// during parsing. +// NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address. +// It might be an IPv6 address, or a hostname. +// +// On success, |num_ipv4_components| will be populated with the number of +// components in the IPv4 address. +COMPONENT_EXPORT(URL) +CanonHostInfo::Family IPv4AddressToNumber(const char* spec, + const Component& host, + unsigned char address[4], + int* num_ipv4_components); +COMPONENT_EXPORT(URL) +CanonHostInfo::Family IPv4AddressToNumber(const gurl_base::char16* spec, + const Component& host, + unsigned char address[4], + int* num_ipv4_components); + +// Converts an IPv6 address to a 128-bit number (network byte order), returning +// true on success. False means that the input was not a valid IPv6 address. +// +// NOTE that |host| is expected to be surrounded by square brackets. +// i.e. "[::1]" rather than "::1". +COMPONENT_EXPORT(URL) +bool IPv6AddressToNumber(const char* spec, + const Component& host, + unsigned char address[16]); +COMPONENT_EXPORT(URL) +bool IPv6AddressToNumber(const gurl_base::char16* spec, + const Component& host, + unsigned char address[16]); + +} // namespace url + +#endif // URL_URL_CANON_IP_H_
diff --git a/url/url_canon_mailtourl.cc b/url/url_canon_mailtourl.cc new file mode 100644 index 0000000..f09faa7 --- /dev/null +++ b/url/url_canon_mailtourl.cc
@@ -0,0 +1,127 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Functions for canonicalizing "mailto:" URLs. + +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_file.h" +#include "url/url_parse_internal.h" + +namespace url { + +namespace { + +// Certain characters should be percent-encoded when they appear in the path +// component of a mailto URL, to improve compatibility and mitigate against +// command-injection attacks on mailto handlers. See https://crbug.com/711020. +template <typename UCHAR> +bool ShouldEncodeMailboxCharacter(UCHAR uch) { + if (uch < 0x21 || // space & control characters. + uch > 0x7e || // high-ascii characters. + uch == 0x22 || // quote. + uch == 0x3c || uch == 0x3e || // angle brackets. + uch == 0x60 || // backtick. + uch == 0x7b || uch == 0x7c || uch == 0x7d // braces and pipe. + ) { + return true; + } + return false; +} + +template <typename CHAR, typename UCHAR> +bool DoCanonicalizeMailtoURL(const URLComponentSource<CHAR>& source, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed) { + // mailto: only uses {scheme, path, query} -- clear the rest. + new_parsed->username = Component(); + new_parsed->password = Component(); + new_parsed->host = Component(); + new_parsed->port = Component(); + new_parsed->ref = Component(); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). + new_parsed->scheme.begin = output->length(); + output->Append("mailto:", 7); + new_parsed->scheme.len = 6; + + bool success = true; + + // Path + if (parsed.path.is_valid()) { + new_parsed->path.begin = output->length(); + + // Copy the path using path URL's more lax escaping rules. + // We convert to UTF-8 and escape non-ASCII, but leave most + // ASCII characters alone. 
+ int end = parsed.path.end(); + for (int i = parsed.path.begin; i < end; ++i) { + UCHAR uch = static_cast<UCHAR>(source.path[i]); + if (ShouldEncodeMailboxCharacter<UCHAR>(uch)) + success &= AppendUTF8EscapedChar(source.path, &i, end, output); + else + output->push_back(static_cast<char>(uch)); + } + + new_parsed->path.len = output->length() - new_parsed->path.begin; + } else { + // No path at all + new_parsed->path.reset(); + } + + // Query -- always use the default UTF8 charset converter. + CanonicalizeQuery(source.query, parsed.query, NULL, + output, &new_parsed->query); + + return success; +} + +} // namespace + +bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeMailtoURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, output, new_parsed); +} + +bool CanonicalizeMailtoURL(const gurl_base::char16* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeMailtoURL<gurl_base::char16, gurl_base::char16>( + URLComponentSource<gurl_base::char16>(spec), parsed, output, new_parsed); +} + +bool ReplaceMailtoURL(const char* base, + const Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + Parsed* new_parsed) { + URLComponentSource<char> source(base); + Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeMailtoURL<char, unsigned char>( + source, parsed, output, new_parsed); +} + +bool ReplaceMailtoURL(const char* base, + const Parsed& base_parsed, + const Replacements<gurl_base::char16>& replacements, + CanonOutput* output, + Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeMailtoURL<char, unsigned char>( + source, parsed, 
output, new_parsed); +} + +} // namespace url
diff --git a/url/url_canon_path.cc b/url/url_canon_path.cc new file mode 100644 index 0000000..ee18aa2 --- /dev/null +++ b/url/url_canon_path.cc
@@ -0,0 +1,437 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <limits.h> + +#include "polyfills/base/logging.h" +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_parse_internal.h" + +namespace url { + +namespace { + +enum CharacterFlags { + // Pass through unchanged, whether escaped or unescaped. This doesn't + // actually set anything so you can't OR it to check, it's just to make the + // table below more clear when neither ESCAPE nor UNESCAPE is set. + PASS = 0, + + // This character requires special handling in DoPartialPath. Doing this test + // first allows us to filter out the common cases of regular characters that + // can be directly copied. + SPECIAL = 1, + + // This character must be escaped in the canonical output. Note that all + // escaped chars also have the "special" bit set so that the code that looks + // for this is triggered. Not valid with PASS or UNESCAPE. + ESCAPE_BIT = 2, + ESCAPE = ESCAPE_BIT | SPECIAL, + + // This character must be unescaped in canonical output. Not valid with + // ESCAPE or PASS. We DON'T set the SPECIAL flag since if we encounter these + // characters unescaped, they should just be copied. + UNESCAPE = 4, + + // This character is disallowed in URLs. Note that the "special" bit is also + // set to trigger handling. + INVALID_BIT = 8, + INVALID = INVALID_BIT | SPECIAL, +}; + +// This table contains one of the above flag values. Note some flags are more +// than one bit because they also turn on the "special" flag. Special is the +// only flag that may be combined with others. +// +// This table is designed to match exactly what IE does with the characters. +// +// Dot is even more special, and the escaped version is handled specially by +// IsDot. 
Therefore, we don't need the "escape" flag, and even the "unescape" +// bit is never handled (we just need the "special" bit). +const unsigned char kPathCharLookup[0x100] = { +// NULL control chars... + INVALID, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, +// control chars... + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, +// ' ' ! " # $ % & ' ( ) * + , - . / + ESCAPE, PASS, ESCAPE, ESCAPE, PASS, ESCAPE, PASS, PASS, PASS, PASS, PASS, PASS, PASS, UNESCAPE,SPECIAL, PASS, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE, +// @ A B C D E F G H I J K L M N O + PASS, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, +// P Q R S T U V W X Y Z [ \ ] ^ _ + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE, +// ` a b c d e f g h i j k l m n o + ESCAPE, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, +// p q r s t u v w x y z { | } ~ <NBSP> + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE, ESCAPE, ESCAPE, UNESCAPE,ESCAPE, +// ...all the high-bit characters are escaped + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, 
ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE}; + +enum DotDisposition { + // The given dot is just part of a filename and is not special. + NOT_A_DIRECTORY, + + // The given dot is the current directory. + DIRECTORY_CUR, + + // The given dot is the first of a double dot that should take us up one. + DIRECTORY_UP +}; + +// When the path resolver finds a dot, this function is called with the +// character following that dot to see what it is. The return value +// indicates what type this dot is (see above). This code handles the case +// where the dot is at the end of the input. +// +// |*consumed_len| will contain the number of characters in the input that +// express what we found. +// +// If the input is "../foo", |after_dot| = 1, |end| = 6, and +// at the end, |*consumed_len| = 2 for the "./" this function consumed. The +// original dot length should be handled by the caller. +template<typename CHAR> +DotDisposition ClassifyAfterDot(const CHAR* spec, int after_dot, + int end, int* consumed_len) { + if (after_dot == end) { + // Single dot at the end. + *consumed_len = 0; + return DIRECTORY_CUR; + } + if (IsURLSlash(spec[after_dot])) { + // Single dot followed by a slash. 
+ *consumed_len = 1; // Consume the slash + return DIRECTORY_CUR; + } + + int second_dot_len = IsDot(spec, after_dot, end); + if (second_dot_len) { + int after_second_dot = after_dot + second_dot_len; + if (after_second_dot == end) { + // Double dot at the end. + *consumed_len = second_dot_len; + return DIRECTORY_UP; + } + if (IsURLSlash(spec[after_second_dot])) { + // Double dot followed by a slash. + *consumed_len = second_dot_len + 1; + return DIRECTORY_UP; + } + } + + // The dots are followed by something else, not a directory. + *consumed_len = 0; + return NOT_A_DIRECTORY; +} + +// Rewinds the output to the previous slash. It is assumed that the output +// ends with a slash and this doesn't count (we call this when we are +// appending directory paths, so the previous path component has an ending +// slash). +// +// This will stop at the first slash (assumed to be at position +// |path_begin_in_output|) and not go any higher than that. Some web pages +// do ".." too many times, so we need to handle that brokenness. +// +// It searches for a literal slash rather than including a backslash as well +// because it is run only on the canonical output. +// +// The output is guaranteed to end in a slash when this function completes. +void BackUpToPreviousSlash(int path_begin_in_output, + CanonOutput* output) { + GURL_DCHECK(output->length() > 0); + + int i = output->length() - 1; + GURL_DCHECK(output->at(i) == '/'); + if (i == path_begin_in_output) + return; // We're at the first slash, nothing to do. + + // Now back up (skipping the trailing slash) until we find another slash. + i--; + while (output->at(i) != '/' && i > path_begin_in_output) + i--; + + // Now shrink the output to just include that last slash we found. + output->set_length(i + 1); +} + +// Looks for problematic nested escape sequences and escapes the output as +// needed to ensure they can't be misinterpreted. 
+// +// Our concern is that an input escape sequence that's invalid because it +// contains nested escape sequences might look valid once those are unescaped. +// For example, "%%300" is not a valid escape sequence, but after unescaping the +// inner "%30" this becomes "%00" which is valid. Leaving this in the output +// string can result in callers re-canonicalizing the string and unescaping this +// sequence, thus resulting in something fundamentally different than the +// original input here. This can cause a variety of problems. +// +// This function is called after we've just unescaped a sequence that's within +// two output characters of a previous '%' that we know didn't begin a valid +// escape sequence in the input string. We look for whether the output is going +// to turn into a valid escape sequence, and if so, convert the initial '%' into +// an escaped "%25" so the output can't be misinterpreted. +// +// |spec| is the input string we're canonicalizing. +// |next_input_index| is the index of the next unprocessed character in |spec|. +// |input_len| is the length of |spec|. +// |last_invalid_percent_index| is the index in |output| of a previously-seen +// '%' character. The caller knows this '%' character isn't followed by a valid +// escape sequence in the input string. +// |output| is the canonicalized output thus far. The caller guarantees this +// ends with a '%' followed by one or two characters, and the '%' is the one +// pointed to by |last_invalid_percent_index|. The last character in the string +// was just unescaped. 
+template<typename CHAR> +void CheckForNestedEscapes(const CHAR* spec, + int next_input_index, + int input_len, + int last_invalid_percent_index, + CanonOutput* output) { + const int length = output->length(); + const char last_unescaped_char = output->at(length - 1); + + // If |output| currently looks like "%c", we need to try appending the next + // input character to see if this will result in a problematic escape + // sequence. Note that this won't trigger on the first nested escape of a + // two-escape sequence like "%%30%30" -- we'll allow the conversion to + // "%0%30" -- but the second nested escape will be caught by this function + // when it's called again in that case. + const bool append_next_char = last_invalid_percent_index == length - 2; + if (append_next_char) { + // If the input doesn't contain a 7-bit character next, this case won't be a + // problem. + if ((next_input_index == input_len) || (spec[next_input_index] >= 0x80)) + return; + output->push_back(static_cast<char>(spec[next_input_index])); + } + + // Now output ends like "%cc". Try to unescape this. + int begin = last_invalid_percent_index; + unsigned char temp; + if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) { + // New escape sequence found. Overwrite the characters following the '%' + // with "25", and push_back() the one or two characters that were following + // the '%' when we were called. + if (!append_next_char) + output->push_back(output->at(last_invalid_percent_index + 1)); + output->set(last_invalid_percent_index + 1, '2'); + output->set(last_invalid_percent_index + 2, '5'); + output->push_back(last_unescaped_char); + } else if (append_next_char) { + // Not a valid escape sequence, but we still need to undo appending the next + // source character so the caller can process it normally. + output->set_length(length); + } +} + +// Appends the given path to the output. It assumes that if the input path +// starts with a slash, it should be copied to the output. 
If no path has +// already been appended to the output (the case when not resolving +// relative URLs), the path should begin with a slash. +// +// If there are already path components (this mode is used when appending +// relative paths for resolving), it assumes that the output already has +// a trailing slash and that if the input begins with a slash, it should be +// copied to the output. +// +// We do not collapse multiple slashes in a row to a single slash. It seems +// no web browsers do this, and we don't want incompatibilities, even though +// it would be correct for most systems. +template<typename CHAR, typename UCHAR> +bool DoPartialPath(const CHAR* spec, + const Component& path, + int path_begin_in_output, + CanonOutput* output) { + int end = path.end(); + + // We use this variable to minimize the amount of work done when unescaping -- + // we'll only call CheckForNestedEscapes() when this points at one of the last + // couple of characters in |output|. + int last_invalid_percent_index = INT_MIN; + + bool success = true; + for (int i = path.begin; i < end; i++) { + UCHAR uch = static_cast<UCHAR>(spec[i]); + if (sizeof(CHAR) > 1 && uch >= 0x80) { + // We only need to test wide input for having non-ASCII characters. For + // narrow input, we'll always just use the lookup table. We don't try to + // do anything tricky with decoding/validating UTF-8. This function will + // read one or two UTF-16 characters and append the output as UTF-8. This + // call will be removed in 8-bit mode. + success &= AppendUTF8EscapedChar(spec, &i, end, output); + } else { + // Normal ASCII character or 8-bit input, use the lookup table. + unsigned char out_ch = static_cast<unsigned char>(uch); + unsigned char flags = kPathCharLookup[out_ch]; + if (flags & SPECIAL) { + // Needs special handling of some sort. + int dotlen; + if ((dotlen = IsDot(spec, i, end)) > 0) { + // See if this dot was preceded by a slash in the output. 
We + // assume that when canonicalizing paths, they will always + // start with a slash and not a dot, so we don't have to + // bounds check the output. + // + // Note that we check this in the case of dots so we don't have to + // special case slashes. Since slashes are much more common than + // dots, this actually increases performance measurably (though + // slightly). + GURL_DCHECK(output->length() > path_begin_in_output); + if (output->length() > path_begin_in_output && + output->at(output->length() - 1) == '/') { + // Slash followed by a dot, check to see if this means relative + int consumed_len; + switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end, + &consumed_len)) { + case NOT_A_DIRECTORY: + // Copy the dot to the output, it means nothing special. + output->push_back('.'); + i += dotlen - 1; + break; + case DIRECTORY_CUR: // Current directory, just skip the input. + i += dotlen + consumed_len - 1; + break; + case DIRECTORY_UP: + BackUpToPreviousSlash(path_begin_in_output, output); + i += dotlen + consumed_len - 1; + break; + } + } else { + // This dot is not preceded by a slash, it is just part of some + // file name. + output->push_back('.'); + i += dotlen - 1; + } + + } else if (out_ch == '\\') { + // Convert backslashes to forward slashes + output->push_back('/'); + + } else if (out_ch == '%') { + // Handle escape sequences. + unsigned char unescaped_value; + if (DecodeEscaped(spec, &i, end, &unescaped_value)) { + // Valid escape sequence, see if we keep, reject, or unescape it. + // Note that at this point DecodeEscaped() will have advanced |i| to + // the last character of the escape sequence. + char unescaped_flags = kPathCharLookup[unescaped_value]; + + if (unescaped_flags & UNESCAPE) { + // This escaped value shouldn't be escaped. Try to copy it. 
+ output->push_back(unescaped_value); + // If we just unescaped a value within 2 output characters of the + // '%' from a previously-detected invalid escape sequence, we + // might have an input string with problematic nested escape + // sequences; detect and fix them. + if (last_invalid_percent_index >= (output->length() - 3)) { + CheckForNestedEscapes(spec, i + 1, end, + last_invalid_percent_index, output); + } + } else { + // Either this is an invalid escaped character, or it's a valid + // escaped character we should keep escaped. In the first case we + // should just copy it exactly and remember the error. In the + // second we also copy exactly in case the server is sensitive to + // changing the case of any hex letters. + output->push_back('%'); + output->push_back(static_cast<char>(spec[i - 1])); + output->push_back(static_cast<char>(spec[i])); + if (unescaped_flags & INVALID_BIT) + success = false; + } + } else { + // Invalid escape sequence. IE7+ rejects any URLs with such + // sequences, while other browsers pass them through unchanged. We + // use the permissive behavior. + // TODO(brettw): Consider testing IE's strict behavior, which would + // allow removing the code to handle nested escapes above. + last_invalid_percent_index = output->length(); + output->push_back('%'); + } + + } else if (flags & INVALID_BIT) { + // For NULLs, etc. fail. + AppendEscapedChar(out_ch, output); + success = false; + + } else if (flags & ESCAPE_BIT) { + // This character should be escaped. + AppendEscapedChar(out_ch, output); + } + } else { + // Nothing special about this character, just append it. + output->push_back(out_ch); + } + } + } + return success; +} + +template<typename CHAR, typename UCHAR> +bool DoPath(const CHAR* spec, + const Component& path, + CanonOutput* output, + Component* out_path) { + bool success = true; + out_path->begin = output->length(); + if (path.len > 0) { + // Write out an initial slash if the input has none. 
If we just parse a URL + // and then canonicalize it, it will of course have a slash already. This + // check is for the replacement and relative URL resolving cases of file + // URLs. + if (!IsURLSlash(spec[path.begin])) + output->push_back('/'); + + success = DoPartialPath<CHAR, UCHAR>(spec, path, out_path->begin, output); + } else { + // No input, canonical path is a slash. + output->push_back('/'); + } + out_path->len = output->length() - out_path->begin; + return success; +} + +} // namespace + +bool CanonicalizePath(const char* spec, + const Component& path, + CanonOutput* output, + Component* out_path) { + return DoPath<char, unsigned char>(spec, path, output, out_path); +} + +bool CanonicalizePath(const gurl_base::char16* spec, + const Component& path, + CanonOutput* output, + Component* out_path) { + return DoPath<gurl_base::char16, gurl_base::char16>(spec, path, output, out_path); +} + +bool CanonicalizePartialPath(const char* spec, + const Component& path, + int path_begin_in_output, + CanonOutput* output) { + return DoPartialPath<char, unsigned char>(spec, path, path_begin_in_output, + output); +} + +bool CanonicalizePartialPath(const gurl_base::char16* spec, + const Component& path, + int path_begin_in_output, + CanonOutput* output) { + return DoPartialPath<gurl_base::char16, gurl_base::char16>(spec, path, + path_begin_in_output, + output); +} + +} // namespace url
diff --git a/url/url_canon_pathurl.cc b/url/url_canon_pathurl.cc new file mode 100644 index 0000000..62fe22f --- /dev/null +++ b/url/url_canon_pathurl.cc
@@ -0,0 +1,122 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Functions for canonicalizing "path" URLs. Not to be confused with the path +// of a URL, these are URLs that have no authority section, only a path. For +// example, "javascript:" and "data:". + +#include "url/url_canon.h" +#include "url/url_canon_internal.h" + +namespace url { + +namespace { + +// Canonicalize the given |component| from |source| into |output| and +// |new_component|. If |separator| is non-zero, it is pre-pended to |output| +// prior to the canonicalized component; i.e. for the '?' or '#' characters. +template <typename CHAR, typename UCHAR> +void DoCanonicalizePathComponent(const CHAR* source, + const Component& component, + char separator, + CanonOutput* output, + Component* new_component) { + if (component.is_valid()) { + if (separator) + output->push_back(separator); + // Copy the path using path URL's more lax escaping rules (think for + // javascript:). We convert to UTF-8 and escape non-ASCII and control (< 0x20) + // characters, but leave all other ASCII alone. This helps JavaScript readability. + new_component->begin = output->length(); + int end = component.end(); + for (int i = component.begin; i < end; i++) { + UCHAR uch = static_cast<UCHAR>(source[i]); + if (uch < 0x20 || uch >= 0x80) + AppendUTF8EscapedChar(source, &i, end, output); + else + output->push_back(static_cast<char>(uch)); + } + new_component->len = output->length() - new_component->begin; + } else { + // Empty part. + new_component->reset(); + } +} + +template <typename CHAR, typename UCHAR> +bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed) { + // Scheme: this will append the colon.
+ bool success = CanonicalizeScheme(source.scheme, parsed.scheme, + output, &new_parsed->scheme); + + // We assume there's no authority for path URLs. Note that hosts should never + // have -1 length. + new_parsed->username.reset(); + new_parsed->password.reset(); + new_parsed->host.reset(); + new_parsed->port.reset(); + // We allow path URLs to have the path, query and fragment components, but we + // will canonicalize each of them via the weaker path URL rules. + // + // Note: parsing the path part should never cause a failure, see + // https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state + DoCanonicalizePathComponent<CHAR, UCHAR>(source.path, parsed.path, '\0', + output, &new_parsed->path); + DoCanonicalizePathComponent<CHAR, UCHAR>(source.query, parsed.query, '?', + output, &new_parsed->query); + DoCanonicalizePathComponent<CHAR, UCHAR>(source.ref, parsed.ref, '#', output, + &new_parsed->ref); + + return success; +} + +} // namespace + +bool CanonicalizePathURL(const char* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizePathURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, output, new_parsed); +} + +bool CanonicalizePathURL(const gurl_base::char16* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizePathURL<gurl_base::char16, gurl_base::char16>( + URLComponentSource<gurl_base::char16>(spec), parsed, output, new_parsed); +} + +bool ReplacePathURL(const char* base, + const Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + Parsed* new_parsed) { + URLComponentSource<char> source(base); + Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizePathURL<char, unsigned char>( + source, parsed, output, new_parsed); +} + +bool ReplacePathURL(const char* base, + const Parsed& base_parsed, + const 
Replacements<gurl_base::char16>& replacements, + CanonOutput* output, + Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizePathURL<char, unsigned char>( + source, parsed, output, new_parsed); +} + +} // namespace url
diff --git a/url/url_canon_query.cc b/url/url_canon_query.cc new file mode 100644 index 0000000..99b8ed8 --- /dev/null +++ b/url/url_canon_query.cc
@@ -0,0 +1,164 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_canon.h" +#include "url/url_canon_internal.h" + +// Query canonicalization in IE +// ---------------------------- +// IE is very permissive for query parameters specified in links on the page +// (in contrast to links that it constructs itself based on form data). It does +// not unescape any character. It does not reject any escape sequence (be they +// invalid like "%2y" or freaky like %00). +// +// IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09), +// LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier +// layer since they are removed from all portions of the URL). All other +// characters are passed unmodified. Invalid UTF-16 sequences are preserved as +// well, with each character in the input being converted to UTF-8. It is the +// server's job to make sense of this invalid query. +// +// Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page) +// are converted to the invalid character and sent as unescaped UTF-8 (0xef, +// 0xbf, 0xbd). This may not be canonicalization, the parser may generate these +// strings before the URL handler ever sees them. +// +// Our query canonicalization +// -------------------------- +// We escape all non-ASCII characters and control characters, like Firefox. +// This is more conformant to the URL spec, and there do not seem to be many +// problems relating to Firefox's behavior. +// +// Like IE, we will never unescape (although the application may want to try +// unescaping to present the user with a more understandable URL). We will +// replace all invalid sequences (including invalid UTF-16 sequences, which IE +// doesn't) with the "invalid character," and we will escape it. 
+ +namespace url { + +namespace { + +// Returns true if the characters of |spec| covered by |query| (from +// query.begin up to, but not including, query.end()) are all 7-bit. +template<typename CHAR, typename UCHAR> +bool IsAllASCII(const CHAR* spec, const Component& query) { + int end = query.end(); + for (int i = query.begin; i < end; i++) { + if (static_cast<UCHAR>(spec[i]) >= 0x80) + return false; + } + return true; +} + +// Appends the given string to the output, escaping characters for which +// IsQueryChar() returns false. This version will accept 8 or 16 +// bit characters, but assumes that they have only 7-bit values. It also assumes +// that all UTF-8 values are correct, so doesn't bother checking. +template<typename CHAR> +void AppendRaw8BitQueryString(const CHAR* source, int length, + CanonOutput* output) { + for (int i = 0; i < length; i++) { + if (!IsQueryChar(static_cast<unsigned char>(source[i]))) + AppendEscapedChar(static_cast<unsigned char>(source[i]), output); + else // Doesn't need escaping. + output->push_back(static_cast<char>(source[i])); + } +} + +// Runs the converter on the given UTF-8 input. Since the converter expects +// UTF-16, we have to convert first. The converter must be non-NULL. +void RunConverter(const char* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output) { + // This function will replace any misencoded values with the invalid + // character. This is what we want so we don't have to check for error. + RawCanonOutputW<1024> utf16; + ConvertUTF8ToUTF16(&spec[query.begin], query.len, &utf16); + converter->ConvertFromUTF16(utf16.data(), utf16.length(), output); +} + +// Runs the converter with the given UTF-16 input. We don't have to do +// anything, but this overloaded function allows us to use the same code +// for both UTF-8 and UTF-16 input.
+void RunConverter(const gurl_base::char16* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output) { + converter->ConvertFromUTF16(&spec[query.begin], query.len, output); +} + +template<typename CHAR, typename UCHAR> +void DoConvertToQueryEncoding(const CHAR* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output) { + if (IsAllASCII<CHAR, UCHAR>(spec, query)) { + // Easy: the input can just be appended with no character set conversions. + AppendRaw8BitQueryString(&spec[query.begin], query.len, output); + + } else { + // Harder: convert to the proper encoding first. + if (converter) { + // Run the converter to get an 8-bit string, then append it, escaping + // necessary values. + RawCanonOutput<1024> eight_bit; + RunConverter(spec, query, converter, &eight_bit); + AppendRaw8BitQueryString(eight_bit.data(), eight_bit.length(), output); + + } else { + // No converter, do our own UTF-8 conversion. + AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output); + } + } +} + +template<typename CHAR, typename UCHAR> +void DoCanonicalizeQuery(const CHAR* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output, + Component* out_query) { + if (query.len < 0) { + *out_query = Component(); // No query in the input. + return; + } + + output->push_back('?'); // The '?' itself is not part of |out_query|. + out_query->begin = output->length(); + + DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output); + + out_query->len = output->length() - out_query->begin; +} + +} // namespace + +void CanonicalizeQuery(const char* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output, + Component* out_query) { + DoCanonicalizeQuery<char, unsigned char>(spec, query, converter, + output, out_query); +} + +void CanonicalizeQuery(const gurl_base::char16* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output, + Component* out_query) { + DoCanonicalizeQuery<gurl_base::char16, gurl_base::char16>(spec, 
query, converter, + output, out_query); +} + +void ConvertUTF16ToQueryEncoding(const gurl_base::char16* input, + const Component& query, + CharsetConverter* converter, + CanonOutput* output) { + DoConvertToQueryEncoding<gurl_base::char16, gurl_base::char16>(input, query, + converter, output); +} + +} // namespace url
diff --git a/url/url_canon_relative.cc b/url/url_canon_relative.cc new file mode 100644 index 0000000..47668f6 --- /dev/null +++ b/url/url_canon_relative.cc
@@ -0,0 +1,589 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Canonicalizer functions for working with and resolving relative URLs. + +#include <algorithm> + +#include "polyfills/base/logging.h" +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_constants.h" +#include "url/url_file.h" +#include "url/url_parse_internal.h" +#include "url/url_util.h" +#include "url/url_util_internal.h" + +namespace url { + +namespace { + +// Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug +// 379034), whereas IE is case-insensitive. +// +// We choose to be more permissive like IE. We don't need to worry about +// unescaping or anything here: neither IE nor Firefox allows this. We also +// don't have to worry about invalid scheme characters since we are comparing +// against the canonical scheme of the base. +// +// The base URL should always be canonical, therefore it should be ASCII. +template<typename CHAR> +bool AreSchemesEqual(const char* base, + const Component& base_scheme, + const CHAR* cmp, + const Component& cmp_scheme) { + if (base_scheme.len != cmp_scheme.len) + return false; + for (int i = 0; i < base_scheme.len; i++) { + // We assume the base is already canonical, so we don't have to + // canonicalize it. + if (CanonicalSchemeChar(cmp[cmp_scheme.begin + i]) != + base[base_scheme.begin + i]) + return false; + } + return true; +} + +#ifdef WIN32 + +// Here, we also allow Windows paths to be represented as "/C:/" so we can be +// consistent about URL paths beginning with slashes. This function is like +// DoesBeginWindowsDriveSpec except that it also requires a slash at the +// beginning.
+template<typename CHAR> +bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset, + int spec_len) { + if (start_offset >= spec_len) + return false; + return IsURLSlash(spec[start_offset]) && + DoesBeginWindowsDriveSpec(spec, start_offset + 1, spec_len); +} + +#endif // WIN32 + +// See IsRelativeURL in the header file for usage. +template<typename CHAR> +bool DoIsRelativeURL(const char* base, + const Parsed& base_parsed, + const CHAR* url, + int url_len, + bool is_base_hierarchical, + bool* is_relative, + Component* relative_component) { + *is_relative = false; // So we can default later to not relative. + + // Trim whitespace and construct a new range for the substring. + int begin = 0; + TrimURL(url, &begin, &url_len); + if (begin >= url_len) { + // Empty URLs are relative, but do nothing. + if (!is_base_hierarchical) { + // Don't allow relative URLs if the base scheme doesn't support it. + return false; + } + *relative_component = Component(begin, 0); + *is_relative = true; + return true; + } + +#ifdef WIN32 + // We special case paths like "C:\foo" so they can link directly to the + // file on Windows (IE compatibility). The security domain stuff should + // prevent a link like this from actually being followed if it's on a + // web page. + // + // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/" + // as relative, as this will just replace the path when the base scheme + // is a file and the answer will still be correct. + // + // We require strict backslashes when detecting UNC since two forward + // slashes should be treated as a relative URL with a hostname. + if (DoesBeginWindowsDriveSpec(url, begin, url_len) || + DoesBeginUNCPath(url, begin, url_len, true)) + return true; +#endif // WIN32 + + // See if we've got a scheme, if not, we know this is a relative URL. + // BUT, just because we have a scheme, doesn't make it absolute. + // "http:foo.html" is a relative URL with path "foo.html". 
If the scheme is + // empty, we treat it as relative (":foo"), like IE does. + Component scheme; + const bool scheme_is_empty = + !ExtractScheme(url, url_len, &scheme) || scheme.len == 0; + if (scheme_is_empty) { + if (url[begin] == '#') { + // |url| is a bare fragment (e.g. "#foo"). This can be resolved against + // any base. Fall-through. + } else if (!is_base_hierarchical) { + // Don't allow relative URLs if the base scheme doesn't support it. + return false; + } + + *relative_component = MakeRange(begin, url_len); + *is_relative = true; + return true; + } + + // If the scheme isn't valid, then it's relative. + int scheme_end = scheme.end(); + for (int i = scheme.begin; i < scheme_end; i++) { + if (!CanonicalSchemeChar(url[i])) { + if (!is_base_hierarchical) { + // Don't allow relative URLs if the base scheme doesn't support it. + return false; + } + *relative_component = MakeRange(begin, url_len); + *is_relative = true; + return true; + } + } + + // If the scheme is not the same, then we can't count it as relative. + if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme)) + return true; + + // When the scheme that they both share is not hierarchical, treat the + // incoming scheme as absolute (this way with the base of "data:foo", + // "data:bar" will be reported as absolute). + if (!is_base_hierarchical) + return true; + + int colon_offset = scheme.end(); + + // If it's a filesystem URL, the only valid way to make it relative is not to + // supply a scheme. There's no equivalent to e.g. http:index.html. + if (CompareSchemeComponent(url, scheme, kFileSystemScheme)) + return true; + + // ExtractScheme guarantees that the colon immediately follows what it + // considers to be the scheme. CountConsecutiveSlashes will handle the + // case where the begin offset is the end of the input.
+ int num_slashes = CountConsecutiveSlashes(url, colon_offset + 1, url_len); + + if (num_slashes == 0 || num_slashes == 1) { + // No slashes means it's a relative path like "http:foo.html". One slash + // is an absolute path. "http:/home/foo.html" + *is_relative = true; + *relative_component = MakeRange(colon_offset + 1, url_len); + return true; + } + + // Two or more slashes after the scheme we treat as absolute. + return true; +} + +// Copies all characters in the range [begin, end) of |spec| to the output, +// up until and including the last slash. There should be a slash in the +// range, if not, nothing will be copied. +// +// For standard URLs the input should be canonical, but when resolving relative +// URLs on a non-standard base (like "data:") the input can be anything. +void CopyToLastSlash(const char* spec, + int begin, + int end, + CanonOutput* output) { + // Find the last slash. + int last_slash = -1; + for (int i = end - 1; i >= begin; i--) { + if (spec[i] == '/' || spec[i] == '\\') { + last_slash = i; + break; + } + } + if (last_slash < 0) + return; // No slash. + + // Copy. + for (int i = begin; i <= last_slash; i++) + output->push_back(spec[i]); +} + +// Copies a single component from the source to the output. This is used +// when resolving relative URLs and a given component is unchanged. Since the +// source should already be canonical, we don't have to do anything special, +// and the input is ASCII. +void CopyOneComponent(const char* source, + const Component& source_component, + CanonOutput* output, + Component* output_component) { + if (source_component.len < 0) { + // This component is not present.
+ *output_component = Component(); + return; + } + + output_component->begin = output->length(); + int source_end = source_component.end(); + for (int i = source_component.begin; i < source_end; i++) + output->push_back(source[i]); + output_component->len = output->length() - output_component->begin; +} + +#ifdef WIN32 + +// Called on Windows when the base URL is a file URL, this will copy the "C:" +// to the output, if there is a drive letter and if that drive letter is not +// being overridden by the relative URL. Otherwise, do nothing. +// +// It will return the index of the beginning of the next character in the +// base to be processed: if there is a "C:", the slash after it, or if +// there is no drive letter, the slash at the beginning of the path, or +// the end of the base. This can be used as the starting offset for further +// path processing. +template<typename CHAR> +int CopyBaseDriveSpecIfNecessary(const char* base_url, + int base_path_begin, + int base_path_end, + const CHAR* relative_url, + int path_start, + int relative_url_len, + CanonOutput* output) { + if (base_path_begin >= base_path_end) + return base_path_begin; // No path. + + // If the relative begins with a drive spec, don't do anything. The existing + // drive spec in the base will be replaced. + if (DoesBeginWindowsDriveSpec(relative_url, path_start, relative_url_len)) { + return base_path_begin; // Relative URL path is "C:/foo" + } + + // The path should begin with a slash (as all canonical paths do). We check + // if it is followed by a drive letter and copy it. + if (DoesBeginSlashWindowsDriveSpec(base_url, + base_path_begin, + base_path_end)) { + // Copy the two-character drive spec to the output. It will now look like + // "file:///C:" so the rest of it can be treated like a standard path.
+ output->push_back('/'); + output->push_back(base_url[base_path_begin + 1]); + output->push_back(base_url[base_path_begin + 2]); + return base_path_begin + 3; + } + + return base_path_begin; +} + +#endif // WIN32 + +// A subroutine of DoResolveRelativeURL, this resolves the URL knowing that +// the input is a relative path or less (query or ref). +template<typename CHAR> +bool DoResolveRelativePath(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const CHAR* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + bool success = true; + + // We know the authority section didn't change, copy it to the output. We + // also know we have a path so can copy up to there. + Component path, query, ref; + ParsePathInternal(relative_url, relative_component, &path, &query, &ref); + + // Canonical URLs always have a path, so we can use that offset. Reserve + // enough room for the base URL, the new path, and some extra bytes for + // possible escaped characters. + output->ReserveSizeIfNeeded( + base_parsed.path.begin + + std::max(path.end(), std::max(query.end(), ref.end()))); + output->Append(base_url, base_parsed.path.begin); + + if (path.len > 0) { + // The path is replaced or modified. + int true_path_begin = output->length(); + + // For file: URLs on Windows, we don't want to treat the drive letter and + // colon as part of the path for relative file resolution when the + // incoming URL does not provide a drive spec. We save the true path + // beginning so we can fix it up after we are done.
+ int base_path_begin = base_parsed.path.begin; +#ifdef WIN32 + if (base_is_file) { + base_path_begin = CopyBaseDriveSpecIfNecessary( + base_url, base_parsed.path.begin, base_parsed.path.end(), + relative_url, relative_component.begin, relative_component.end(), + output); + // Now the output looks like either "file://" or "file:///C:" + // and we can start appending the rest of the path. |base_path_begin| + // points to the character in the base that comes next. + } +#endif // WIN32 + + if (IsURLSlash(relative_url[path.begin])) { + // Easy case: the path is an absolute path on the server, so we can + // just replace everything from the path on with the new versions. + // Since the input should be a canonical hierarchical URL, we should + // always have a path. + success &= CanonicalizePath(relative_url, path, + output, &out_parsed->path); + } else { + // Relative path, replace the query, and reference. We take the + // original path with the file part stripped, and append the new path. + // The canonicalizer will take care of resolving ".." and "." + int path_begin = output->length(); + CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(), + output); + success &= CanonicalizePartialPath(relative_url, path, path_begin, + output); + out_parsed->path = MakeRange(path_begin, output->length()); + + // Copy the rest of the stuff after the path from the relative path. + } + + // Finish with the query and reference part (these can't fail). + CanonicalizeQuery(relative_url, query, query_converter, + output, &out_parsed->query); + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + + // Fix the path beginning to add back the "C:" we may have written above. + out_parsed->path = MakeRange(true_path_begin, out_parsed->path.end()); + return success; + } + + // If we get here, the path is unchanged: copy to output.
+ CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path); + + if (query.is_valid()) { + // Just the query specified, replace the query and reference (ignore + // failures for refs) + CanonicalizeQuery(relative_url, query, query_converter, + output, &out_parsed->query); + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + return success; + } + + // If we get here, the query is unchanged: copy to output. Note that the + // range of the query parameter doesn't include the question mark, so we + // have to add it manually if there is a component. + if (base_parsed.query.is_valid()) + output->push_back('?'); + CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query); + + if (ref.is_valid()) { + // Just the reference specified: replace it (ignoring failures). + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + return success; + } + + // We should always have something to do in this function, the caller checks + // that some component is being replaced. + GURL_DCHECK(false) << "Not reached"; + return success; +} + +// Resolves a relative URL that contains a host. Typically, these will +// be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which +// should be kept from the original URL is the scheme. +template<typename CHAR> +bool DoResolveRelativeHost(const char* base_url, + const Parsed& base_parsed, + const CHAR* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + // Parse the relative URL, just like we would for anything following a + // scheme. + Parsed relative_parsed; // Everything but the scheme is valid. + ParseAfterScheme(relative_url, relative_component.end(), + relative_component.begin, &relative_parsed); + + // Now we can just use the replacement function to replace all the necessary + // parts of the old URL with the new one.
+ Replacements<CHAR> replacements; + replacements.SetUsername(relative_url, relative_parsed.username); + replacements.SetPassword(relative_url, relative_parsed.password); + replacements.SetHost(relative_url, relative_parsed.host); + replacements.SetPort(relative_url, relative_parsed.port); + replacements.SetPath(relative_url, relative_parsed.path); + replacements.SetQuery(relative_url, relative_parsed.query); + replacements.SetRef(relative_url, relative_parsed.ref); + + // Length() does not include the old scheme, so make sure to add it from the + // base URL. + output->ReserveSizeIfNeeded( + replacements.components().Length() + + base_parsed.CountCharactersBefore(Parsed::USERNAME, false)); + SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + if (!GetStandardSchemeType(base_url, base_parsed.scheme, &scheme_type)) { + // A path with an authority section gets canonicalized under standard URL + // rules, even though the base was not known to be standard. + scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + } + return ReplaceStandardURL(base_url, base_parsed, replacements, scheme_type, + query_converter, output, out_parsed); +} + +// Resolves a relative URL that happens to be an absolute file path. Examples +// include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo". +template<typename CHAR> +bool DoResolveAbsoluteFile(const CHAR* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + // Parse the file URL. The file URL parsing function uses the same logic + // as we do for determining if the file is absolute, in which case it will + // not bother to look for a scheme.
+ Parsed relative_parsed; + ParseFileURL(&relative_url[relative_component.begin], relative_component.len, + &relative_parsed); + + return CanonicalizeFileURL(&relative_url[relative_component.begin], + relative_component.len, relative_parsed, + query_converter, output, out_parsed); +} + +// TODO(brettw) treat two slashes as root like Mozilla for FTP? +template<typename CHAR> +bool DoResolveRelativeURL(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const CHAR* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + // |base_parsed| is the starting point for our output. Since we may have + // removed whitespace from |relative_url| before entering this method, we'll + // carry over the |potentially_dangling_markup| flag. + bool potentially_dangling_markup = out_parsed->potentially_dangling_markup; + *out_parsed = base_parsed; + if (potentially_dangling_markup) + out_parsed->potentially_dangling_markup = true; + + // Sanity check: the input should have a host or we'll break badly below. + // We can only resolve relative URLs with base URLs that have hosts and + // paths (even the default path of "/" is OK). + // + // We allow hosts with no length so we can handle file URLs, for example. + if (base_parsed.path.len <= 0) { + // On error, return the input (resolving a relative URL on a non-relative + // base = the base). + int base_len = base_parsed.Length(); + for (int i = 0; i < base_len; i++) + output->push_back(base_url[i]); + return false; + } + + if (relative_component.len <= 0) { + // Empty relative URL, leave unchanged, only removing the ref component.
+ int base_len = base_parsed.Length(); + base_len -= base_parsed.ref.len + 1; + out_parsed->ref.reset(); + output->Append(base_url, base_len); + return true; + } + + int num_slashes = CountConsecutiveSlashes( + relative_url, relative_component.begin, relative_component.end()); + +#ifdef WIN32 + // On Windows, two slashes for a file path (regardless of which direction + // they are) means that it's UNC. Two backslashes on any base scheme mean + // that it's an absolute UNC path (we use the base_is_file flag to control + // how strict the UNC finder is). + // + // We also allow Windows absolute drive specs on any scheme (for example + // "c:\foo") like IE does. There must be no preceding slashes in this + // case (we reject anything like "/c:/foo") because that should be treated + // as a path. For file URLs, we allow any number of slashes since that would + // be setting the path. + // + // This assumes the absolute path resolver handles absolute URLs like this + // properly. DoCanonicalize does this. + int after_slashes = relative_component.begin + num_slashes; + if (DoesBeginUNCPath(relative_url, relative_component.begin, + relative_component.end(), !base_is_file) || + ((num_slashes == 0 || base_is_file) && + DoesBeginWindowsDriveSpec( + relative_url, after_slashes, relative_component.end()))) { + return DoResolveAbsoluteFile(relative_url, relative_component, + query_converter, output, out_parsed); + } +#else + // Other platforms need explicit handling for file: URLs with multiple + // slashes because the generic scheme parsing always extracts a host, but a + // file: URL only has a host if it has exactly 2 slashes. Even if it does + // have a host, we want to use the special host detection logic for file + // URLs provided by DoResolveAbsoluteFile(), as opposed to the generic host + // detection logic, for consistency with parsing file URLs from scratch.
+ // This also handles the special case where the URL is only slashes, + // since that doesn't have a host part either. + if (base_is_file && + (num_slashes >= 2 || num_slashes == relative_component.len)) { + return DoResolveAbsoluteFile(relative_url, relative_component, + query_converter, output, out_parsed); + } +#endif + + // Any other double-slashes mean that this is relative to the scheme. + if (num_slashes >= 2) { + return DoResolveRelativeHost(base_url, base_parsed, + relative_url, relative_component, + query_converter, output, out_parsed); + } + + // When we get here, we know that the relative URL is on the same host. + return DoResolveRelativePath(base_url, base_parsed, base_is_file, + relative_url, relative_component, + query_converter, output, out_parsed); +} + +} // namespace + +bool IsRelativeURL(const char* base, + const Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + Component* relative_component) { + return DoIsRelativeURL<char>( + base, base_parsed, fragment, fragment_len, is_base_hierarchical, + is_relative, relative_component); +} + +bool IsRelativeURL(const char* base, + const Parsed& base_parsed, + const gurl_base::char16* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + Component* relative_component) { + return DoIsRelativeURL<gurl_base::char16>( + base, base_parsed, fragment, fragment_len, is_base_hierarchical, + is_relative, relative_component); +} + +bool ResolveRelativeURL(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + return DoResolveRelativeURL<char>( + base_url, base_parsed, base_is_file, relative_url, + relative_component, query_converter, output, out_parsed); +} + +bool ResolveRelativeURL(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, 
+ const gurl_base::char16* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + return DoResolveRelativeURL<gurl_base::char16>( + base_url, base_parsed, base_is_file, relative_url, + relative_component, query_converter, output, out_parsed); +} + +} // namespace url
diff --git a/url/url_canon_stdstring.cc b/url/url_canon_stdstring.cc new file mode 100644 index 0000000..c81a0a9 --- /dev/null +++ b/url/url_canon_stdstring.cc
@@ -0,0 +1,31 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_canon_stdstring.h" + +namespace url { + +StdStringCanonOutput::StdStringCanonOutput(std::string* str) + : CanonOutput(), str_(str) { + cur_len_ = static_cast<int>(str_->size()); // Append to existing data: writing starts after the current contents. + buffer_ = str_->empty() ? NULL : &(*str_)[0]; // No storage to point at while the string is empty. + buffer_len_ = static_cast<int>(str_->size()); // Usable capacity is the string's current size. +} + +StdStringCanonOutput::~StdStringCanonOutput() { + // Nothing to do, we don't own the string. +} + +void StdStringCanonOutput::Complete() { + str_->resize(cur_len_); // Make the string exactly as long as the data written so far. + buffer_len_ = cur_len_; +} + +void StdStringCanonOutput::Resize(int sz) { + str_->resize(sz); // Resizing may move the string's storage, so buffer_ is refreshed below. + buffer_ = str_->empty() ? NULL : &(*str_)[0]; + buffer_len_ = sz; +} + +} // namespace url
diff --git a/url/url_canon_stdstring.h b/url/url_canon_stdstring.h new file mode 100644 index 0000000..82ee9db --- /dev/null +++ b/url/url_canon_stdstring.h
@@ -0,0 +1,88 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_CANON_STDSTRING_H_ +#define URL_URL_CANON_STDSTRING_H_ + +// This header file defines a canonicalizer output method class for STL +// strings. Because the canonicalizer tries not to be dependent on the STL, +// we have segregated it here. + +#include <string> + +#include "base/compiler_specific.h" +#include "polyfills/base/component_export.h" +#include "base/macros.h" +#include "base/strings/string_piece.h" +#include "url/url_canon.h" + +namespace url { + +// Write into a std::string given in the constructor. This object does not own +// the string itself, and the user must ensure that the string stays alive +// throughout the lifetime of this object. +// +// The given string will be appended to; any existing data in the string will +// be preserved. +// +// Note that when canonicalization is complete, the string will likely have +// unused space at the end because we make the string very big to start out +// with (by |initial_size|). This ends up being important because resize +// operations are slow, and because the base class needs to write directly +// into the buffer. +// +// Therefore, the user should call Complete() before using the string that +// this class wrote into. +class COMPONENT_EXPORT(URL) StdStringCanonOutput : public CanonOutput { + public: + StdStringCanonOutput(std::string* str); // |str| is not owned and must outlive this object. + ~StdStringCanonOutput() override; + + // Must be called after writing has completed but before the string is used. + void Complete(); // Resizes the string to the length actually written. + + void Resize(int sz) override; // Resizes the string to |sz| and re-points the output buffer at it. + + protected: + std::string* str_; // Destination string; not owned. + DISALLOW_COPY_AND_ASSIGN(StdStringCanonOutput); +}; + +// An extension of the Replacements class that allows the setters to use +// StringPieces (implicitly allowing strings or char*s).
+// +// The contents of the StringPieces are not copied and must remain valid until +// the StringPieceReplacements object goes out of scope. +template<typename STR> +class StringPieceReplacements : public Replacements<typename STR::value_type> { + public: + void SetSchemeStr(const gurl_base::BasicStringPiece<STR>& s) { // Like every setter below: the whole piece is passed as one component starting at offset 0. + this->SetScheme(s.data(), Component(0, static_cast<int>(s.length()))); + } + void SetUsernameStr(const gurl_base::BasicStringPiece<STR>& s) { + this->SetUsername(s.data(), Component(0, static_cast<int>(s.length()))); + } + void SetPasswordStr(const gurl_base::BasicStringPiece<STR>& s) { + this->SetPassword(s.data(), Component(0, static_cast<int>(s.length()))); + } + void SetHostStr(const gurl_base::BasicStringPiece<STR>& s) { + this->SetHost(s.data(), Component(0, static_cast<int>(s.length()))); + } + void SetPortStr(const gurl_base::BasicStringPiece<STR>& s) { + this->SetPort(s.data(), Component(0, static_cast<int>(s.length()))); + } + void SetPathStr(const gurl_base::BasicStringPiece<STR>& s) { + this->SetPath(s.data(), Component(0, static_cast<int>(s.length()))); + } + void SetQueryStr(const gurl_base::BasicStringPiece<STR>& s) { + this->SetQuery(s.data(), Component(0, static_cast<int>(s.length()))); + } + void SetRefStr(const gurl_base::BasicStringPiece<STR>& s) { + this->SetRef(s.data(), Component(0, static_cast<int>(s.length()))); + } +}; + +} // namespace url + +#endif // URL_URL_CANON_STDSTRING_H_
diff --git a/url/url_canon_stdurl.cc b/url/url_canon_stdurl.cc new file mode 100644 index 0000000..78f7773 --- /dev/null +++ b/url/url_canon_stdurl.cc
@@ -0,0 +1,207 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Functions to canonicalize "standard" URLs, which are ones that have an +// authority section including a host name. + +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_constants.h" + +namespace url { + +namespace { + +template <typename CHAR, typename UCHAR> // UCHAR is not referenced in the body below. +bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source, + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + // Scheme: this will append the colon. + bool success = CanonicalizeScheme(source.scheme, parsed.scheme, + output, &new_parsed->scheme); + + bool scheme_supports_user_info = + (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION); + bool scheme_supports_ports = + (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION || + scheme_type == SCHEME_WITH_HOST_AND_PORT); + + // Authority (username, password, host, port) + bool have_authority; + if ((scheme_supports_user_info && + (parsed.username.is_valid() || parsed.password.is_valid())) || + parsed.host.is_nonempty() || + (scheme_supports_ports && parsed.port.is_valid())) { + have_authority = true; + + // Only write the authority separators when we have a scheme. + if (parsed.scheme.is_valid()) { + output->push_back('/'); + output->push_back('/'); + } + + // User info: the canonicalizer will handle the : and @. + if (scheme_supports_user_info) { + success &= CanonicalizeUserInfo( + source.username, parsed.username, source.password, parsed.password, + output, &new_parsed->username, &new_parsed->password); + } else { + new_parsed->username.reset(); + new_parsed->password.reset(); + } + + success &= CanonicalizeHost(source.host, parsed.host, + output, &new_parsed->host); + + // Host must not be empty for standard URLs.
+ if (!parsed.host.is_nonempty()) + success = false; + + // Port: the port canonicalizer will handle the colon. + if (scheme_supports_ports) { + int default_port = DefaultPortForScheme( + &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len); + success &= CanonicalizePort(source.port, parsed.port, default_port, + output, &new_parsed->port); + } else { + new_parsed->port.reset(); + } + } else { + // No authority, clear the components. + have_authority = false; + new_parsed->host.reset(); + new_parsed->username.reset(); + new_parsed->password.reset(); + new_parsed->port.reset(); + success = false; // Standard URLs must have an authority. + } + + // Path + if (parsed.path.is_valid()) { + success &= CanonicalizePath(source.path, parsed.path, + output, &new_parsed->path); + } else if (have_authority || + parsed.query.is_valid() || parsed.ref.is_valid()) { + // When we have an empty path, make up a path when we have an authority + // or something following the path. The only time we allow an empty + // output path is when there is nothing else. + new_parsed->path = Component(output->length(), 1); + output->push_back('/'); + } else { + // No path at all + new_parsed->path.reset(); + } + + // Query: the call's result does not feed into |success|. + CanonicalizeQuery(source.query, parsed.query, query_converter, + output, &new_parsed->query); + + // Ref: ignore failure for this, since the page can probably still be loaded. + CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +} // namespace + + +// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED +// if the scheme is unknown.
+int DefaultPortForScheme(const char* scheme, int scheme_len) { + int default_port = PORT_UNSPECIFIED; + switch (scheme_len) { // Only scheme constants listed under the matching length are compared. + case 4: + if (!strncmp(scheme, kHttpScheme, scheme_len)) + default_port = 80; + break; + case 5: + if (!strncmp(scheme, kHttpsScheme, scheme_len)) + default_port = 443; + break; + case 3: + if (!strncmp(scheme, kFtpScheme, scheme_len)) + default_port = 21; + else if (!strncmp(scheme, kWssScheme, scheme_len)) + default_port = 443; + break; + case 6: + if (!strncmp(scheme, kGopherScheme, scheme_len)) + default_port = 70; + break; + case 2: + if (!strncmp(scheme, kWsScheme, scheme_len)) + default_port = 80; + break; + } + return default_port; +} + +bool CanonicalizeStandardURL(const char* spec, + int spec_len, // Not read by this function. + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeStandardURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, scheme_type, query_converter, + output, new_parsed); +} + +bool CanonicalizeStandardURL(const gurl_base::char16* spec, + int spec_len, // Not read by this function. + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeStandardURL<gurl_base::char16, gurl_base::char16>( + URLComponentSource<gurl_base::char16>(spec), parsed, scheme_type, + query_converter, output, new_parsed); +} + +// It might be nice in the future to optimize this so unchanged components don't +// need to be recanonicalized. This is especially true since the common case for +// ReplaceComponents is removing things we don't want, like reference fragments +// and usernames. These cases can become more efficient if we can assume the +// rest of the URL is OK with these removed (or only the modified parts +// recanonicalized). This would be much more complex to implement, however.
+// +// You would also need to update DoReplaceComponents in url_util.cc which +// relies on this re-checking everything (see the comment there for why). +bool ReplaceStandardURL(const char* base, + const Parsed& base_parsed, + const Replacements<char>& replacements, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + URLComponentSource<char> source(base); + Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeStandardURL<char, unsigned char>( + source, parsed, scheme_type, query_converter, output, new_parsed); +} + +// For 16-bit replacements, we turn all the replacements into UTF-8 so the +// regular code path can be used. +bool ReplaceStandardURL(const char* base, + const Parsed& base_parsed, + const Replacements<gurl_base::char16>& replacements, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + RawCanonOutput<1024> utf8; // Scratch buffer handed to SetupUTF16OverrideComponents for the UTF-8 conversions. + URLComponentSource<char> source(base); + Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeStandardURL<char, unsigned char>( + source, parsed, scheme_type, query_converter, output, new_parsed); +} + +} // namespace url
diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc new file mode 100644 index 0000000..9d1a458 --- /dev/null +++ b/url/url_canon_unittest.cc
@@ -0,0 +1,2396 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <errno.h> +#include <stddef.h> + +#include "base/stl_util.h" +#include "base/strings/utf_string_conversions.h" +#include "base/test/gtest_util.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_canon_stdstring.h" +#include "url/url_test_utils.h" + +namespace url { + +namespace { + +struct ComponentCase { + const char* input; + const char* expected; + Component expected_component; + bool expected_success; +}; + +// ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests +// treat each input as optional, and will only try processing if non-NULL. +// The output is always 8-bit. +struct DualComponentCase { + const char* input8; + const wchar_t* input16; + const char* expected; + Component expected_component; + bool expected_success; +}; + +// Test cases for CanonicalizeIPAddress(). The inputs are identical to +// DualComponentCase, but the output has extra CanonHostInfo fields. +struct IPAddressCase { + const char* input8; + const wchar_t* input16; + const char* expected; + Component expected_component; + + // CanonHostInfo fields, for verbose output. + CanonHostInfo::Family expected_family; + int expected_num_ipv4_components; + const char* expected_address_hex; // Two hex chars per IP address byte. 
+}; + +std::string BytesToHexString(unsigned char bytes[16], int length) { + EXPECT_TRUE(length == 0 || length == 4 || length == 16) + << "Bad IP address length: " << length; + std::string result; + for (int i = 0; i < length; ++i) { + result.push_back(kHexCharLookup[(bytes[i] >> 4) & 0xf]); + result.push_back(kHexCharLookup[bytes[i] & 0xf]); + } + return result; +} + +struct ReplaceCase { + const char* base; + const char* scheme; + const char* username; + const char* password; + const char* host; + const char* port; + const char* path; + const char* query; + const char* ref; + const char* expected; +}; + +// Magic string used in the replacements code that tells SetupReplComp to +// call the clear function. +const char kDeleteComp[] = "|"; + +// Sets up a replacement for a single component. This is given pointers to +// the set and clear function for the component being replaced, and will +// either set the component (if it exists) or clear it (if the replacement +// string matches kDeleteComp). +// +// This template is currently used only for the 8-bit case, and the strlen +// causes it to fail in other cases. It is left a template in case we have +// tests for wide replacements. +template<typename CHAR> +void SetupReplComp( + void (Replacements<CHAR>::*set)(const CHAR*, const Component&), + void (Replacements<CHAR>::*clear)(), + Replacements<CHAR>* rep, + const CHAR* str) { + if (str && str[0] == kDeleteComp[0]) { + (rep->*clear)(); + } else if (str) { + (rep->*set)(str, Component(0, static_cast<int>(strlen(str)))); + } +} + +} // namespace + +TEST(URLCanonTest, DoAppendUTF8) { + struct UTF8Case { + unsigned input; + const char* output; + } utf_cases[] = { + // Valid code points. 
+ {0x24, "\x24"}, + {0xA2, "\xC2\xA2"}, + {0x20AC, "\xE2\x82\xAC"}, + {0x24B62, "\xF0\xA4\xAD\xA2"}, + {0x10FFFF, "\xF4\x8F\xBF\xBF"}, + }; + std::string out_str; + for (size_t i = 0; i < gurl_base::size(utf_cases); i++) { + out_str.clear(); + StdStringCanonOutput output(&out_str); + AppendUTF8Value(utf_cases[i].input, &output); + output.Complete(); + EXPECT_EQ(utf_cases[i].output, out_str); + } +} + +TEST(URLCanonTest, DoAppendUTF8Invalid) { + std::string out_str; + StdStringCanonOutput output(&out_str); + // Invalid code point (too large). + EXPECT_DCHECK_DEATH({ + AppendUTF8Value(0x110000, &output); + output.Complete(); + }); +} + +TEST(URLCanonTest, UTF) { + // Low-level test that we handle reading, canonicalization, and writing + // UTF-8/UTF-16 strings properly. + struct UTFCase { + const char* input8; + const wchar_t* input16; + bool expected_success; + const char* output; + } utf_cases[] = { + // Valid canonical input should get passed through & escaped. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"}, + // Test a character that takes > 16 bits (U+10300 = old italic letter A) + {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"}, + // Non-shortest-form UTF-8 characters are invalid. The bad bytes should + // each be replaced with the invalid character (EF BF BD in UTF-8). + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", NULL, false, + "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%E5%A5%BD"}, + // Invalid UTF-8 sequences should be marked as invalid (the first + // sequence is truncated). + {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"}, + // Character going off the end. + {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"}, + // ...same with low surrogates with no high surrogate. + {nullptr, L"\xdc00", false, "%EF%BF%BD"}, + // Test a UTF-8 encoded surrogate value is marked as invalid. + // ED A0 80 = U+D800 + {"\xed\xa0\x80", NULL, false, "%EF%BF%BD%EF%BF%BD%EF%BF%BD"}, + // ...even when paired.
+ {"\xed\xa0\x80\xed\xb0\x80", nullptr, false, + "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"}, + }; + + std::string out_str; + for (size_t i = 0; i < gurl_base::size(utf_cases); i++) { + if (utf_cases[i].input8) { + out_str.clear(); + StdStringCanonOutput output(&out_str); + + int input_len = static_cast<int>(strlen(utf_cases[i].input8)); + bool success = true; + for (int ch = 0; ch < input_len; ch++) { + success &= AppendUTF8EscapedChar(utf_cases[i].input8, &ch, input_len, + &output); + } + output.Complete(); + EXPECT_EQ(utf_cases[i].expected_success, success); + EXPECT_EQ(std::string(utf_cases[i].output), out_str); + } + if (utf_cases[i].input16) { + out_str.clear(); + StdStringCanonOutput output(&out_str); + + gurl_base::string16 input_str( + test_utils::TruncateWStringToUTF16(utf_cases[i].input16)); + int input_len = static_cast<int>(input_str.length()); + bool success = true; + for (int ch = 0; ch < input_len; ch++) { + success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len, + &output); + } + output.Complete(); + EXPECT_EQ(utf_cases[i].expected_success, success); + EXPECT_EQ(std::string(utf_cases[i].output), out_str); + } + + if (utf_cases[i].input8 && utf_cases[i].input16 && + utf_cases[i].expected_success) { + // Check that the UTF-8 and UTF-16 inputs are equivalent. + + // UTF-16 -> UTF-8 + std::string input8_str(utf_cases[i].input8); + gurl_base::string16 input16_str( + test_utils::TruncateWStringToUTF16(utf_cases[i].input16)); + EXPECT_EQ(input8_str, gurl_base::UTF16ToUTF8(input16_str)); + + // UTF-8 -> UTF-16 + EXPECT_EQ(input16_str, gurl_base::UTF8ToUTF16(input8_str)); + } + } +} + +TEST(URLCanonTest, Scheme) { + // Here, we're mostly testing that unusual characters are handled properly. + // The canonicalizer doesn't do any parsing or whitespace detection. It will + // also do its best on error, and will escape funny sequences (these won't be + // valid schemes and it will return error). 
+ // + // Note that the canonicalizer will append a colon to the output to separate + // out the rest of the URL, which is not present in the input. We check, + // however, that the output range includes everything but the colon. + ComponentCase scheme_cases[] = { + {"http", "http:", Component(0, 4), true}, + {"HTTP", "http:", Component(0, 4), true}, + {" HTTP ", "%20http%20:", Component(0, 10), false}, + {"htt: ", "htt%3A%20:", Component(0, 9), false}, + {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", Component(0, 22), false}, + // Don't re-escape something already escaped. Note that it will + // "canonicalize" the 'A' to 'a', but that's OK. + {"ht%3Atp", "ht%3atp:", Component(0, 7), false}, + {"", ":", Component(0, 0), false}, + }; + + std::string out_str; + + for (size_t i = 0; i < gurl_base::size(scheme_cases); i++) { + int url_len = static_cast<int>(strlen(scheme_cases[i].input)); + Component in_comp(0, url_len); + Component out_comp; + + out_str.clear(); + StdStringCanonOutput output1(&out_str); + bool success = CanonicalizeScheme(scheme_cases[i].input, in_comp, &output1, + &out_comp); + output1.Complete(); + + EXPECT_EQ(scheme_cases[i].expected_success, success); + EXPECT_EQ(std::string(scheme_cases[i].expected), out_str); + EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len); + + // Now try the wide version. 
+ out_str.clear(); + StdStringCanonOutput output2(&out_str); + + gurl_base::string16 wide_input(gurl_base::UTF8ToUTF16(scheme_cases[i].input)); + in_comp.len = static_cast<int>(wide_input.length()); + success = CanonicalizeScheme(wide_input.c_str(), in_comp, &output2, + &out_comp); + output2.Complete(); + + EXPECT_EQ(scheme_cases[i].expected_success, success); + EXPECT_EQ(std::string(scheme_cases[i].expected), out_str); + EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len); + } + + // Test the case where the scheme is declared nonexistent, it should be + // converted into an empty scheme. + Component out_comp; + out_str.clear(); + StdStringCanonOutput output(&out_str); + + EXPECT_FALSE(CanonicalizeScheme("", Component(0, -1), &output, &out_comp)); + output.Complete(); + + EXPECT_EQ(std::string(":"), out_str); + EXPECT_EQ(0, out_comp.begin); + EXPECT_EQ(0, out_comp.len); +} + +TEST(URLCanonTest, Host) { + IPAddressCase host_cases[] = { + // Basic canonicalization, uppercase should be converted to lowercase. + {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""}, + // Spaces and some other characters should be escaped. + {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", Component(0, 22), CanonHostInfo::NEUTRAL, -1, ""}, + // Exciting different types of spaces! + {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""}, + // Other types of space (no-break, zero-width, zero-width-no-break) are + // name-prepped away to nothing. + {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""}, + // Ideographic full stop (full-width period for Chinese, etc.) should be + // treated as a dot. 
+ {NULL, L"www.foo\x3002" L"bar.com", "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""}, + // Invalid unicode characters should fail... + // ...In wide input, ICU will barf and we'll end up with the input as + // escaped UTF-8 (the invalid character should be replaced with the + // replacement character). + {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""}, + // ...This is the same as previous but with with escaped. + {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""}, + // Test name prepping, fullwidth input should be converted to ASCII and NOT + // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16. + {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""}, + // Test that fullwidth escaped values are properly name-prepped, + // then converted or rejected. + // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input) + {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + // ...%00 in fullwidth should fail (also as escaped UTF-8 input) + {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + // ICU will convert weird percents into ASCII percents, but not unescape + // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a + // "small percent". At this point we should be within our rights to mark + // anything as invalid since the URL is corrupt or malicious. 
The code + // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped + // and kept as valid, so we validate that behavior here, but this level + // of fixing the input shouldn't be seen as required. "%81" is invalid. + {"\xef\xb9\xaa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + {"%ef%b9%aa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + {"\xef\xb9\xaa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + {"%ef%b9%aa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN + {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""}, + // See http://unicode.org/cldr/utility/idna.jsp for other + // examples/experiments and http://goo.gl/7yG11o + // for the full list of characters handled differently by + // IDNA 2003, UTS 46 (http://unicode.org/reports/tr46/ ) and IDNA 2008. + + // 4 Deviation characters are mapped/ignored in UTS 46 transitional + // mechanism. UTS 46, table 4 row (g). + // Sharp-s is mapped to 'ss' in UTS 46 and IDNA 2003. + // Otherwise, it'd be "xn--fuball-cta.de". + {"fu\xc3\x9f" "ball.de", L"fu\x00df" L"ball.de", "fussball.de", + Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""}, + // Final-sigma (U+03C2) is mapped to regular sigma (U+03C3). + // Otherwise, it'd be "xn--wxaijb9b". + {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2", + "xn--wxaikc6b", Component(0, 12), + CanonHostInfo::NEUTRAL, -1, ""}, + // ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional + // handling as well as in IDNA 2003.
+ {"a\xe2\x80\x8c" "b\xe2\x80\x8d" "c", L"a\x200c" L"b\x200d" L"c", "abc", + Component(0, 3), CanonHostInfo::NEUTRAL, -1, ""}, + // ZWJ between Devanagari characters is still mapped away in UTS 46 + // transitional handling. IDNA 2008 would give xn--11bo0mv54g. + {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c", + L"\x915\x94d\x200d\x91c", "xn--11bo0m", + Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""}, + // Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b) + // However, we do allow this at the moment because we don't use + // STD3 rules and canonicalize full-width ASCII to ASCII. + {"wow\xef\xbc\x81", L"wow\xff01", "wow%21", + Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""}, + // U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c) + // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2 + {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo", + Component(0, 11), CanonHostInfo::BROKEN, -1, ""}, + // U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d) + // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2 + {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn", + "%F0%AF%A1%A8%E5%A7%BB.cn", + Component(0, 24), CanonHostInfo::BROKEN, -1, ""}, + // Maps uppercase letters to lower case letters. UTS 46 table 4 row (e) + {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya", + Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""}, + // An already-IDNA host is not modified. + {"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya", + Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""}, + // Symbol/punctuations are allowed in IDNA 2003/UTS46. + // Not allowed in IDNA 2008. UTS 46 table 4 row (f). + {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us", + Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""}, + // U+11013 is new in Unicode 6.0 and is allowed. UTS 46 table 4, row (h) + // We used to allow it because we passed through unassigned code points. 
+ {"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com", + Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""}, + // U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i) + // Used to be allowed in IDNA 2003. + {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg", + Component(0, 9), CanonHostInfo::BROKEN, -1, ""}, + // U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based + // on Unicode 3.2). We did allow it in the past because we let unassigned + // code point pass. We continue to allow it even though it's a + // "punctuation and symbol" blocked in IDNA 2008. + // UTS 46 table 4, row (j) + {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com", + Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""}, + // Maps uppercase letters to lower case letters. + // In IDNA 2003, it's allowed without case-folding + // ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2 + // (added in Unicode 4.1). UTS 46 table 4 row (k) + {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com", + Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""}, + // Maps U+FF43 (Full Width Small Letter C) to 'c'. + {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz", + Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""}, + // Maps U+1D68C (Math Monospace Small C) to 'c'. + // U+1D68C = \xD835\xDE8C in UTF-16 + {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz", + Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""}, + // BiDi check test + // "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM. + // Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008. + {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8", + L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw", + Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""}, + // Disallowed in both IDNA 2003 and 2008 with BiDi check. + // Labels starting with a RTL character cannot end with a LTR character.
+ {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz", + "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21), + CanonHostInfo::BROKEN, -1, ""}, + // Labels starting with a RTL character can end with BC=EN (European + // number). Disallowed in IDNA 2003 but now allowed. + {"\xd8\xac\xd8\xa7\xd8\xb1" "2", L"\x62c\x627\x631" L"2", + "xn--2-ymcov", Component(0, 11), + CanonHostInfo::NEUTRAL, -1, ""}, + // Labels starting with a RTL character cannot have "L" characters + // even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008. + {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2", + "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21), + CanonHostInfo::BROKEN, -1, ""}, + // Labels starting with a RTL character can end with BC=AN (Arabic number) + // Disallowed in IDNA 2003, but now allowed. + {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662", + "xn--mgbjq0r", Component(0, 11), + CanonHostInfo::NEUTRAL, -1, ""}, + // Labels starting with a RTL character cannot have "L" characters + // even if it ends with an BC=AN (Arabic number). + // Disallowed in both IDNA 2003/2008. + {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662", + "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26), + CanonHostInfo::BROKEN, -1, ""}, + // Labels starting with a RTL character cannot mix BC=EN and BC=AN + {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662", + "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27), + CanonHostInfo::BROKEN, -1, ""}, + // As of Unicode 6.2, U+20CF is not assigned. We do not allow it. + {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com", + Component(0, 13), CanonHostInfo::BROKEN, -1, ""}, + // U+0080 is not allowed. + {"\xc2\x80.com", L"\x80.com", "%C2%80.com", + Component(0, 10), CanonHostInfo::BROKEN, -1, ""}, + // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped + // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped + // UTF-8 (wide case). 
The output should be equivalent to the true wide + // character input above). + {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd", + L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", + Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""}, + // Invalid escaped characters should fail and the percents should be + // escaped. + {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10), + CanonHostInfo::BROKEN, -1, ""}, + // If we get an invalid character that has been escaped. + {"%25", L"%25", "%25", Component(0, 3), + CanonHostInfo::BROKEN, -1, ""}, + {"hello%00", L"hello%00", "hello%00", Component(0, 8), + CanonHostInfo::BROKEN, -1, ""}, + // Escaped numbers should be treated like IP addresses if they are. + {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01", + "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, + "C0A80001"}, + {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e", + "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, + "C0A80001"}, + // Invalid escaping should trigger the regular host error handling. + {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", Component(0, 17), CanonHostInfo::BROKEN, -1, ""}, + // Something that isn't exactly an IP should get treated as a host and + // spaces escaped. + {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""}, + // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP. + // These are "0Xc0.0250.01" in fullwidth. + {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"}, + // Broken IP addresses get marked as such. 
+ {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13), CanonHostInfo::BROKEN, -1, ""}, + {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12), CanonHostInfo::BROKEN, -1, ""}, + // Cyrillic letter followed by '(' should return punycode for '(' escaped + // before punycode string was created. I.e. + // if '(' is escaped after punycode is created we would get xn--%28-8tb + // (incorrect). + {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", Component(0, 11), + CanonHostInfo::NEUTRAL, -1, ""}, + // Address with all hexadecimal characters with leading number of 1<<32 + // or greater and should return NEUTRAL rather than BROKEN if not all + // components are numbers. + {"12345678912345.de", L"12345678912345.de", "12345678912345.de", Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""}, + {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""}, + {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de", "12345678912345.12345678912345.de", Component(0, 32), CanonHostInfo::NEUTRAL, -1, ""}, + {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de", Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""}, + {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde", Component(0, 19), CanonHostInfo::BROKEN, -1, ""}, + // A label that starts with "xn--" but contains non-ASCII characters should + // be an error. Escape the invalid characters. + {"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen", Component(0, 16), CanonHostInfo::BROKEN, -1, ""}, + }; + + // CanonicalizeHost() non-verbose. + std::string out_str; + for (size_t i = 0; i < gurl_base::size(host_cases); i++) { + // Narrow version. 
+ if (host_cases[i].input8) { + int host_len = static_cast<int>(strlen(host_cases[i].input8)); + Component in_comp(0, host_len); + Component out_comp; + + out_str.clear(); + StdStringCanonOutput output(&out_str); + + bool success = CanonicalizeHost(host_cases[i].input8, in_comp, &output, + &out_comp); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN, + success) << "for input: " << host_cases[i].input8; + EXPECT_EQ(std::string(host_cases[i].expected), out_str) << + "for input: " << host_cases[i].input8; + EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin) << + "for input: " << host_cases[i].input8; + EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len) << + "for input: " << host_cases[i].input8; + } + + // Wide version. + if (host_cases[i].input16) { + gurl_base::string16 input16( + test_utils::TruncateWStringToUTF16(host_cases[i].input16)); + int host_len = static_cast<int>(input16.length()); + Component in_comp(0, host_len); + Component out_comp; + + out_str.clear(); + StdStringCanonOutput output(&out_str); + + bool success = CanonicalizeHost(input16.c_str(), in_comp, &output, + &out_comp); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN, + success); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len); + } + } + + // CanonicalizeHostVerbose() + for (size_t i = 0; i < gurl_base::size(host_cases); i++) { + // Narrow version. 
+ if (host_cases[i].input8) { + int host_len = static_cast<int>(strlen(host_cases[i].input8)); + Component in_comp(0, host_len); + + out_str.clear(); + StdStringCanonOutput output(&out_str); + CanonHostInfo host_info; + + CanonicalizeHostVerbose(host_cases[i].input8, in_comp, &output, + &host_info); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len); + EXPECT_EQ(std::string(host_cases[i].expected_address_hex), + BytesToHexString(host_info.address, host_info.AddressLength())); + if (host_cases[i].expected_family == CanonHostInfo::IPV4) { + EXPECT_EQ(host_cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } + + // Wide version. + if (host_cases[i].input16) { + gurl_base::string16 input16( + test_utils::TruncateWStringToUTF16(host_cases[i].input16)); + int host_len = static_cast<int>(input16.length()); + Component in_comp(0, host_len); + + out_str.clear(); + StdStringCanonOutput output(&out_str); + CanonHostInfo host_info; + + CanonicalizeHostVerbose(input16.c_str(), in_comp, &output, &host_info); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len); + EXPECT_EQ(std::string(host_cases[i].expected_address_hex), + BytesToHexString(host_info.address, host_info.AddressLength())); + if (host_cases[i].expected_family == CanonHostInfo::IPV4) { + EXPECT_EQ(host_cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } + } +} + +TEST(URLCanonTest, IPv4) { + IPAddressCase cases[] = { + // Empty is not an IP address. 
+ {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {".", L".", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Regular IP addresses in different bases. + {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"}, + {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"}, + {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"}, + // Non-IP addresses due to invalid characters. + {"192.168.9.com", L"192.168.9.com", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Invalid characters for the base should be rejected. + {"19a.168.0.1", L"19a.168.0.1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"0308.0250.00.01", L"0308.0250.00.01", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // If there are not enough components, the last one should fill them out. + {"192", L"192", "0.0.0.192", Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"}, + {"0xC0a80001", L"0xC0a80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"}, + {"030052000001", L"030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"}, + {"000030052000001", L"000030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"}, + {"192.168", L"192.168", "192.0.0.168", Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"}, + {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"}, + {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"}, + {"192.168.1", L"192.168.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"}, + // Too many components means not an IP address. 
+ {"192.168.0.0.1", L"192.168.0.0.1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // We allow a single trailing dot. + {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"}, + {"192.168.0.1. hello", L"192.168.0.1. hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"192.168.0.1..", L"192.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Two dots in a row means not an IP address. + {"192.168..1", L"192.168..1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Any numerical overflow should be marked as BROKEN. + {"0x100.0", L"0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0x100.0.0", L"0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0x100.0.0.0", L"0x100.0.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0x100.0.0", L"0.0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0.0x100.0", L"0.0.0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0.0.0x100", L"0.0.0.0x100", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0.0x10000", L"0.0.0x10000", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0x1000000", L"0.0x1000000", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0x100000000", L"0x100000000", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Repeat the previous tests, minus 1, to verify boundaries. 
+ {"0xFF.0", L"0xFF.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"}, + {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"}, + {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"}, + {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"}, + {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"}, + {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"}, + {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"}, + {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"}, + {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"}, + // Old truncation tests. They're all "BROKEN" now. + {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"192.015052000001", L"192.015052000001", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0X12C0a80001", L"0X12C0a80001", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"276.1.2", L"276.1.2", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Spaces should be rejected. + {"192.168.0.1 hello", L"192.168.0.1 hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Very large numbers. 
+ {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"}, + {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", Component(0, 11), CanonHostInfo::BROKEN, -1, ""}, + // A number has no length limit, but long numbers can still overflow. + {"00000000000000000001", L"00000000000000000001", "0.0.0.1", Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"}, + {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // If a long component is non-numeric, it's a hostname, *not* a broken IP. + {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Truncation of all zeros should still result in 0. + {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"}, + }; + + for (size_t i = 0; i < gurl_base::size(cases); i++) { + // 8-bit version. 
+ Component component(0, static_cast<int>(strlen(cases[i].input8))); + + std::string out_str1; + StdStringCanonOutput output1(&out_str1); + CanonHostInfo host_info; + CanonicalizeIPAddress(cases[i].input8, component, &output1, &host_info); + output1.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(cases[i].expected_address_hex), + BytesToHexString(host_info.address, host_info.AddressLength())); + if (host_info.family == CanonHostInfo::IPV4) { + EXPECT_STREQ(cases[i].expected, out_str1.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + EXPECT_EQ(cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + + // 16-bit version. + gurl_base::string16 input16( + test_utils::TruncateWStringToUTF16(cases[i].input16)); + component = Component(0, static_cast<int>(input16.length())); + + std::string out_str2; + StdStringCanonOutput output2(&out_str2); + CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info); + output2.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(cases[i].expected_address_hex), + BytesToHexString(host_info.address, host_info.AddressLength())); + if (host_info.family == CanonHostInfo::IPV4) { + EXPECT_STREQ(cases[i].expected, out_str2.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + EXPECT_EQ(cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } +} + +TEST(URLCanonTest, IPv6) { + IPAddressCase cases[] = { + // Empty is not an IP address. + {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Non-IPs with [:] characters are marked BROKEN. 
+ {":", L":", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[", L"[", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[:", L"[:", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"]", L"]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {":]", L":]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[]", L"[]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[:]", L"[:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Regular IP address is invalid without bounding '[' and ']'. + {"2001:db8::1", L"2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[2001:db8::1", L"[2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"2001:db8::1]", L"2001:db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Regular IP addresses. + {"[::]", L"[::]", "[::]", Component(0,4), CanonHostInfo::IPV6, -1, "00000000000000000000000000000000"}, + {"[::1]", L"[::1]", "[::1]", Component(0,5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000001"}, + {"[1::]", L"[1::]", "[1::]", Component(0,5), CanonHostInfo::IPV6, -1, "00010000000000000000000000000000"}, + + // Leading zeros should be stripped. + {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", Component(0,17), CanonHostInfo::IPV6, -1, "00000001000200030004000500060007"}, + + // Upper case letters should be lowercased. + {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", Component(0,20), CanonHostInfo::IPV6, -1, "000A000B000C00DE00FF0000000100AC"}, + + // The same address can be written with different contractions, but should + // get canonicalized to the same thing. + {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0,14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"}, + {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", Component(0,14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"}, + + // Addresses with embedded IPv4. 
+ {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", Component(0,10), CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"}, + {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"}, + {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000EEEEC0A80001"}, + {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]", Component(0, 14), CanonHostInfo::IPV6, -1, "200100000000000000000000C0A80001"}, + {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // IPv4 with last component missing. + {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0010002"}, + + // IPv4 using hex. + // TODO(eroman): Should this format be disallowed? + {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"}, + + // There may be zeros surrounding the "::" contraction. + {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", Component(0,5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"}, + + {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0,13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"}, + + // Can only have one "::" contraction in an IPv6 string literal. + {"[2001::db8::1]", L"[2001::db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // No more than 2 consecutive ':'s. + {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Non-IP addresses due to invalid characters. + {"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // If there are not enough components, the last one should fill them out. 
+ // ... omitted at this time ... + // Too many components means not an IP address. Similarly, with too few + // if using IPv4 compat or mapped addresses. + {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Too many bits (even though 8 components, the last one holds 32 bits). + {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // Too many bits specified -- the contraction would have to be zero-length + // to not exceed 128 bits. + {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // The contraction is for 16 bits of zero. + {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", Component(0,17), CanonHostInfo::IPV6, -1, "00010002000300040005000600000008"}, + + // Cannot have a trailing colon. + {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // Cannot have negative numbers. + {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // Scope ID -- the URL may contain an optional ["%" <scope_id>] section. + // The scope_id should be included in the canonicalized URL, and is an + // unsigned decimal number. + + // Invalid because no ID was given after the percent. 
+ + // Don't allow scope-id + {"[1::%1]", L"[1::%1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[1::%eth0]", L"[1::%eth0]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[1::%]", L"[1::%]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[%]", L"[%]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[::%:]", L"[::%:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // Don't allow leading or trailing colons. + {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // We allow a single trailing dot. + // ... omitted at this time ... + // Two dots in a row means not an IP address. + {"[::192.168..1]", L"[::192.168..1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Any non-first components get truncated to one byte. + // ... omitted at this time ... + // Spaces should be rejected. + {"[::1 hello]", L"[::1 hello]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + }; + + for (size_t i = 0; i < gurl_base::size(cases); i++) { + // 8-bit version. + Component component(0, static_cast<int>(strlen(cases[i].input8))); + + std::string out_str1; + StdStringCanonOutput output1(&out_str1); + CanonHostInfo host_info; + CanonicalizeIPAddress(cases[i].input8, component, &output1, &host_info); + output1.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(cases[i].expected_address_hex), + BytesToHexString(host_info.address, host_info.AddressLength())) << "iter " << i << " host " << cases[i].input8; + if (host_info.family == CanonHostInfo::IPV6) { + EXPECT_STREQ(cases[i].expected, out_str1.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + } + + // 16-bit version. 
+ gurl_base::string16 input16( + test_utils::TruncateWStringToUTF16(cases[i].input16)); + component = Component(0, static_cast<int>(input16.length())); + + std::string out_str2; + StdStringCanonOutput output2(&out_str2); + CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info); + output2.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(cases[i].expected_address_hex), + BytesToHexString(host_info.address, host_info.AddressLength())); + if (host_info.family == CanonHostInfo::IPV6) { + EXPECT_STREQ(cases[i].expected, out_str2.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + } + } +} + +TEST(URLCanonTest, IPEmpty) { + std::string out_str1; + StdStringCanonOutput output1(&out_str1); + CanonHostInfo host_info; + + // An empty (default or zero-length) component is never an IP address. + const char spec[] = "192.168.0.1"; + CanonicalizeIPAddress(spec, Component(), &output1, &host_info); + EXPECT_FALSE(host_info.IsIPAddress()); + + CanonicalizeIPAddress(spec, Component(0, 0), &output1, &host_info); + EXPECT_FALSE(host_info.IsIPAddress()); +} + +// Verifies that CanonicalizeHostSubstring produces the expected output and +// does not "fix" IP addresses. Because this code is a subset of +// CanonicalizeHost, the shared functionality is not tested. +TEST(URLCanonTest, CanonicalizeHostSubstring) { + // Basic sanity check. + { + std::string out_str; + StdStringCanonOutput output(&out_str); + EXPECT_TRUE(CanonicalizeHostSubstring("M\xc3\x9cNCHEN.com", + Component(0, 12), &output)); + output.Complete(); + EXPECT_EQ("xn--mnchen-3ya.com", out_str); + } + + // Failure case. 
+ { + std::string out_str; + StdStringCanonOutput output(&out_str); + EXPECT_FALSE(CanonicalizeHostSubstring( + test_utils::TruncateWStringToUTF16(L"\xfdd0zyx.com").c_str(), + Component(0, 8), &output)); + output.Complete(); + EXPECT_EQ("%EF%BF%BDzyx.com", out_str); + } + + // Should return true for empty input strings. + { + std::string out_str; + StdStringCanonOutput output(&out_str); + EXPECT_TRUE(CanonicalizeHostSubstring("", Component(0, 0), &output)); + output.Complete(); + EXPECT_EQ(std::string(), out_str); + } + + // Numbers that look like IP addresses should not be changed. + { + std::string out_str; + StdStringCanonOutput output(&out_str); + EXPECT_TRUE( + CanonicalizeHostSubstring("01.02.03.04", Component(0, 11), &output)); + output.Complete(); + EXPECT_EQ("01.02.03.04", out_str); + } +} + +TEST(URLCanonTest, UserInfo) { + // Note that the canonicalizer should escape and treat empty components as + // not being there. + + // We actually parse a full input URL so we can get the initial components. + struct UserComponentCase { + const char* input; + const char* expected; + Component expected_username; + Component expected_password; + bool expected_success; + } user_info_cases[] = { + {"http://user:pass@host.com/", "user:pass@", Component(0, 4), Component(5, 4), true}, + {"http://@host.com/", "", Component(0, -1), Component(0, -1), true}, + {"http://:@host.com/", "", Component(0, -1), Component(0, -1), true}, + {"http://foo:@host.com/", "foo@", Component(0, 3), Component(0, -1), true}, + {"http://:foo@host.com/", ":foo@", Component(0, 0), Component(1, 3), true}, + {"http://^ :$\t@host.com/", "%5E%20:$%09@", Component(0, 6), Component(7, 4), true}, + {"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true}, + {"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5), Component(6, 3), true }, + + // IE7 compatibility: old versions allowed backslashes in usernames, but + // IE7 does not. We disallow it as well. 
+ {"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1), Component(0, -1), true}, + }; + + for (size_t i = 0; i < gurl_base::size(user_info_cases); i++) { + int url_len = static_cast<int>(strlen(user_info_cases[i].input)); + Parsed parsed; + ParseStandardURL(user_info_cases[i].input, url_len, &parsed); + Component out_user, out_pass; + std::string out_str; + StdStringCanonOutput output1(&out_str); + + bool success = CanonicalizeUserInfo(user_info_cases[i].input, + parsed.username, + user_info_cases[i].input, + parsed.password, + &output1, + &out_user, + &out_pass); + output1.Complete(); + + EXPECT_EQ(user_info_cases[i].expected_success, success); + EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); + EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); + EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); + EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); + EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); + + // Now try the wide version + out_str.clear(); + StdStringCanonOutput output2(&out_str); + gurl_base::string16 wide_input(gurl_base::UTF8ToUTF16(user_info_cases[i].input)); + success = CanonicalizeUserInfo(wide_input.c_str(), + parsed.username, + wide_input.c_str(), + parsed.password, + &output2, + &out_user, + &out_pass); + output2.Complete(); + + EXPECT_EQ(user_info_cases[i].expected_success, success); + EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); + EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); + EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); + EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); + EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); + } +} + +TEST(URLCanonTest, Port) { + // We only need to test that the number gets properly put into the output + // buffer. The parser unit tests will test scanning the number correctly. 
+ // + // Note that the CanonicalizePort will always prepend a colon to the output + // to separate it from the colon that it assumes precedes it. + struct PortCase { + const char* input; + int default_port; + const char* expected; + Component expected_component; + bool expected_success; + } port_cases[] = { + // Invalid input should be copied w/ failure. + {"as df", 80, ":as%20df", Component(1, 7), false}, + {"-2", 80, ":-2", Component(1, 2), false}, + // Default port should be omitted. + {"80", 80, "", Component(0, -1), true}, + {"8080", 80, ":8080", Component(1, 4), true}, + // PORT_UNSPECIFIED should mean always keep the port. + {"80", PORT_UNSPECIFIED, ":80", Component(1, 2), true}, + }; + + for (size_t i = 0; i < gurl_base::size(port_cases); i++) { + int url_len = static_cast<int>(strlen(port_cases[i].input)); + Component in_comp(0, url_len); + Component out_comp; + std::string out_str; + StdStringCanonOutput output1(&out_str); + bool success = CanonicalizePort(port_cases[i].input, + in_comp, + port_cases[i].default_port, + &output1, + &out_comp); + output1.Complete(); + + EXPECT_EQ(port_cases[i].expected_success, success); + EXPECT_EQ(std::string(port_cases[i].expected), out_str); + EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); + + // Now try the wide version + out_str.clear(); + StdStringCanonOutput output2(&out_str); + gurl_base::string16 wide_input(gurl_base::UTF8ToUTF16(port_cases[i].input)); + success = CanonicalizePort(wide_input.c_str(), + in_comp, + port_cases[i].default_port, + &output2, + &out_comp); + output2.Complete(); + + EXPECT_EQ(port_cases[i].expected_success, success); + EXPECT_EQ(std::string(port_cases[i].expected), out_str); + EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); + } +} + +TEST(URLCanonTest, Path) { + DualComponentCase path_cases[] = { + // ----- path 
collapsing tests ----- + {"/././foo", L"/././foo", "/foo", Component(0, 4), true}, + {"/./.foo", L"/./.foo", "/.foo", Component(0, 5), true}, + {"/foo/.", L"/foo/.", "/foo/", Component(0, 5), true}, + {"/foo/./", L"/foo/./", "/foo/", Component(0, 5), true}, + // double dots followed by a slash or the end of the string count + {"/foo/bar/..", L"/foo/bar/..", "/foo/", Component(0, 5), true}, + {"/foo/bar/../", L"/foo/bar/../", "/foo/", Component(0, 5), true}, + // don't count double dots when they aren't followed by a slash + {"/foo/..bar", L"/foo/..bar", "/foo/..bar", Component(0, 10), true}, + // some in the middle + {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", Component(0, 8), true}, + {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a", Component(0, 2), true}, + // we should not be able to go above the root + {"/foo/../../..", L"/foo/../../..", "/", Component(0, 1), true}, + {"/foo/../../../ton", L"/foo/../../../ton", "/ton", Component(0, 4), true}, + // escaped dots should be unescaped and treated the same as dots + {"/foo/%2e", L"/foo/%2e", "/foo/", Component(0, 5), true}, + {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", Component(0, 8), true}, + {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar", "/..bar", Component(0, 6), true}, + // Multiple slashes in a row should be preserved and treated like empty + // directory names. + {"////../..", L"////../..", "//", Component(0, 2), true}, + + // ----- escaping tests ----- + {"/foo", L"/foo", "/foo", Component(0, 4), true}, + // Valid escape sequence + {"/%20foo", L"/%20foo", "/%20foo", Component(0, 7), true}, + // Invalid escape sequence we should pass through unchanged. + {"/foo%", L"/foo%", "/foo%", Component(0, 5), true}, + {"/foo%2", L"/foo%2", "/foo%2", Component(0, 6), true}, + // Invalid escape sequence: bad characters should be treated the same as + // the surrounding text, not as escaped (in this case, UTF-8). 
+ {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", Component(0, 10), true}, + {"/foo%2\xc2\xa9zbar", NULL, "/foo%2%C2%A9zbar", Component(0, 16), true}, + {NULL, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", Component(0, 22), true}, + // Regular characters that are escaped should be unescaped + {"/foo%41%7a", L"/foo%41%7a", "/fooAz", Component(0, 6), true}, + // Funny characters that are unescaped should be escaped + {"/foo\x09\x91%91", NULL, "/foo%09%91%91", Component(0, 13), true}, + {NULL, L"/foo\x09\x91%91", "/foo%09%C2%91%91", Component(0, 16), true}, + // Invalid characters that are escaped should cause a failure. + {"/foo%00%51", L"/foo%00%51", "/foo%00Q", Component(0, 8), false}, + // Some characters should be passed through unchanged regardless of esc. + {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", Component(0, 13), true}, + // Characters that are properly escaped should not have the case changed + // of hex letters. + {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", Component(0, 13), true}, + // Funny characters that are unescaped should be escaped + {"/foo\tbar", L"/foo\tbar", "/foo%09bar", Component(0, 10), true}, + // Backslashes should get converted to forward slashes + {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", Component(0, 8), true}, + // Hashes found in paths (possibly only when the caller explicitly sets + // the path on an already-parsed URL) should be escaped. + {"/foo#bar", L"/foo#bar", "/foo%23bar", Component(0, 10), true}, + // %7f should be allowed and %3D should not be unescaped (these were wrong + // in a previous version). + {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true}, + // @ should be passed through unchanged (escaped or unescaped). + {"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true}, + // Nested escape sequences should result in escaping the leading '%' if + // unescaping would result in a new escape sequence. 
+ {"/%A%42", L"/%A%42", "/%25AB", Component(0, 6), true}, + {"/%%41B", L"/%%41B", "/%25AB", Component(0, 6), true}, + {"/%%41%42", L"/%%41%42", "/%25AB", Component(0, 6), true}, + // Make sure truncated "nested" escapes don't result in reading off the + // string end. + {"/%%41", L"/%%41", "/%A", Component(0, 3), true}, + // Don't unescape the leading '%' if unescaping doesn't result in a valid + // new escape sequence. + {"/%%470", L"/%%470", "/%G0", Component(0, 4), true}, + {"/%%2D%41", L"/%%2D%41", "/%-A", Component(0, 4), true}, + // Don't erroneously downcast a UTF-16 character in a way that makes it + // look like part of an escape sequence. + {NULL, L"/%%41\x0130", "/%A%C4%B0", Component(0, 9), true}, + + // ----- encoding tests ----- + // Basic conversions + {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", Component(0, 37), true}, + // Invalid unicode characters should fail. We only do validation on + // UTF-16 input, so this doesn't happen on 8-bit.
+ {"/\xef\xb7\x90zyx", NULL, "/%EF%B7%90zyx", Component(0, 13), true}, + {NULL, L"/\xfdd0zyx", "/%EF%BF%BDzyx", Component(0, 13), false}, + }; + + for (size_t i = 0; i < gurl_base::size(path_cases); i++) { + if (path_cases[i].input8) { + int len = static_cast<int>(strlen(path_cases[i].input8)); + Component in_comp(0, len); + Component out_comp; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = + CanonicalizePath(path_cases[i].input8, in_comp, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(path_cases[i].expected_success, success); + EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(path_cases[i].expected, out_str); + } + + if (path_cases[i].input16) { + gurl_base::string16 input16( + test_utils::TruncateWStringToUTF16(path_cases[i].input16)); + int len = static_cast<int>(input16.length()); + Component in_comp(0, len); + Component out_comp; + std::string out_str; + StdStringCanonOutput output(&out_str); + + bool success = + CanonicalizePath(input16.c_str(), in_comp, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(path_cases[i].expected_success, success); + EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(path_cases[i].expected, out_str); + } + } + + // Manual test: embedded NULLs should be escaped and the URL should be marked + // as invalid. + const char path_with_null[] = "/ab\0c"; + Component in_comp(0, 5); + Component out_comp; + + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizePath(path_with_null, in_comp, &output, &out_comp); + output.Complete(); + EXPECT_FALSE(success); + EXPECT_EQ("/ab%00c", out_str); +} + +TEST(URLCanonTest, Query) { + struct QueryCase { + const char* input8; + const wchar_t* input16; + const char* expected; + } query_cases[] = { + // Regular ASCII case. 
+ {"foo=bar", L"foo=bar", "?foo=bar"}, + // Allow question marks in the query without escaping + {"as?df", L"as?df", "?as?df"}, + // Always escape '#' since it would mark the ref. + {"as#df", L"as#df", "?as%23df"}, + // Escape some questionable 8-bit characters, but never unescape. + {"\x02hello\x7f bye", L"\x02hello\x7f bye", "?%02hello%7F%20bye"}, + {"%40%41123", L"%40%41123", "?%40%41123"}, + // Chinese input/output + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "?q=%E4%BD%A0%E5%A5%BD"}, + // Invalid UTF-8/16 input should be replaced with invalid characters. + {"q=\xed\xed", L"q=\xd800\xd800", "?q=%EF%BF%BD%EF%BF%BD"}, + // Don't allow < or > because sometimes they are used for XSS if the + // URL is echoed in content. Firefox does this, IE doesn't. + {"q=<asdf>", L"q=<asdf>", "?q=%3Casdf%3E"}, + // Escape double quotemarks in the query. + {"q=\"asdf\"", L"q=\"asdf\"", "?q=%22asdf%22"}, + }; + + for (size_t i = 0; i < gurl_base::size(query_cases); i++) { + Component out_comp; + + if (query_cases[i].input8) { + int len = static_cast<int>(strlen(query_cases[i].input8)); + Component in_comp(0, len); + std::string out_str; + + StdStringCanonOutput output(&out_str); + CanonicalizeQuery(query_cases[i].input8, in_comp, NULL, &output, + &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + + if (query_cases[i].input16) { + gurl_base::string16 input16( + test_utils::TruncateWStringToUTF16(query_cases[i].input16)); + int len = static_cast<int>(input16.length()); + Component in_comp(0, len); + std::string out_str; + + StdStringCanonOutput output(&out_str); + CanonicalizeQuery(input16.c_str(), in_comp, NULL, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + } + + // Extra test for input with embedded NULL; + std::string out_str; + StdStringCanonOutput output(&out_str); + Component out_comp; + CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp); + output.Complete(); 
+ EXPECT_EQ("?a%20%00z%01", out_str); +} + +TEST(URLCanonTest, Ref) { + // Refs are trivial, it just checks the encoding. + DualComponentCase ref_cases[] = { + {"hello!", L"hello!", "#hello!", Component(1, 6), true}, + // We should escape spaces, double-quotes, angled braces, and backticks. + {"hello, world", L"hello, world", "#hello,%20world", Component(1, 14), + true}, + {"hello,\"world", L"hello,\"world", "#hello,%22world", Component(1, 14), + true}, + {"hello,<world", L"hello,<world", "#hello,%3Cworld", Component(1, 14), + true}, + {"hello,>world", L"hello,>world", "#hello,%3Eworld", Component(1, 14), + true}, + {"hello,`world", L"hello,`world", "#hello,%60world", Component(1, 14), + true}, + // UTF-8/wide input should be preserved + {"\xc2\xa9", L"\xa9", "#%C2%A9", Component(1, 6), true}, + // Test a character that takes > 16 bits (U+10300 = old italic letter A) + {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#%F0%90%8C%80ss", + Component(1, 14), true}, + // Escaping should be preserved unchanged, even invalid ones + {"%41%a", L"%41%a", "#%41%a", Component(1, 5), true}, + // Invalid UTF-8/16 input should be flagged and the input made valid + {"\xc2", NULL, "#%EF%BF%BD", Component(1, 9), true}, + {NULL, L"\xd800\x597d", "#%EF%BF%BD%E5%A5%BD", Component(1, 18), true}, + // Test a Unicode invalid character. + {"a\xef\xb7\x90", L"a\xfdd0", "#a%EF%BF%BD", Component(1, 10), true}, + // Refs can have # signs and we should preserve them.
+ {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", Component(1, 9), true}, + {"#asdf", L"#asdf", "##asdf", Component(1, 5), true}, + }; + + for (size_t i = 0; i < gurl_base::size(ref_cases); i++) { + // 8-bit input + if (ref_cases[i].input8) { + int len = static_cast<int>(strlen(ref_cases[i].input8)); + Component in_comp(0, len); + Component out_comp; + + std::string out_str; + StdStringCanonOutput output(&out_str); + CanonicalizeRef(ref_cases[i].input8, in_comp, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(ref_cases[i].expected, out_str); + } + + // 16-bit input + if (ref_cases[i].input16) { + gurl_base::string16 input16( + test_utils::TruncateWStringToUTF16(ref_cases[i].input16)); + int len = static_cast<int>(input16.length()); + Component in_comp(0, len); + Component out_comp; + + std::string out_str; + StdStringCanonOutput output(&out_str); + CanonicalizeRef(input16.c_str(), in_comp, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(ref_cases[i].expected, out_str); + } + } + + // Try one with an embedded NULL. It should be stripped. + const char null_input[5] = "ab\x00z"; + Component null_input_component(0, 4); + Component out_comp; + + std::string out_str; + StdStringCanonOutput output(&out_str); + CanonicalizeRef(null_input, null_input_component, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(1, out_comp.begin); + EXPECT_EQ(3, out_comp.len); + EXPECT_EQ("#abz", out_str); +} + +TEST(URLCanonTest, CanonicalizeStandardURL) { + // The individual component canonicalize tests should have caught the cases + // for each of those components. Here, we just need to test that the various + // parts are included or excluded properly, and have the correct separators. 
+ struct URLCase { + const char* input; + const char* expected; + bool expected_success; + } cases[] = { + {"http://www.google.com/foo?bar=baz#", + "http://www.google.com/foo?bar=baz#", true}, + {"http://[www.google.com]/", "http://[www.google.com]/", false}, + {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#", + false}, + {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo", + true}, + {"www.google.com", ":www.google.com/", false}, + {"http://192.0x00A80001", "http://192.168.0.1/", true}, + {"http://www/foo%2Ehtml", "http://www/foo.html", true}, + {"http://user:pass@/", "http://user:pass@/", false}, + {"http://%25DOMAIN:foobar@foodomain.com/", + "http://%25DOMAIN:foobar@foodomain.com/", true}, + + // Backslashes should get converted to forward slashes. + {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true}, + + // Busted refs shouldn't make the whole thing fail. + {"http://www.google.com/asdf#\xc2", + "http://www.google.com/asdf#%EF%BF%BD", true}, + + // Basic port tests. + {"http://foo:80/", "http://foo/", true}, + {"http://foo:81/", "http://foo:81/", true}, + {"httpa://foo:80/", "httpa://foo:80/", true}, + {"http://foo:-80/", "http://foo:-80/", false}, + + {"https://foo:443/", "https://foo/", true}, + {"https://foo:80/", "https://foo:80/", true}, + {"ftp://foo:21/", "ftp://foo/", true}, + {"ftp://foo:80/", "ftp://foo:80/", true}, + {"gopher://foo:70/", "gopher://foo/", true}, + {"gopher://foo:443/", "gopher://foo:443/", true}, + {"ws://foo:80/", "ws://foo/", true}, + {"ws://foo:81/", "ws://foo:81/", true}, + {"ws://foo:443/", "ws://foo:443/", true}, + {"ws://foo:815/", "ws://foo:815/", true}, + {"wss://foo:80/", "wss://foo:80/", true}, + {"wss://foo:81/", "wss://foo:81/", true}, + {"wss://foo:443/", "wss://foo/", true}, + {"wss://foo:815/", "wss://foo:815/", true}, + + // This particular code path ends up "backing up" to replace an invalid + // host ICU generated with an escaped version. 
Test that in the context + // of a full URL to make sure the backing up doesn't mess up the non-host + // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that + // ICU will convert to an ASCII one, generating "%81". + {"ws:)W\x1eW\xef\xb9\xaa" + "81:80/", + "ws://%29w%1ew%81/", false}, + }; + + for (size_t i = 0; i < gurl_base::size(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + Parsed parsed; + ParseStandardURL(cases[i].input, url_len, &parsed); + + Parsed out_parsed; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizeStandardURL( + cases[i].input, url_len, parsed, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + } +} + +// The codepath here is the same as for regular canonicalization, so we just +// need to test that things are replaced or not correctly. +TEST(URLCanonTest, ReplaceStandardURL) { + ReplaceCase replace_cases[] = { + // Common case of truncating the path. + {"http://www.google.com/foo?bar=baz#ref", NULL, NULL, NULL, NULL, NULL, "/", kDeleteComp, kDeleteComp, "http://www.google.com/"}, + // Replace everything + {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"}, + // Replace nothing + {"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"}, + // Replace scheme with filesystem. The result is garbage, but you asked + // for it. 
+ {"http://a:b@google.com:22/foo?baz@cat", "filesystem", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem://a:b@google.com:22/foo?baz@cat"}, + }; + + for (size_t i = 0; i < gurl_base::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + Parsed parsed; + ParseStandardURL(cur.base, base_len, &parsed); + + Replacements<char> r; + typedef Replacements<char> R; // Clean up syntax. + + // Note that for the scheme we pass in a different clear function since + // there is no function to clear the scheme. + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplaceStandardURL(replace_cases[i].base, parsed, r, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL, + &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } + + // The path pointer should be ignored if the address is invalid. + { + const char src[] = "http://www.google.com/here_is_the_path"; + int src_len = static_cast<int>(strlen(src)); + + Parsed parsed; + ParseStandardURL(src, src_len, &parsed); + + // Replace the path to 0 length string. By using 1 as the string address, + // the test should get an access violation if it tries to dereference it. 
+ Replacements<char> r; + r.SetPath(reinterpret_cast<char*>(0x00000001), Component(0, 0)); + std::string out_str1; + StdStringCanonOutput output1(&out_str1); + Parsed new_parsed; + ReplaceStandardURL(src, parsed, r, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL, + &output1, &new_parsed); + output1.Complete(); + EXPECT_STREQ("http://www.google.com/", out_str1.c_str()); + + // Same with an "invalid" path. + r.SetPath(reinterpret_cast<char*>(0x00000001), Component()); + std::string out_str2; + StdStringCanonOutput output2(&out_str2); + ReplaceStandardURL(src, parsed, r, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL, + &output2, &new_parsed); + output2.Complete(); + EXPECT_STREQ("http://www.google.com/", out_str2.c_str()); + } +} + +TEST(URLCanonTest, ReplaceFileURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"file:///C:/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"}, + // Replace nothing + {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"}, + // Clear non-path components (common) + {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///C:/gaba"}, + // Replace path with something that doesn't begin with a slash and make + // sure it gets added properly. + {"file:///C:/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"}, + {"file:///home/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"}, + {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///home/gaba?query#ref"}, + {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///home/gaba"}, + {"file:///home/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"}, + // Replace scheme -- shouldn't do anything. 
+ {"file:///C:/gaba?query#ref", "http", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"}, + }; + + for (size_t i = 0; i < gurl_base::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + Parsed parsed; + ParseFileURL(cur.base, base_len, &parsed); + + Replacements<char> r; + typedef Replacements<char> R; // Clean up syntax. + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplaceFileURL(cur.base, parsed, r, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplaceFileSystemURL) { + ReplaceCase replace_cases[] = { + // Replace everything in the outer URL. + {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, + NULL, "/foo", "b", "c", "filesystem:file:///temporary/foo?b#c"}, + // Replace nothing + {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, "filesystem:file:///temporary/gaba?query#ref"}, + // Clear non-path components (common) + {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, + NULL, NULL, kDeleteComp, kDeleteComp, + "filesystem:file:///temporary/gaba"}, + // Replace path with something that doesn't begin with a slash and make + // sure it gets added properly. 
+ {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, + NULL, "interesting/", NULL, NULL, + "filesystem:file:///temporary/interesting/?query#ref"}, + // Replace scheme -- shouldn't do anything except canonicalize. + {"filesystem:http://u:p@bar.com/t/gaba?query#ref", "http", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, + "filesystem:http://bar.com/t/gaba?query#ref"}, + // Replace username -- shouldn't do anything except canonicalize. + {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, "u2", NULL, NULL, + NULL, NULL, NULL, NULL, "filesystem:http://bar.com/t/gaba?query#ref"}, + // Replace password -- shouldn't do anything except canonicalize. + {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, NULL, "pw2", + NULL, NULL, NULL, NULL, NULL, + "filesystem:http://bar.com/t/gaba?query#ref"}, + // Replace host -- shouldn't do anything except canonicalize. + {"filesystem:http://u:p@bar.com:80/t/gaba?query#ref", NULL, NULL, NULL, + "foo.com", NULL, NULL, NULL, NULL, + "filesystem:http://bar.com/t/gaba?query#ref"}, + // Replace port -- shouldn't do anything except canonicalize. + {"filesystem:http://u:p@bar.com:40/t/gaba?query#ref", NULL, NULL, NULL, + NULL, "41", NULL, NULL, NULL, + "filesystem:http://bar.com:40/t/gaba?query#ref"}, + }; + + for (size_t i = 0; i < gurl_base::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + Parsed parsed; + ParseFileSystemURL(cur.base, base_len, &parsed); + + Replacements<char> r; + typedef Replacements<char> R; // Clean up syntax. 
+ SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplaceFileSystemURL(cur.base, parsed, r, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplacePathURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"data:foo", "javascript", NULL, NULL, NULL, NULL, "alert('foo?');", NULL, NULL, "javascript:alert('foo?');"}, + // Replace nothing + {"data:foo", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "data:foo"}, + // Replace one or the other + {"data:foo", "javascript", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "javascript:foo"}, + {"data:foo", NULL, NULL, NULL, NULL, NULL, "bar", NULL, NULL, "data:bar"}, + {"data:foo", NULL, NULL, NULL, NULL, NULL, kDeleteComp, NULL, NULL, "data:"}, + }; + + for (size_t i = 0; i < gurl_base::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + Parsed parsed; + ParsePathURL(cur.base, base_len, false, &parsed); + + Replacements<char> r; + typedef Replacements<char> R; // Clean up syntax. 
+ SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplacePathURL(cur.base, parsed, r, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplaceMailtoURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"mailto:jon@foo.com?body=sup", "mailto", NULL, NULL, NULL, NULL, "addr1", "to=tony", NULL, "mailto:addr1?to=tony"}, + // Replace nothing + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mailto:jon@foo.com?body=sup"}, + // Replace the path + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", NULL, NULL, "mailto:jason?body=sup"}, + // Replace the query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "custom=1", NULL, "mailto:jon@foo.com?custom=1"}, + // Replace the path and query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", "custom=1", NULL, "mailto:jason?custom=1"}, + // Set the query to empty (should leave trailing question mark) + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "", NULL, "mailto:jon@foo.com?"}, + // Clear the query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "|", NULL, "mailto:jon@foo.com"}, + // Clear the path + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "|", NULL, NULL, "mailto:?body=sup"}, + // Clear the path + query + {"mailto:", NULL, NULL, NULL, NULL, NULL, "|", 
"|", NULL, "mailto:"}, + // Setting the ref should have no effect + {"mailto:addr1", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "BLAH", "mailto:addr1"}, + }; + + for (size_t i = 0; i < gurl_base::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + Parsed parsed; + ParseMailtoURL(cur.base, base_len, &parsed); + + Replacements<char> r; + typedef Replacements<char> R; + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplaceMailtoURL(cur.base, parsed, r, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, CanonicalizeFileURL) { + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + Component expected_host; + Component expected_path; + } cases[] = { +#ifdef _WIN32 + // Windows-style paths + {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, Component(), + Component(7, 16)}, + {" File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true, + Component(), Component(7, 19)}, + {"file:", "file:///", true, Component(), Component(7, 1)}, + {"file:UNChost/path", "file://unchost/path", true, Component(7, 7), + Component(14, 5)}, + // CanonicalizeFileURL supports absolute Windows style paths for IE + // compatibility. Note that the caller must decide that this is a file + // URL itself so it can call the file canonicalizer. 
This is usually + // done automatically as part of relative URL resolving. + {"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(), + Component(7, 11)}, + {"C|/foo/bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)}, + {"/C|\\foo\\bar", "file:///C:/foo/bar", true, Component(), + Component(7, 11)}, + {"//C|/foo/bar", "file:///C:/foo/bar", true, Component(), + Component(7, 11)}, + {"//server/file", "file://server/file", true, Component(7, 6), + Component(13, 5)}, + {"\\\\server\\file", "file://server/file", true, Component(7, 6), + Component(13, 5)}, + {"/\\server/file", "file://server/file", true, Component(7, 6), + Component(13, 5)}, + // We should preserve the number of slashes after the colon for IE + // compatibility, except when there is none, in which case we should + // add one. + {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(), + Component(7, 16)}, + {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, + Component(), Component(7, 19)}, + // Three slashes should be non-UNC, even if there is no drive spec (IE + // does this, which makes the resulting request invalid). + {"file:///foo/bar.txt", "file:///foo/bar.txt", true, Component(), + Component(7, 12)}, + // TODO(brettw) we should probably fail for invalid host names, which + // would change the expected result on this test. We also currently allow + // colon even though it's probably invalid, because it's currently the + // "natural" result of the way the canonicalizer is written. There doesn't + // seem to be a strong argument for why allowing it here would be bad, so + // we just tolerate it and the load will fail later.
+ {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false, + Component(7, 2), Component(9, 16)}, + {"file:filer/home\\me", "file://filer/home/me", true, Component(7, 5), + Component(12, 8)}, + // Make sure relative paths can't go above the "C:" + {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true, + Component(), Component(7, 12)}, + // Busted refs shouldn't make the whole thing fail. + {"file:///C:/asdf#\xc2", "file:///C:/asdf#%EF%BF%BD", true, Component(), + Component(7, 8)}, +#else + // Unix-style paths + {"file:///home/me", "file:///home/me", true, Component(), Component(7, 8)}, + // Windowsy ones should get still treated as Unix-style. + {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, Component(), Component(7, 16)}, + {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, Component(), Component(7, 19)}, + // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html) + {"//", "file:///", true, Component(), Component(7, 1)}, + {"///", "file:///", true, Component(), Component(7, 1)}, + {"///test", "file:///test", true, Component(), Component(7, 5)}, + {"file://test", "file://test/", true, Component(7, 4), Component(11, 1)}, + {"file://localhost", "file://localhost/", true, Component(7, 9), Component(16, 1)}, + {"file://localhost/", "file://localhost/", true, Component(7, 9), Component(16, 1)}, + {"file://localhost/test", "file://localhost/test", true, Component(7, 9), Component(16, 5)}, +#endif // _WIN32 + }; + + for (size_t i = 0; i < gurl_base::size(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + Parsed parsed; + ParseFileURL(cases[i].input, url_len, &parsed); + + Parsed out_parsed; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizeFileURL(cases[i].input, url_len, parsed, NULL, + &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the 
spec was properly identified, the file canonicalizer has + // different code for writing the spec. + EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(4, out_parsed.scheme.len); + + EXPECT_EQ(cases[i].expected_host.begin, out_parsed.host.begin); + EXPECT_EQ(cases[i].expected_host.len, out_parsed.host.len); + + EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); + EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); + } +} + +TEST(URLCanonTest, CanonicalizeFileSystemURL) { + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + } cases[] = { + {"Filesystem:htTp://www.Foo.com:80/tempoRary", "filesystem:http://www.foo.com/tempoRary/", true}, + {"filesystem:httpS://www.foo.com/temporary/", "filesystem:https://www.foo.com/temporary/", true}, + {"filesystem:http://www.foo.com//", "filesystem:http://www.foo.com//", false}, + {"filesystem:http://www.foo.com/persistent/bob?query#ref", "filesystem:http://www.foo.com/persistent/bob?query#ref", true}, + {"filesystem:fIle://\\temporary/", "filesystem:file:///temporary/", true}, + {"filesystem:fiLe:///temporary", "filesystem:file:///temporary/", true}, + {"filesystem:File:///temporary/Bob?qUery#reF", "filesystem:file:///temporary/Bob?qUery#reF", true}, + }; + + for (size_t i = 0; i < gurl_base::size(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + Parsed parsed; + ParseFileSystemURL(cases[i].input, url_len, &parsed); + + Parsed out_parsed; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizeFileSystemURL(cases[i].input, url_len, parsed, + NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the spec was properly identified, the filesystem canonicalizer + // has different code for writing the spec. 
+ EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(10, out_parsed.scheme.len); + if (success) + EXPECT_GT(out_parsed.path.len, 0); + } +} + +TEST(URLCanonTest, CanonicalizePathURL) { + // Path URLs should get canonicalized schemes but nothing else. + struct PathCase { + const char* input; + const char* expected; + } path_cases[] = { + {"javascript:", "javascript:"}, + {"JavaScript:Foo", "javascript:Foo"}, + {"Foo:\":This /is interesting;?#", "foo:\":This /is interesting;?#"}, + + // Validation errors should not cause failure. See + // https://crbug.com/925614. + {"javascript:\uFFFF", "javascript:%EF%BF%BD"}, + }; + + for (size_t i = 0; i < gurl_base::size(path_cases); i++) { + int url_len = static_cast<int>(strlen(path_cases[i].input)); + Parsed parsed; + ParsePathURL(path_cases[i].input, url_len, true, &parsed); + + Parsed out_parsed; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizePathURL(path_cases[i].input, url_len, parsed, + &output, &out_parsed); + output.Complete(); + + EXPECT_TRUE(success); + EXPECT_EQ(path_cases[i].expected, out_str); + + EXPECT_EQ(0, out_parsed.host.begin); + EXPECT_EQ(-1, out_parsed.host.len); + + // When we end with a colon at the end, there should be no path. + if (path_cases[i].input[url_len - 1] == ':') { + EXPECT_EQ(0, out_parsed.GetContent().begin); + EXPECT_EQ(-1, out_parsed.GetContent().len); + } + } +} + +TEST(URLCanonTest, CanonicalizeMailtoURL) { + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + Component expected_path; + Component expected_query; + } cases[] = { + // Null character should be escaped to %00. + // Keep this test first in the list as it is handled specially below. 
+ {"mailto:addr1\0addr2?foo", + "mailto:addr1%00addr2?foo", + true, Component(7, 13), Component(21, 3)}, + {"mailto:addr1", + "mailto:addr1", + true, Component(7, 5), Component()}, + {"mailto:addr1@foo.com", + "mailto:addr1@foo.com", + true, Component(7, 13), Component()}, + // Trailing whitespace is stripped. + {"MaIlTo:addr1 \t ", + "mailto:addr1", + true, Component(7, 5), Component()}, + {"MaIlTo:addr1?to=jon", + "mailto:addr1?to=jon", + true, Component(7, 5), Component(13,6)}, + {"mailto:addr1,addr2", + "mailto:addr1,addr2", + true, Component(7, 11), Component()}, + // Embedded spaces must be encoded. + {"mailto:addr1, addr2", + "mailto:addr1,%20addr2", + true, Component(7, 14), Component()}, + {"mailto:addr1, addr2?subject=one two ", + "mailto:addr1,%20addr2?subject=one%20two", + true, Component(7, 14), Component(22, 17)}, + {"mailto:addr1%2caddr2", + "mailto:addr1%2caddr2", + true, Component(7, 13), Component()}, + {"mailto:\xF0\x90\x8C\x80", + "mailto:%F0%90%8C%80", + true, Component(7, 12), Component()}, + // Invalid -- UTF-8 encoded surrogate value. + {"mailto:\xed\xa0\x80", + "mailto:%EF%BF%BD%EF%BF%BD%EF%BF%BD", + false, Component(7, 27), Component()}, + {"mailto:addr1?", + "mailto:addr1?", + true, Component(7, 5), Component(13, 0)}, + // Certain characters have special meanings and must be encoded. + {"mailto:! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~\x7f?Query! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~", + "mailto:!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_%60az%7B%7C%7D~%7F?Query!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_`az{|}~", + true, Component(7, 53), Component(61, 47)}, + }; + + // Define outside of loop to catch bugs where components aren't reset + Parsed parsed; + Parsed out_parsed; + + for (size_t i = 0; i < gurl_base::size(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + if (i == 0) { + // The first test case purposely has a '\0' in it -- don't count it + // as the string terminator. 
+ url_len = 22; + } + ParseMailtoURL(cases[i].input, url_len, &parsed); + + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizeMailtoURL(cases[i].input, url_len, parsed, + &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the spec was properly identified + EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(6, out_parsed.scheme.len); + + EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); + EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); + + EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin); + EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len); + } +} + +#ifndef WIN32 + +TEST(URLCanonTest, _itoa_s) { + // We fill the buffer with 0xff to ensure that it's getting properly + // null-terminated. We also allocate one byte more than what we tell + // _itoa_s about, and ensure that the extra byte is untouched. + char buf[6]; + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(12, buf, sizeof(buf) - 1, 10)); + EXPECT_STREQ("12", buf); + EXPECT_EQ('\xFF', buf[3]); + + // Test the edge cases - exactly the buffer size and one over + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(1234, buf, sizeof(buf) - 1, 10)); + EXPECT_STREQ("1234", buf); + EXPECT_EQ('\xFF', buf[5]); + + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(EINVAL, _itoa_s(12345, buf, sizeof(buf) - 1, 10)); + EXPECT_EQ('\xFF', buf[5]); // should never write to this location + + // Test the template overload (note that this will see the full buffer) + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(12, buf, 10)); + EXPECT_STREQ("12", buf); + EXPECT_EQ('\xFF', buf[3]); + + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(12345, buf, 10)); + EXPECT_STREQ("12345", buf); + + EXPECT_EQ(EINVAL, _itoa_s(123456, buf, 10)); + + // Test that radix 16 is supported. 
+ memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(1234, buf, sizeof(buf) - 1, 16)); + EXPECT_STREQ("4d2", buf); + EXPECT_EQ('\xFF', buf[5]); +} + +TEST(URLCanonTest, _itow_s) { + // We fill the buffer with 0xff to ensure that it's getting properly + // null-terminated. We also allocate one element more than what we tell + // _itow_s about, and ensure that the extra element is untouched. + gurl_base::char16 buf[6]; + const char fill_mem = 0xff; + const gurl_base::char16 fill_char = 0xffff; + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, _itow_s(12, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(gurl_base::UTF8ToUTF16("12"), gurl_base::string16(buf)); + EXPECT_EQ(fill_char, buf[3]); + + // Test the edge cases - exactly the buffer size and one over + EXPECT_EQ(0, _itow_s(1234, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(gurl_base::UTF8ToUTF16("1234"), gurl_base::string16(buf)); + EXPECT_EQ(fill_char, buf[5]); + + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(EINVAL, _itow_s(12345, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(fill_char, buf[5]); // should never write to this location + + // Test the template overload (note that this will see the full buffer) + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, _itow_s(12, buf, 10)); + EXPECT_EQ(gurl_base::UTF8ToUTF16("12"), + gurl_base::string16(buf)); + EXPECT_EQ(fill_char, buf[3]); + + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, _itow_s(12345, buf, 10)); + EXPECT_EQ(gurl_base::UTF8ToUTF16("12345"), gurl_base::string16(buf)); + + EXPECT_EQ(EINVAL, _itow_s(123456, buf, 10)); +} + +#endif // !WIN32 + +// Returns true if the given two structures are the same. 
+static bool ParsedIsEqual(const Parsed& a, const Parsed& b) { + return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len && + a.username.begin == b.username.begin && a.username.len == b.username.len && + a.password.begin == b.password.begin && a.password.len == b.password.len && + a.host.begin == b.host.begin && a.host.len == b.host.len && + a.port.begin == b.port.begin && a.port.len == b.port.len && + a.path.begin == b.path.begin && a.path.len == b.path.len && + a.query.begin == b.query.begin && a.query.len == b.query.len && + a.ref.begin == b.ref.begin && a.ref.len == b.ref.len; +} + +TEST(URLCanonTest, ResolveRelativeURL) { + struct RelativeCase { + const char* base; // Input base URL: MUST BE CANONICAL + bool is_base_hier; // Is the base URL hierarchical + bool is_base_file; // Tells us if the base is a file URL. + const char* test; // Input URL to test against. + bool succeed_relative; // Whether we expect IsRelativeURL to succeed + bool is_rel; // Whether we expect |test| to be relative or not. + bool succeed_resolve; // Whether we expect ResolveRelativeURL to succeed. + const char* resolved; // What we expect in the result when resolving. + } rel_cases[] = { + // Basic absolute input. + {"http://host/a", true, false, "http://another/", true, false, false, NULL}, + {"http://host/a", true, false, "http:////another/", true, false, false, NULL}, + // Empty relative URLs should only remove the ref part of the URL, + // leaving the rest unchanged. + {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"}, + {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"}, + {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"}, + // Spaces at the ends of the relative path should be ignored. + {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"}, + {"http://foo/bar", true, false, " . 
", true, true, true, "http://foo/"}, + {"http://foo/bar", true, false, " \t ", true, true, true, "http://foo/bar"}, + // Matching schemes without two slashes are treated as relative. + {"http://host/a", true, false, "http:path", true, true, true, "http://host/path"}, + {"http://host/a/", true, false, "http:path", true, true, true, "http://host/a/path"}, + {"http://host/a", true, false, "http:/path", true, true, true, "http://host/path"}, + {"http://host/a", true, false, "HTTP:/path", true, true, true, "http://host/path"}, + // Nonmatching schemes are absolute. + {"http://host/a", true, false, "https:host2", true, false, false, NULL}, + {"http://host/a", true, false, "htto:/host2", true, false, false, NULL}, + // Absolute path input + {"http://host/a", true, false, "/b/c/d", true, true, true, "http://host/b/c/d"}, + {"http://host/a", true, false, "\\b\\c\\d", true, true, true, "http://host/b/c/d"}, + {"http://host/a", true, false, "/b/../c", true, true, true, "http://host/c"}, + {"http://host/a?b#c", true, false, "/b/../c", true, true, true, "http://host/c"}, + {"http://host/a", true, false, "\\b/../c?x#y", true, true, true, "http://host/c?x#y"}, + {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true, "http://host/c?x#y"}, + // Relative path input + {"http://host/a", true, false, "b", true, true, true, "http://host/b"}, + {"http://host/a", true, false, "bc/de", true, true, true, "http://host/bc/de"}, + {"http://host/a/", true, false, "bc/de?query#ref", true, true, true, "http://host/a/bc/de?query#ref"}, + {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"}, + {"http://host/a/", true, false, "..", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "./..", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "../.", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "././.", true, true, true, "http://host/a/"}, + {"http://host/a?query#ref", true, false, "../../../foo", true, true, 
true, "http://host/foo"}, + // Query input + {"http://host/a", true, false, "?foo=bar", true, true, true, "http://host/a?foo=bar"}, + {"http://host/a?x=y#z", true, false, "?", true, true, true, "http://host/a?"}, + {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true, "http://host/a?foo=bar#com"}, + // Ref input + {"http://host/a", true, false, "#ref", true, true, true, "http://host/a#ref"}, + {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"}, + {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true, "http://host/a?foo=bar#bye"}, + // Non-hierarchical base: no relative handling. Relative input should + // error, and if a scheme is present, it should be treated as absolute. + {"data:foobar", false, false, "baz.html", false, false, false, NULL}, + {"data:foobar", false, false, "data:baz", true, false, false, NULL}, + {"data:foobar", false, false, "data:/base", true, false, false, NULL}, + // Non-hierarchical base: absolute input should succeed. + {"data:foobar", false, false, "http://host/", true, false, false, NULL}, + {"data:foobar", false, false, "http:host", true, false, false, NULL}, + // Non-hierarchical base: empty URL should give error. + {"data:foobar", false, false, "", false, false, false, NULL}, + // Invalid schemes should be treated as relative. 
+ {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"}, + {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"}, + {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"}, + {"data:asdf", false, false, ":foo", false, false, false, NULL}, + {"data:asdf", false, false, "bad(':foo')", false, false, false, NULL}, + // We should treat semicolons like any other character in URL resolving + {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"}, + {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"}, + {"http://host/a", true, false, ";/../bar", true, true, true, "http://host/bar"}, + // Relative URLs can also be written as "//foo/bar" which is relative to + // the scheme. In this case, it would take the old scheme, so for http + // the example would resolve to "http://foo/bar". + {"http://host/a", true, false, "//another", true, true, true, "http://another/"}, + {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"}, + {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "//", true, true, false, "http:"}, + // IE will also allow one or the other to be a backslash to get the same + // behavior. + {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "/\\Another\\path", true, true, true, "http://another/path"}, +#ifdef WIN32 + // Resolving against Windows file base URLs. 
+ {"file:///C:/foo", true, true, "http://host/", true, false, false, NULL}, + {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"}, + {"file:///C:/foo", true, true, "../../../bar.html", true, true, true, "file:///C:/bar.html"}, + {"file:///C:/foo", true, true, "/../bar.html", true, true, true, "file:///C:/bar.html"}, + // But two backslashes on Windows should be UNC so should be treated + // as absolute. + {"http://host/a", true, false, "\\\\another\\path", true, false, false, NULL}, + // IE doesn't support drive specs starting with two slashes. It fails + // immediately and doesn't even try to load. We fix it up to either + // an absolute path or UNC depending on what it looks like. + {"file:///C:/something", true, true, "//c:/foo", true, true, true, "file:///C:/foo"}, + {"file:///C:/something", true, true, "//localhost/c:/foo", true, true, true, "file:///C:/foo"}, + // Windows drive specs should be allowed and treated as absolute. + {"file:///C:/foo", true, true, "c:", true, false, false, NULL}, + {"file:///C:/foo", true, true, "c:/foo", true, false, false, NULL}, + {"http://host/a", true, false, "c:\\foo", true, false, false, NULL}, + // Relative paths with drive letters should be allowed when the base is + // also a file. + {"file:///C:/foo", true, true, "/z:/bar", true, true, true, "file:///Z:/bar"}, + // Treat absolute paths as being off of the drive. + {"file:///C:/foo", true, true, "/bar", true, true, true, "file:///C:/bar"}, + {"file://localhost/C:/foo", true, true, "/bar", true, true, true, "file://localhost/C:/bar"}, + {"file:///C:/foo/com/", true, true, "/bar", true, true, true, "file:///C:/bar"}, + // On Windows, two slashes without a drive letter when the base is a file + // means that the path is UNC. 
+ {"file:///C:/something", true, true, "//somehost/path", true, true, true, "file://somehost/path"}, + {"file:///C:/something", true, true, "/\\//somehost/path", true, true, true, "file://somehost/path"}, +#else + // On Unix we fall back to relative behavior since there's nothing else + // reasonable to do. + {"http://host/a", true, false, "\\\\Another\\path", true, true, true, "http://another/path"}, +#endif + // Even on Windows, we don't allow relative drive specs when the base + // is not file. + {"http://host/a", true, false, "/c:\\foo", true, true, true, "http://host/c:/foo"}, + {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"}, + // Ensure that ports aren't allowed for hosts relative to a file url. + // Although the result string shows a host:port portion, the call to + // resolve the relative URL returns false, indicating parse failure, + // which is what is required. + {"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false, "file://host:80/bar.txt"}, + // Filesystem URL tests; filesystem URLs are only valid and relative if + // they have no scheme, e.g. "./index.html". There's no valid equivalent + // to http:index.html. 
+ {"filesystem:http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL}, + {"filesystem:http://host/t/path", true, false, "filesystem:https://host/t/path2", true, false, false, NULL}, + {"filesystem:http://host/t/path", true, false, "http://host/t/path2", true, false, false, NULL}, + {"http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL}, + {"filesystem:http://host/t/path", true, false, "./path2", true, true, true, "filesystem:http://host/t/path2"}, + {"filesystem:http://host/t/path/", true, false, "path2", true, true, true, "filesystem:http://host/t/path/path2"}, + {"filesystem:http://host/t/path", true, false, "filesystem:http:path2", true, false, false, NULL}, + // Absolute URLs are still not relative to a non-standard base URL. + {"about:blank", false, false, "http://X/A", true, false, true, ""}, + {"about:blank", false, false, "content://content.Provider/", true, false, true, ""}, + }; + + for (size_t i = 0; i < gurl_base::size(rel_cases); i++) { + const RelativeCase& cur_case = rel_cases[i]; + + Parsed parsed; + int base_len = static_cast<int>(strlen(cur_case.base)); + if (cur_case.is_base_file) + ParseFileURL(cur_case.base, base_len, &parsed); + else if (cur_case.is_base_hier) + ParseStandardURL(cur_case.base, base_len, &parsed); + else + ParsePathURL(cur_case.base, base_len, false, &parsed); + + // First see if it is relative. + int test_len = static_cast<int>(strlen(cur_case.test)); + bool is_relative; + Component relative_component; + bool succeed_is_rel = IsRelativeURL( + cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier, + &is_relative, &relative_component); + + EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) << + "succeed is rel failure on " << cur_case.test; + EXPECT_EQ(cur_case.is_rel, is_relative) << + "is rel failure on " << cur_case.test; + // Now resolve it. 
+ if (succeed_is_rel && is_relative && cur_case.is_rel) { + std::string resolved; + StdStringCanonOutput output(&resolved); + Parsed resolved_parsed; + + bool succeed_resolve = ResolveRelativeURL( + cur_case.base, parsed, cur_case.is_base_file, cur_case.test, + relative_component, NULL, &output, &resolved_parsed); + output.Complete(); + + EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve); + EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test; + + // Verify that the output parsed structure is the same as parsing + // the URL freshly. + Parsed ref_parsed; + int resolved_len = static_cast<int>(resolved.size()); + if (cur_case.is_base_file) { + ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed); + } else if (cur_case.is_base_hier) { + ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed); + } else { + ParsePathURL(resolved.c_str(), resolved_len, false, &ref_parsed); + } + EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed)); + } + } +} + +// It used to be the case that when we did a replacement with a long buffer of +// UTF-16 characters, we would get invalid data in the URL. This is because the +// buffer that it used to hold the UTF-8 data was resized, while some pointers +// were still kept to the old buffer that was removed. +TEST(URLCanonTest, ReplacementOverflow) { + const char src[] = "file:///C:/foo/bar"; + int src_len = static_cast<int>(strlen(src)); + Parsed parsed; + ParseFileURL(src, src_len, &parsed); + + // Override two components, the path with something short, and the query with + // something long enough to trigger the bug. + Replacements<gurl_base::char16> repl; + gurl_base::string16 new_query; + for (int i = 0; i < 4800; i++) + new_query.push_back('a'); + + gurl_base::string16 new_path(test_utils::TruncateWStringToUTF16(L"/foo")); + repl.SetPath(new_path.c_str(), Component(0, 4)); + repl.SetQuery(new_query.c_str(), + Component(0, static_cast<int>(new_query.length()))); + + // Call ReplaceComponents on the string. 
It doesn't matter if we call it for + // standard URLs, file URLs, etc, since they will go to the same replacement + // function that was buggy. + Parsed repl_parsed; + std::string repl_str; + StdStringCanonOutput repl_output(&repl_str); + ReplaceFileURL(src, parsed, repl, NULL, &repl_output, &repl_parsed); + repl_output.Complete(); + + // Generate the expected string and check. + std::string expected("file:///foo?"); + for (size_t i = 0; i < new_query.length(); i++) + expected.push_back('a'); + EXPECT_TRUE(expected == repl_str); +} + +TEST(URLCanonTest, DefaultPortForScheme) { + struct TestCases { + const char* scheme; + const int expected_port; + } cases[]{ + {"http", 80}, + {"https", 443}, + {"ftp", 21}, + {"ws", 80}, + {"wss", 443}, + {"gopher", 70}, + {"fake-scheme", PORT_UNSPECIFIED}, + {"HTTP", PORT_UNSPECIFIED}, + {"HTTPS", PORT_UNSPECIFIED}, + {"FTP", PORT_UNSPECIFIED}, + {"WS", PORT_UNSPECIFIED}, + {"WSS", PORT_UNSPECIFIED}, + {"GOPHER", PORT_UNSPECIFIED}, + }; + + for (auto& test_case : cases) { + SCOPED_TRACE(test_case.scheme); + EXPECT_EQ(test_case.expected_port, + DefaultPortForScheme(test_case.scheme, strlen(test_case.scheme))); + } +} + +TEST(URLCanonTest, IDNToASCII) { + RawCanonOutputW<1024> output; + + // Basic ASCII test. + gurl_base::string16 str = gurl_base::UTF8ToUTF16("hello"); + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(gurl_base::UTF8ToUTF16("hello"), gurl_base::string16(output.data())); + output.set_length(0); + + // Mixed ASCII/non-ASCII. + str = gurl_base::UTF8ToUTF16("hellö"); + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(gurl_base::UTF8ToUTF16("xn--hell-8qa"), gurl_base::string16(output.data())); + output.set_length(0); + + // All non-ASCII. 
+ str = gurl_base::UTF8ToUTF16("你好"); + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(gurl_base::UTF8ToUTF16("xn--6qq79v"), gurl_base::string16(output.data())); + output.set_length(0); + + // Characters that need mapping (the resulting Punycode is the encoding for + // "1⁄4"). + str = gurl_base::UTF8ToUTF16("¼"); + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(gurl_base::UTF8ToUTF16("xn--14-c6t"), gurl_base::string16(output.data())); + output.set_length(0); + + // String to encode already starts with "xn--", and all ASCII. Should not + // modify the string. + str = gurl_base::UTF8ToUTF16("xn--hell-8qa"); + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(gurl_base::UTF8ToUTF16("xn--hell-8qa"), gurl_base::string16(output.data())); + output.set_length(0); + + // String to encode already starts with "xn--", and mixed ASCII/non-ASCII. + // Should fail, due to a special case: if the label starts with "xn--", it + // should be parsed as Punycode, which must be all ASCII. + str = gurl_base::UTF8ToUTF16("xn--hellö"); + EXPECT_FALSE(IDNToASCII(str.data(), str.length(), &output)); + output.set_length(0); + + // String to encode already starts with "xn--", and mixed ASCII/non-ASCII. + // This tests that there is still an error for the character '⁄' (U+2044), + // which would be a valid ASCII character, U+0044, if the high byte were + // ignored. + str = gurl_base::UTF8ToUTF16("xn--1⁄4"); + EXPECT_FALSE(IDNToASCII(str.data(), str.length(), &output)); + output.set_length(0); +} + +} // namespace url
diff --git a/url/url_constants.cc b/url/url_constants.cc new file mode 100644 index 0000000..3540240 --- /dev/null +++ b/url/url_constants.cc
@@ -0,0 +1,36 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_constants.h" + +namespace url { + +const char kAboutBlankURL[] = "about:blank"; +const char kAboutSrcdocURL[] = "about:srcdoc"; + +const char kAboutBlankPath[] = "blank"; +const char kAboutSrcdocPath[] = "srcdoc"; + +const char kAboutScheme[] = "about"; +const char kBlobScheme[] = "blob"; +const char kContentScheme[] = "content"; +const char kContentIDScheme[] = "cid"; +const char kDataScheme[] = "data"; +const char kFileScheme[] = "file"; +const char kFileSystemScheme[] = "filesystem"; +const char kFtpScheme[] = "ftp"; +const char kGopherScheme[] = "gopher"; +const char kHttpScheme[] = "http"; +const char kHttpsScheme[] = "https"; +const char kJavaScriptScheme[] = "javascript"; +const char kMailToScheme[] = "mailto"; +const char kTelScheme[] = "tel"; +const char kWsScheme[] = "ws"; +const char kWssScheme[] = "wss"; + +const char kStandardSchemeSeparator[] = "://"; + +const size_t kMaxURLChars = 2 * 1024 * 1024; + +} // namespace url
diff --git a/url/url_constants.h b/url/url_constants.h new file mode 100644 index 0000000..c077b8d --- /dev/null +++ b/url/url_constants.h
@@ -0,0 +1,45 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_CONSTANTS_H_ +#define URL_URL_CONSTANTS_H_ + +#include <stddef.h> + +#include "polyfills/base/component_export.h" + +namespace url { + +COMPONENT_EXPORT(URL) extern const char kAboutBlankURL[]; +COMPONENT_EXPORT(URL) extern const char kAboutSrcdocURL[]; + +COMPONENT_EXPORT(URL) extern const char kAboutBlankPath[]; +COMPONENT_EXPORT(URL) extern const char kAboutSrcdocPath[]; + +COMPONENT_EXPORT(URL) extern const char kAboutScheme[]; +COMPONENT_EXPORT(URL) extern const char kBlobScheme[]; +// The content scheme is specific to Android for identifying a stored file. +COMPONENT_EXPORT(URL) extern const char kContentScheme[]; +COMPONENT_EXPORT(URL) extern const char kContentIDScheme[]; +COMPONENT_EXPORT(URL) extern const char kDataScheme[]; +COMPONENT_EXPORT(URL) extern const char kFileScheme[]; +COMPONENT_EXPORT(URL) extern const char kFileSystemScheme[]; +COMPONENT_EXPORT(URL) extern const char kFtpScheme[]; +COMPONENT_EXPORT(URL) extern const char kGopherScheme[]; +COMPONENT_EXPORT(URL) extern const char kHttpScheme[]; +COMPONENT_EXPORT(URL) extern const char kHttpsScheme[]; +COMPONENT_EXPORT(URL) extern const char kJavaScriptScheme[]; +COMPONENT_EXPORT(URL) extern const char kMailToScheme[]; +COMPONENT_EXPORT(URL) extern const char kTelScheme[]; +COMPONENT_EXPORT(URL) extern const char kWsScheme[]; +COMPONENT_EXPORT(URL) extern const char kWssScheme[]; + +// Used to separate a standard scheme and the hostname: "://". +COMPONENT_EXPORT(URL) extern const char kStandardSchemeSeparator[]; + +COMPONENT_EXPORT(URL) extern const size_t kMaxURLChars; + +} // namespace url + +#endif // URL_URL_CONSTANTS_H_
diff --git a/url/url_file.h b/url/url_file.h new file mode 100644 index 0000000..cfe047e --- /dev/null +++ b/url/url_file.h
@@ -0,0 +1,81 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_FILE_H_ +#define URL_URL_FILE_H_ + +// Provides shared functions used by the internals of the parser and +// canonicalizer for file URLs. Do not use outside of these modules. + +#include "base/strings/string_util.h" +#include "url/url_parse_internal.h" + +namespace url { + +#ifdef WIN32 + +// We allow both "c:" and "c|" as drive identifiers. +inline bool IsWindowsDriveSeparator(gurl_base::char16 ch) { + return ch == ':' || ch == '|'; +} + +#endif // WIN32 + +// Returns the index of the first slash in the input at or after the given +// index, or spec_len if the end of the input is reached. +template<typename CHAR> +inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) { + int idx = begin_index; + while (idx < spec_len && !IsURLSlash(spec[idx])) + idx++; + return idx; +} + +#ifdef WIN32 + +// Returns true if the start_offset in the given spec looks like it begins a +// drive spec, for example "c:". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// If this returns true, the spec is guaranteed to have a valid drive letter +// plus a drive separator (':' or '|') starting at |start_offset|. +template<typename CHAR> +inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset, + int spec_len) { + int remaining_len = spec_len - start_offset; + if (remaining_len < 2) + return false; // Not enough room. + if (!gurl_base::IsAsciiAlpha(spec[start_offset])) + return false; // Doesn't start with a valid drive letter. + if (!IsWindowsDriveSeparator(spec[start_offset + 1])) + return false; // Isn't followed with a drive separator. + return true; +} + +// Returns true if the start_offset in the given text looks like it begins a +// UNC path, for example "\\". 
This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// When strict_slashes is set, this function will only accept backslashes as is +// standard for Windows. Otherwise, it will accept forward slashes as well +// which we use for a lot of URL handling. +template<typename CHAR> +inline bool DoesBeginUNCPath(const CHAR* text, + int start_offset, + int len, + bool strict_slashes) { + int remaining_len = len - start_offset; + if (remaining_len < 2) + return false; + + if (strict_slashes) + return text[start_offset] == '\\' && text[start_offset + 1] == '\\'; + return IsURLSlash(text[start_offset]) && IsURLSlash(text[start_offset + 1]); +} + +#endif // WIN32 + +} // namespace url + +#endif // URL_URL_FILE_H_
diff --git a/url/url_idna_icu.cc b/url/url_idna_icu.cc new file mode 100644 index 0000000..b0f91a1 --- /dev/null +++ b/url/url_idna_icu.cc
@@ -0,0 +1,116 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// ICU-based IDNA converter.
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "polyfills/base/logging.h"
+#include "base/no_destructor.h"
+#include <unicode/uidna.h>
+#include <unicode/utypes.h>
+#include "url/url_canon_icu.h"
+#include "url/url_canon_internal.h"  // for _itoa_s
+
+namespace url {
+
+namespace {
+
+// Holds ICU's UIDNA handle, a C pointer to a UTS46/IDNA 2008 handling object
+// opened with uidna_openUTS46(). GetUIDNA() keeps the single instance in a
+// function-local static gurl_base::NoDestructor.
+//
+// We use UTS46 with BiDiCheck to migrate from IDNA 2003 (with unassigned
+// code points allowed) to IDNA 2008 with backward compatibility in mind.
+// What it does:
+//
+// 1. Use the up-to-date Unicode data.
+// 2. Define a case folding/mapping with the up-to-date Unicode data as
+//    in IDNA 2003.
+// 3. Use transitional mechanism for 4 deviation characters (sharp-s,
+//    final sigma, ZWJ and ZWNJ) for now.
+// 4. Continue to allow symbols and punctuations.
+// 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules.
+// 6. Do not apply STD3 rules
+// 7. Do not allow unassigned code points.
+//
+// It also closely matches what IE 10 does except for the BiDi check (
+// http://goo.gl/3XBhqw ).
+// See http://unicode.org/reports/tr46/ and references therein
+// for more details.
+struct UIDNAWrapper {
+  UIDNAWrapper() {
+    UErrorCode err = U_ZERO_ERROR;
+    // TODO(jungshik): Change options as different parties (browsers,
+    // registrars, search engines) converge toward a consensus.
+    value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);
+    if (U_FAILURE(err)) {
+      GURL_CHECK(false) << "failed to open UTS46 data with error: "
+                   << u_errorName(err)
+                   << ". If you see this error message in a test environment "
+                   << "your test environment likely lacks the required data "
+                   << "tables for libicu. See https://crbug.com/778929.";
+      // Never hand out a handle from a failed open.
+      value = nullptr;
+    }
+  }
+
+  // The UTS46 handle, or nullptr if it could not be opened.
+  UIDNA* value = nullptr;
+};
+
+}  // namespace
+
+// Returns the shared UTS46 handle, opening it the first time this is called.
+UIDNA* GetUIDNA() {
+  static gurl_base::NoDestructor<UIDNAWrapper> uidna_wrapper;
+  return uidna_wrapper->value;
+}
+
+// Converts the Unicode input representing a hostname to ASCII using IDN rules.
+// The output must be ASCII, but is represented as wide characters.
+//
+// On success, the output will be filled with the ASCII host name and it will
+// return true. Unlike most other canonicalization functions, this assumes that
+// the output is empty. The beginning of the host will be at offset 0, and
+// the length of the output will be set to the length of the new host name.
+//
+// On error, this will return false. The output in this case is undefined.
+// TODO(jungshik): use UTF-8/ASCII version of nameToASCII.
+// Change the function signature and callers accordingly to avoid unnecessary
+// conversions in our code. In addition, consider using icu::IDNA's UTF-8/ASCII
+// version with StringByteSink. That way, we can avoid C wrappers and additional
+// string conversion.
+bool IDNToASCII(const gurl_base::char16* src, int src_len, CanonOutputW* output) {
+  GURL_DCHECK(output->length() == 0);  // Output buffer is assumed empty.
+
+  UIDNA* uidna = GetUIDNA();
+  GURL_DCHECK(uidna != nullptr);
+  while (true) {
+    UErrorCode err = U_ZERO_ERROR;
+    UIDNAInfo info = UIDNA_INFO_INITIALIZER;
+    // ICU works in UChar. Cast explicitly, keeping |src| const: the input is
+    // only read, while the output buffer is written in place.
+    int output_length = uidna_nameToASCII(
+        uidna, reinterpret_cast<const UChar*>(src), src_len,
+        reinterpret_cast<UChar*>(output->data()), output->capacity(), &info,
+        &err);
+    if (U_SUCCESS(err) && info.errors == 0) {
+      output->set_length(output_length);
+      return true;
+    }
+
+    // TODO(jungshik): Look at info.errors to handle them case-by-case basis
+    // if necessary.
+    if (err != U_BUFFER_OVERFLOW_ERROR || info.errors != 0)
+      return false;  // Unknown error, give up.
+
+    // Not enough room in our buffer, expand.
+    output->Resize(output_length);
+  }
+}
+
+}  // namespace url
diff --git a/url/url_parse_file.cc b/url/url_parse_file.cc new file mode 100644 index 0000000..b666d0b --- /dev/null +++ b/url/url_parse_file.cc
@@ -0,0 +1,235 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "polyfills/base/logging.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+
+// Interesting IE file:isms...
+//
+//  INPUT                      OUTPUT
+//  =========================  ==============================
+//  file:/foo/bar              file:///foo/bar
+//      The result here seems totally invalid!?!? This isn't UNC.
+//
+//  file:/
+//  file:// or any other number of slashes
+//      IE6 doesn't do anything at all if you click on this link. No error:
+//      nothing. IE6's history system seems to always color this link, so I'm
+//      guessing that it maps internally to the empty URL.
+//
+//  C:\                        file:///C:/
+//      When on a file: URL source page, this link will work. When over HTTP,
+//      the file: URL will appear in the status bar but the link will not work
+//      (security restriction for all file URLs).
+//
+//  file:foo/                  file:foo/     (invalid?!?!?)
+//  file:/foo/                 file:///foo/  (invalid?!?!?)
+//  file://foo/                file://foo/   (UNC to server "foo")
+//  file:///foo/               file:///foo/  (invalid, seems to be a file)
+//  file:////foo/              file://foo/   (UNC to server "foo")
+//      Any more than four slashes is also treated as UNC.
+//
+//  file:C:/                   file://C:/
+//  file:/C:/                  file://C:/
+//      The number of slashes after "file:" doesn't matter if the thing
+//      following it looks like an absolute drive path. Also, slashes and
+//      backslashes are equally valid here.
+
+namespace url {
+
+namespace {
+
+// A subcomponent of DoParseFileURL. The input of this function should be a
+// UNC path name, with the index of the first character after the slashes
+// following the scheme given in |after_slashes|. This will initialize the
+// host, path, query, and ref, and leave the other output components
+// untouched (DoParseFileURL handles these for us).
+template<typename CHAR>
+void DoParseUNC(const CHAR* spec,
+                int after_slashes,
+                int spec_len,
+                Parsed* parsed) {
+  int next_slash = FindNextSlash(spec, after_slashes, spec_len);
+  if (next_slash == spec_len) {
+    // No additional slash found, as in "file://foo", treat the text as the
+    // host with no path (this will end up being UNC to server "foo").
+    int host_len = spec_len - after_slashes;
+    if (host_len)
+      parsed->host = Component(after_slashes, host_len);
+    else
+      parsed->host.reset();
+    parsed->path.reset();
+    return;
+  }
+
+#ifdef WIN32
+  // See if we have something that looks like a path following the first
+  // component. As in "file://localhost/c:/", we get "c:/" out. We want to
+  // treat this as having no host but the path given. Works on Windows only.
+  if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
+    parsed->host.reset();
+    ParsePathInternal(spec, MakeRange(next_slash, spec_len),
+                      &parsed->path, &parsed->query, &parsed->ref);
+    return;
+  }
+#endif
+
+  // Otherwise, everything up until that first slash we found is the host name,
+  // which will end up being the UNC host. For example "file://foo/bar.txt"
+  // will get a server name of "foo" and a path of "/bar.txt". Later, on
+  // Windows, this should be treated as the filename "\\foo\bar.txt" in proper
+  // UNC notation.
+  int host_len = next_slash - after_slashes;
+  if (host_len)
+    parsed->host = MakeRange(after_slashes, next_slash);
+  else
+    parsed->host.reset();
+  // NOTE(review): the |next_slash| == |spec_len| case already returned above
+  // and FindNextSlash() stops at a slash otherwise, so this test looks like it
+  // can only fail if |after_slashes| is past |spec_len|. The else branch is
+  // kept as a defensive fallback.
+  if (next_slash < spec_len) {
+    ParsePathInternal(spec, MakeRange(next_slash, spec_len),
+                      &parsed->path, &parsed->query, &parsed->ref);
+  } else {
+    parsed->path.reset();
+  }
+}
+
+// A subcomponent of DoParseFileURL, the input should be a local file, with the
+// beginning of the path indicated by the index in |path_begin|. This will
+// initialize the host, path, query, and ref, and leave the other output
+// components untouched (DoParseFileURL handles these for us).
+template<typename CHAR>
+void DoParseLocalFile(const CHAR* spec,
+                      int path_begin,
+                      int spec_len,
+                      Parsed* parsed) {
+  parsed->host.reset();
+  ParsePathInternal(spec, MakeRange(path_begin, spec_len),
+                    &parsed->path, &parsed->query, &parsed->ref);
+}
+
+// Backend for the external functions that operates on either char type.
+// Handles cases where there is a scheme, but also when handed the first
+// character following the "file:" at the beginning of the spec. If so,
+// this is usually a slash, but needn't be; we allow paths like "file:c:\foo".
+//
+// |spec_len| is taken by value and trimmed locally. The username, password
+// and port of |parsed| are always reset; query and ref are reset up front and
+// only filled in on the paths that parse a path.
+template<typename CHAR>
+void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  GURL_DCHECK(spec_len >= 0);
+
+  // Get the parts we never use for file URLs out of the way.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->port.reset();
+
+  // Many of the code paths don't set these, so it's convenient to just clear
+  // them. We'll write them in those cases we need them.
+  parsed->query.reset();
+  parsed->ref.reset();
+
+  // Strip leading & trailing spaces and control characters.
+  int begin = 0;
+  TrimURL(spec, &begin, &spec_len);
+
+  // Find the scheme, if any.
+  int num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
+  // Both of these are assigned on every path below before they are read.
+  int after_scheme;
+  int after_slashes;
+#ifdef WIN32
+  // See how many slashes there are. We want to handle cases like UNC but also
+  // "/c:/foo". This is when there is no scheme, so we can allow pages to do
+  // links like "c:/foo/bar" or "//foo/bar". This is also called by the
+  // relative URL resolver when it determines there is an absolute URL, which
+  // may give us input like "/c:/foo".
+  after_slashes = begin + num_slashes;
+  if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
+    // Windows path, don't try to extract the scheme (for example, "c:\foo").
+    parsed->scheme.reset();
+    after_scheme = after_slashes;
+  } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
+    // Windows UNC path: don't try to extract the scheme, but keep the slashes.
+    parsed->scheme.reset();
+    after_scheme = begin;
+  } else
+#endif
+  {
+    // ExtractScheme doesn't understand the possibility of filenames with
+    // colons in them, in which case it returns the entire spec up to the
+    // colon as the scheme. So handle /foo.c:5 as a file but foo.c:5 as
+    // the foo.c: scheme.
+    if (!num_slashes &&
+        ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+      // Offset the results since we gave ExtractScheme a substring.
+      parsed->scheme.begin += begin;
+      after_scheme = parsed->scheme.end() + 1;
+    } else {
+      // No scheme found, remember that.
+      parsed->scheme.reset();
+      after_scheme = begin;
+    }
+  }
+
+  // Handle empty specs, ones that contain only whitespace or control chars,
+  // or that are just the scheme (for example "file:").
+  if (after_scheme == spec_len) {
+    parsed->host.reset();
+    parsed->path.reset();
+    return;
+  }
+
+  num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
+  after_slashes = after_scheme + num_slashes;
+#ifdef WIN32
+  // Check whether the input is a drive again. We checked above for windows
+  // drive specs, but that's only at the very beginning to see if we have a
+  // scheme at all. This test will be duplicated in that case, but will
+  // additionally handle all cases with a real scheme such as "file:///C:/".
+  if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
+      num_slashes != 3) {
+    // Anything not beginning with a drive spec ("c:\") on Windows is treated
+    // as UNC, with the exception of three slashes which always means a file.
+    // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
+    DoParseUNC(spec, after_slashes, spec_len, parsed);
+    return;
+  }
+#else
+  // file: URL with exactly 2 slashes is considered to have a host component.
+  if (num_slashes == 2) {
+    DoParseUNC(spec, after_slashes, spec_len, parsed);
+    return;
+  }
+#endif  // WIN32
+
+  // Easy and common case, the full path immediately follows the scheme
+  // (modulo slashes), as in "file://c:/foo". Just treat everything from
+  // there to the end as the path. We include the last slash as part of the
+  // path if there is one.
+  // NOTE(review): this comment used to say "Empty hosts have 0 length instead
+  // of -1", but DoParseLocalFile() calls parsed->host.reset(); confirm what
+  // reset() leaves in |len| before relying on either statement.
+  DoParseLocalFile(spec,
+      num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
+      spec_len, parsed);
+}
+
+}  // namespace
+
+// Parses the 8-bit |url| of length |url_len| as a file URL into |parsed|.
+void ParseFileURL(const char* url, int url_len, Parsed* parsed) {
+  DoParseFileURL(url, url_len, parsed);
+}
+
+// Parses the 16-bit |url| of length |url_len| as a file URL into |parsed|.
+void ParseFileURL(const gurl_base::char16* url, int url_len, Parsed* parsed) {
+  DoParseFileURL(url, url_len, parsed);
+}
+
+}  // namespace url
diff --git a/url/url_parse_internal.h b/url/url_parse_internal.h new file mode 100644 index 0000000..6f86d86 --- /dev/null +++ b/url/url_parse_internal.h
@@ -0,0 +1,102 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_PARSE_INTERNAL_H_
+#define URL_URL_PARSE_INTERNAL_H_
+
+// Small inline helpers and internal declarations shared by the URL parsing
+// routines.
+
+#include "url/third_party/mozilla/url_parse.h"
+
+namespace url {
+
+// Returns true for '/' and for '\\'. Backslashes are treated exactly like
+// forward slashes for IE compatibility.
+inline bool IsURLSlash(gurl_base::char16 ch) {
+  switch (ch) {
+    case '/':
+    case '\\':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true if |ch| should be trimmed from the ends of a URL, i.e. it is a
+// space or a control character (anything up to and including ' ').
+inline bool ShouldTrimFromURL(gurl_base::char16 ch) {
+  return !(ch > ' ');
+}
+
+// Shrinks the range [|*begin|, |*len|) of |spec| so that it neither starts
+// nor, when |trim_path_end| is true, ends with a character accepted by
+// ShouldTrimFromURL(). Both values must already be initialized. Note that
+// |*len| is an end position in |spec|, not a count of characters from
+// |*begin|: the data runs from index |*begin| up to (excluding) |*len|.
+template<typename CHAR>
+inline void TrimURL(const CHAR* spec, int* begin, int* len,
+                    bool trim_path_end = true) {
+  int first = *begin;
+  int last = *len;
+
+  // Drop leading whitespace and control characters.
+  while (first < last && ShouldTrimFromURL(spec[first]))
+    ++first;
+
+  // Drop trailing ones too if requested. The |last > first| guard keeps an
+  // all-blank input from walking back past the start of the range.
+  while (trim_path_end && last > first && ShouldTrimFromURL(spec[last - 1]))
+    --last;
+
+  *begin = first;
+  *len = last;
+}
+
+// Returns how many consecutive slashes (as defined by IsURLSlash()) |str|
+// contains starting at |begin_offset|. The scan never reads at or beyond
+// |str_len|.
+template<typename CHAR>
+inline int CountConsecutiveSlashes(const CHAR *str,
+                                   int begin_offset, int str_len) {
+  int pos = begin_offset;
+  while (pos < str_len && IsURLSlash(str[pos]))
+    ++pos;
+  return pos - begin_offset;
+}
+
+// Internal functions in url_parse.cc that parse the path, that is, everything
+// following the authority section. |path| is the range of everything after
+// the authority section; the identified ranges are written to |filepath|,
+// |query| and |ref|.
+//
+// These exist for the file URL parser and other consumers that do something
+// special at the beginning but want regular path parsing afterwards; they
+// simply map to the internal parsing function for paths.
+void ParsePathInternal(const char* spec,
+                       const Component& path,
+                       Component* filepath,
+                       Component* query,
+                       Component* ref);
+void ParsePathInternal(const gurl_base::char16* spec,
+                       const Component& path,
+                       Component* filepath,
+                       Component* query,
+                       Component* ref);
+
+// Given a spec and the index of the character after the colon following the
+// scheme, this parses the rest and fills in |parsed|. Every item in the
+// parsed structure is filled EXCEPT for the scheme, which is untouched.
+void ParseAfterScheme(const char* spec,
+                      int spec_len,
+                      int after_scheme,
+                      Parsed* parsed);
+void ParseAfterScheme(const gurl_base::char16* spec,
+                      int spec_len,
+                      int after_scheme,
+                      Parsed* parsed);
+
+} // namespace url
+
+#endif  // URL_URL_PARSE_INTERNAL_H_
diff --git a/url/url_parse_perftest.cc b/url/url_parse_perftest.cc new file mode 100644 index 0000000..82c7693 --- /dev/null +++ b/url/url_parse_perftest.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <string>
+
+#include "base/strings/string_piece.h"
+#include "base/test/perf_time_logger.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/gurl.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+
+namespace {
+
+// Times one million ParseStandardURL() calls on a single fully-populated URL.
+TEST(URLParse, FullURL) {
+  constexpr gurl_base::StringPiece kUrl =
+      "http://me:pass@host/foo/bar.html;param?query=yes#ref";
+
+  url::Parsed parsed;
+  gurl_base::PerfTimeLogger timer("Full_URL_Parse_AMillion");
+
+  for (int i = 0; i < 1000000; i++)
+    url::ParseStandardURL(kUrl.data(), kUrl.size(), &parsed);
+  timer.Done();
+}
+
+// Sample URLs shared by the "typical" benchmarks below.
+constexpr gurl_base::StringPiece kTypicalUrl1 =
+    "http://www.google.com/"
+    "search?q=url+parsing&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:en-US:"
+    "official&client=firefox-a";
+
+constexpr gurl_base::StringPiece kTypicalUrl2 =
+    "http://www.amazon.com/Stephen-King-Thrillers-Horror-People/dp/0766012336/"
+    "ref=sr_1_2/133-4144931-4505264?ie=UTF8&s=books&qid=2144880915&sr=8-2";
+
+constexpr gurl_base::StringPiece kTypicalUrl3 =
+    "http://store.apple.com/1-800-MY-APPLE/WebObjects/AppleStore.woa/wa/"
+    "RSLID?nnmm=browse&mco=578E9744&node=home/desktop/mac_pro";
+
+// Times parsing alone over the three typical URLs.
+TEST(URLParse, TypicalURLParse) {
+  url::Parsed parsed1;
+  url::Parsed parsed2;
+  url::Parsed parsed3;
+
+  // Do this 1/3 of a million times since we do 3 different URLs.
+  gurl_base::PerfTimeLogger parse_timer("Typical_URL_Parse_AMillion");
+  for (int i = 0; i < 333333; i++) {
+    url::ParseStandardURL(kTypicalUrl1.data(), kTypicalUrl1.size(), &parsed1);
+    url::ParseStandardURL(kTypicalUrl2.data(), kTypicalUrl2.size(), &parsed2);
+    url::ParseStandardURL(kTypicalUrl3.data(), kTypicalUrl3.size(), &parsed3);
+  }
+  parse_timer.Done();
+}
+
+// Includes both parsing and canonicalization with no mallocs.
+TEST(URLParse, TypicalURLParseCanon) {
+  url::Parsed parsed1;
+  url::Parsed parsed2;
+  url::Parsed parsed3;
+
+  gurl_base::PerfTimeLogger canon_timer("Typical_Parse_Canon_AMillion");
+  url::Parsed out_parsed;
+  url::RawCanonOutput<1024> output;
+  for (int i = 0; i < 333333; i++) {  // divide by 3 so we get 1M
+    url::ParseStandardURL(kTypicalUrl1.data(), kTypicalUrl1.size(), &parsed1);
+    output.set_length(0);
+    url::CanonicalizeStandardURL(
+        kTypicalUrl1.data(), kTypicalUrl1.size(), parsed1,
+        url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output,
+        &out_parsed);
+
+    url::ParseStandardURL(kTypicalUrl2.data(), kTypicalUrl2.size(), &parsed2);
+    output.set_length(0);
+    url::CanonicalizeStandardURL(
+        kTypicalUrl2.data(), kTypicalUrl2.size(), parsed2,
+        url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output,
+        &out_parsed);
+
+    url::ParseStandardURL(kTypicalUrl3.data(), kTypicalUrl3.size(), &parsed3);
+    output.set_length(0);
+    url::CanonicalizeStandardURL(
+        kTypicalUrl3.data(), kTypicalUrl3.size(), parsed3,
+        url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output,
+        &out_parsed);
+  }
+  canon_timer.Done();
+}
+
+// Includes both parsing and canonicalization, and mallocs for the output.
+TEST(URLParse, TypicalURLParseCanonStdString) {
+  url::Parsed parsed1;
+  url::Parsed parsed2;
+  url::Parsed parsed3;
+
+  // Uses its own label: sharing "Typical_Parse_Canon_AMillion" with the test
+  // above would make the two measurements indistinguishable in the output.
+  gurl_base::PerfTimeLogger canon_timer("Typical_Parse_Canon_StdString_AMillion");
+  url::Parsed out_parsed;
+  for (int i = 0; i < 333333; i++) {  // divide by 3 so we get 1M
+    url::ParseStandardURL(kTypicalUrl1.data(), kTypicalUrl1.size(), &parsed1);
+    std::string out1;
+    url::StdStringCanonOutput output1(&out1);
+    url::CanonicalizeStandardURL(
+        kTypicalUrl1.data(), kTypicalUrl1.size(), parsed1,
+        url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output1,
+        &out_parsed);
+
+    url::ParseStandardURL(kTypicalUrl2.data(), kTypicalUrl2.size(), &parsed2);
+    std::string out2;
+    url::StdStringCanonOutput output2(&out2);
+    url::CanonicalizeStandardURL(
+        kTypicalUrl2.data(), kTypicalUrl2.size(), parsed2,
+        url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output2,
+        &out_parsed);
+
+    url::ParseStandardURL(kTypicalUrl3.data(), kTypicalUrl3.size(), &parsed3);
+    std::string out3;
+    url::StdStringCanonOutput output3(&out3);
+    url::CanonicalizeStandardURL(
+        kTypicalUrl3.data(), kTypicalUrl3.size(), parsed3,
+        url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output3,
+        &out_parsed);
+  }
+  canon_timer.Done();
+}
+
+// Times constructing GURL objects from the three typical URLs.
+TEST(URLParse, GURL) {
+  gurl_base::PerfTimeLogger gurl_timer("Typical_GURL_AMillion");
+  for (int i = 0; i < 333333; i++) {  // divide by 3 so we get 1M
+    GURL gurl1(kTypicalUrl1);
+    GURL gurl2(kTypicalUrl2);
+    GURL gurl3(kTypicalUrl3);
+  }
+  gurl_timer.Done();
+}
+
+}  // namespace
diff --git a/url/url_parse_unittest.cc b/url/url_parse_unittest.cc new file mode 100644 index 0000000..a1c38c2 --- /dev/null +++ b/url/url_parse_unittest.cc
@@ -0,0 +1,690 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/third_party/mozilla/url_parse.h" + +#include <stddef.h> + +#include "base/stl_util.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/third_party/mozilla/url_parse.h" + +// Interesting IE file:isms... +// +// file:/foo/bar file:///foo/bar +// The result here seems totally invalid!?!? This isn't UNC. +// +// file:/ +// file:// or any other number of slashes +// IE6 doesn't do anything at all if you click on this link. No error: +// nothing. IE6's history system seems to always color this link, so I'm +// guessing that it maps internally to the empty URL. +// +// C:\ file:///C:/ +// / file:///C:/ +// /foo file:///C:/foo +// Interestingly, IE treats "/" as an alias for "c:\", which makes sense, +// but is weird to think about on Windows. +// +// file:foo/ file:foo/ (invalid?!?!?) +// file:/foo/ file:///foo/ (invalid?!?!?) +// file://foo/ file://foo/ (UNC to server "foo") +// file:///foo/ file:///foo/ (invalid) +// file:////foo/ file://foo/ (UNC to server "foo") +// Any more than four slashes is also treated as UNC. +// +// file:C:/ file://C:/ +// file:/C:/ file://C:/ +// The number of slashes after "file:" don't matter if the thing following +// it looks like an absolute drive path. Also, slashes and backslashes are +// equally valid here. + +namespace url { +namespace { + +// Used for regular URL parse cases. +struct URLParseCase { + const char* input; + + const char* scheme; + const char* username; + const char* password; + const char* host; + int port; + const char* path; + const char* query; + const char* ref; +}; + +// Simpler version of URLParseCase for testing path URLs. +struct PathURLParseCase { + const char* input; + + const char* scheme; + const char* path; +}; + +// Simpler version of URLParseCase for testing mailto URLs. 
+struct MailtoURLParseCase { + const char* input; + + const char* scheme; + const char* path; + const char* query; +}; + +// More complicated version of URLParseCase for testing filesystem URLs. +struct FileSystemURLParseCase { + const char* input; + + const char* inner_scheme; + const char* inner_username; + const char* inner_password; + const char* inner_host; + int inner_port; + const char* inner_path; + const char* path; + const char* query; + const char* ref; +}; + +bool ComponentMatches(const char* input, + const char* reference, + const Component& component) { + // If the component is nonexistent (length == -1), it should begin at 0. + EXPECT_TRUE(component.len >= 0 || component.len == -1); + + // Begin should be valid. + EXPECT_LE(0, component.begin); + + // A NULL reference means the component should be nonexistent. + if (!reference) + return component.len == -1; + if (component.len < 0) + return false; // Reference is not NULL but we don't have anything + + if (strlen(reference) != static_cast<size_t>(component.len)) + return false; // Lengths don't match + + // Now check the actual characters. + return strncmp(reference, &input[component.begin], component.len) == 0; +} + +void ExpectInvalidComponent(const Component& component) { + EXPECT_EQ(0, component.begin); + EXPECT_EQ(-1, component.len); +} + +// Parsed ---------------------------------------------------------------------- + +TEST(URLParser, Length) { + const char* length_cases[] = { + // One with everything in it. + "http://user:pass@host:99/foo?bar#baz", + // One with nothing in it. + "", + // Working backwards, let's start taking off stuff from the full one. 
+ "http://user:pass@host:99/foo?bar#", + "http://user:pass@host:99/foo?bar", + "http://user:pass@host:99/foo?", + "http://user:pass@host:99/foo", + "http://user:pass@host:99/", + "http://user:pass@host:99", + "http://user:pass@host:", + "http://user:pass@host", + "http://host", + "http://user@", + "http:", + }; + for (size_t i = 0; i < gurl_base::size(length_cases); i++) { + int true_length = static_cast<int>(strlen(length_cases[i])); + + Parsed parsed; + ParseStandardURL(length_cases[i], true_length, &parsed); + + EXPECT_EQ(true_length, parsed.Length()); + } +} + +TEST(URLParser, CountCharactersBefore) { + struct CountCase { + const char* url; + Parsed::ComponentType component; + bool include_delimiter; + int expected_count; + } count_cases[] = { + // Test each possibility in the case where all components are present. + // 0 1 2 + // 0123456789012345678901 + {"http://u:p@h:8/p?q#r", Parsed::SCHEME, true, 0}, + {"http://u:p@h:8/p?q#r", Parsed::SCHEME, false, 0}, + {"http://u:p@h:8/p?q#r", Parsed::USERNAME, true, 7}, + {"http://u:p@h:8/p?q#r", Parsed::USERNAME, false, 7}, + {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, true, 9}, + {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, false, 9}, + {"http://u:p@h:8/p?q#r", Parsed::HOST, true, 11}, + {"http://u:p@h:8/p?q#r", Parsed::HOST, false, 11}, + {"http://u:p@h:8/p?q#r", Parsed::PORT, true, 12}, + {"http://u:p@h:8/p?q#r", Parsed::PORT, false, 13}, + {"http://u:p@h:8/p?q#r", Parsed::PATH, false, 14}, + {"http://u:p@h:8/p?q#r", Parsed::PATH, true, 14}, + {"http://u:p@h:8/p?q#r", Parsed::QUERY, true, 16}, + {"http://u:p@h:8/p?q#r", Parsed::QUERY, false, 17}, + {"http://u:p@h:8/p?q#r", Parsed::REF, true, 18}, + {"http://u:p@h:8/p?q#r", Parsed::REF, false, 19}, + // Now test when the requested component is missing. 
+ {"http://u:p@h:8/p?", Parsed::REF, true, 17}, + {"http://u:p@h:8/p?q", Parsed::REF, true, 18}, + {"http://u:p@h:8/p#r", Parsed::QUERY, true, 16}, + {"http://u:p@h:8#r", Parsed::PATH, true, 14}, + {"http://u:p@h/", Parsed::PORT, true, 12}, + {"http://u:p@/", Parsed::HOST, true, 11}, + // This case is a little weird. It will report that the password would + // start where the host begins. This is arguably correct, although you + // could also argue that it should start at the '@' sign. Doing it + // starting with the '@' sign is actually harder, so we don't bother. + {"http://u@h/", Parsed::PASSWORD, true, 9}, + {"http://h/", Parsed::USERNAME, true, 7}, + {"http:", Parsed::USERNAME, true, 5}, + {"", Parsed::SCHEME, true, 0}, + // Make sure a random component still works when there's nothing there. + {"", Parsed::REF, true, 0}, + // File URLs are special with no host, so we test those. + {"file:///c:/foo", Parsed::USERNAME, true, 7}, + {"file:///c:/foo", Parsed::PASSWORD, true, 7}, + {"file:///c:/foo", Parsed::HOST, true, 7}, + {"file:///c:/foo", Parsed::PATH, true, 7}, + }; + for (size_t i = 0; i < gurl_base::size(count_cases); i++) { + int length = static_cast<int>(strlen(count_cases[i].url)); + + // Simple test to distinguish file and standard URLs. 
+ Parsed parsed; + if (length > 0 && count_cases[i].url[0] == 'f') + ParseFileURL(count_cases[i].url, length, &parsed); + else + ParseStandardURL(count_cases[i].url, length, &parsed); + + int chars_before = parsed.CountCharactersBefore( + count_cases[i].component, count_cases[i].include_delimiter); + EXPECT_EQ(count_cases[i].expected_count, chars_before); + } +} + +// Standard -------------------------------------------------------------------- + +// Input Scheme Usrname Passwd Host Port Path Query Ref +// ------------------------------------ ------- ------- ---------- ------------ --- ---------- ------------ ----- +static URLParseCase cases[] = { + // Regular URL with all the parts +{"http://user:pass@foo:21/bar;par?b#c", "http", "user", "pass", "foo", 21, "/bar;par","b", "c"}, + + // Known schemes should lean towards authority identification +{"http:foo.com", "http", NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, + + // Spaces! +{"\t :foo.com \n", "", NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, +{" foo.com ", NULL, NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, +{"a:\t foo.com", "a", NULL, NULL, "\t foo.com", -1, NULL, NULL, NULL}, +{"http://f:21/ b ? d # e ", "http", NULL, NULL, "f", 21, "/ b ", " d ", " e"}, + + // Invalid port numbers should be identified and turned into -2, empty port + // numbers should be -1. 
Spaces aren't allowed in port numbers +{"http://f:/c", "http", NULL, NULL, "f", -1, "/c", NULL, NULL}, +{"http://f:0/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL}, +{"http://f:00000000000000/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL}, +{"http://f:00000000000000000000080/c", "http", NULL, NULL, "f", 80, "/c", NULL, NULL}, +{"http://f:b/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f: /c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:\n/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:fifty-two/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:999999/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f: 21 / b ? d # e ", "http", NULL, NULL, "f", -2, "/ b ", " d ", " e"}, + + // Creative URLs missing key elements +{"", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{" \t", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":foo.com/", "", NULL, NULL, "foo.com", -1, "/", NULL, NULL}, +{":foo.com\\", "", NULL, NULL, "foo.com", -1, "\\", NULL, NULL}, +{":", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":a", "", NULL, NULL, "a", -1, NULL, NULL, NULL}, +{":/", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":\\", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":#", "", NULL, NULL, NULL, -1, NULL, NULL, ""}, +{"#", NULL, NULL, NULL, NULL, -1, NULL, NULL, ""}, +{"#/", NULL, NULL, NULL, NULL, -1, NULL, NULL, "/"}, +{"#\\", NULL, NULL, NULL, NULL, -1, NULL, NULL, "\\"}, +{"#;?", NULL, NULL, NULL, NULL, -1, NULL, NULL, ";?"}, +{"?", NULL, NULL, NULL, NULL, -1, NULL, "", NULL}, +{"/", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":23", "", NULL, NULL, "23", -1, NULL, NULL, NULL}, +{"/:23", "/", NULL, NULL, "23", -1, NULL, NULL, NULL}, +{"//", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"::", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"::23", "", NULL, NULL, NULL, 23, NULL, NULL, NULL}, +{"foo://", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + + // Username/passwords and things 
that look like them +{"http://a:b@c:29/d", "http", "a", "b", "c", 29, "/d", NULL, NULL}, +{"http::@c:29", "http", "", "", "c", 29, NULL, NULL, NULL}, + // ... "]" in the password field isn't allowed, but we tolerate it here... +{"http://&a:foo(b]c@d:2/", "http", "&a", "foo(b]c", "d", 2, "/", NULL, NULL}, +{"http://::@c@d:2", "http", "", ":@c", "d", 2, NULL, NULL, NULL}, +{"http://foo.com:b@d/", "http", "foo.com", "b", "d", -1, "/", NULL, NULL}, + +{"http://foo.com/\\@", "http", NULL, NULL, "foo.com", -1, "/\\@", NULL, NULL}, +{"http:\\\\foo.com\\", "http", NULL, NULL, "foo.com", -1, "\\", NULL, NULL}, +{"http:\\\\a\\b:c\\d@foo.com\\", "http", NULL, NULL, "a", -1, "\\b:c\\d@foo.com\\", NULL, NULL}, + + // Tolerate different numbers of slashes. +{"foo:/", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"foo:/bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL}, +{"foo://///////", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"foo://///////bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL}, +{"foo:////://///", "foo", NULL, NULL, NULL, -1, "/////", NULL, NULL}, + + // Raw file paths on Windows aren't handled by the parser. +{"c:/foo", "c", NULL, NULL, "foo", -1, NULL, NULL, NULL}, +{"//foo/bar", NULL, NULL, NULL, "foo", -1, "/bar", NULL, NULL}, + + // Use the first question mark for the query and the ref. +{"http://foo/path;a??e#f#g", "http", NULL, NULL, "foo", -1, "/path;a", "?e", "f#g"}, +{"http://foo/abcd?efgh?ijkl", "http", NULL, NULL, "foo", -1, "/abcd", "efgh?ijkl", NULL}, +{"http://foo/abcd#foo?bar", "http", NULL, NULL, "foo", -1, "/abcd", NULL, "foo?bar"}, + + // IPv6, check also interesting uses of colons. 
+{"[61:24:74]:98", "[61", NULL, NULL, "24:74]", 98, NULL, NULL, NULL}, +{"http://[61:27]:98", "http", NULL, NULL, "[61:27]", 98, NULL, NULL, NULL}, +{"http:[61:27]/:foo", "http", NULL, NULL, "[61:27]", -1, "/:foo", NULL, NULL}, +{"http://[1::2]:3:4", "http", NULL, NULL, "[1::2]:3", 4, NULL, NULL, NULL}, + + // Partially-complete IPv6 literals, and related cases. +{"http://2001::1", "http", NULL, NULL, "2001:", 1, NULL, NULL, NULL}, +{"http://[2001::1", "http", NULL, NULL, "[2001::1", -1, NULL, NULL, NULL}, +{"http://2001::1]", "http", NULL, NULL, "2001::1]", -1, NULL, NULL, NULL}, +{"http://2001::1]:80", "http", NULL, NULL, "2001::1]", 80, NULL, NULL, NULL}, +{"http://[2001::1]", "http", NULL, NULL, "[2001::1]", -1, NULL, NULL, NULL}, +{"http://[2001::1]:80", "http", NULL, NULL, "[2001::1]", 80, NULL, NULL, NULL}, +{"http://[[::]]", "http", NULL, NULL, "[[::]]", -1, NULL, NULL, NULL}, + +}; + +TEST(URLParser, Standard) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. + Parsed parsed; + for (size_t i = 0; i < gurl_base::size(cases); i++) { + const char* url = cases[i].input; + ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed); + int port = ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, cases[i].username, parsed.username)); + EXPECT_TRUE(ComponentMatches(url, cases[i].password, parsed.password)); + EXPECT_TRUE(ComponentMatches(url, cases[i].host, parsed.host)); + EXPECT_EQ(cases[i].port, port); + EXPECT_TRUE(ComponentMatches(url, cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, cases[i].query, parsed.query)); + EXPECT_TRUE(ComponentMatches(url, cases[i].ref, parsed.ref)); + } +} + +// PathURL -------------------------------------------------------------------- + +// Various incarnations of path URLs. 
+static PathURLParseCase path_cases[] = { +{"", NULL, NULL}, +{":", "", NULL}, +{":/", "", "/"}, +{"/", NULL, "/"}, +{" This is \\interesting// \t", NULL, "This is \\interesting// \t"}, +{"about:", "about", NULL}, +{"about:blank", "about", "blank"}, +{" about: blank ", "about", " blank "}, +{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\"); "}, +}; + +TEST(URLParser, PathURL) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. + Parsed parsed; + for (size_t i = 0; i < gurl_base::size(path_cases); i++) { + const char* url = path_cases[i].input; + ParsePathURL(url, static_cast<int>(strlen(url)), false, &parsed); + + EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme)) + << i; + EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.GetContent())) + << i; + + // The remaining components are never used for path URLs. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.host); + ExpectInvalidComponent(parsed.port); + } +} + +// Various incarnations of file URLs. +static URLParseCase file_cases[] = { +#ifdef WIN32 +{"file:server", "file", NULL, NULL, "server", -1, NULL, NULL, NULL}, +{" file: server \t", "file", NULL, NULL, " server",-1, NULL, NULL, NULL}, +{"FiLe:c|", "FiLe", NULL, NULL, NULL, -1, "c|", NULL, NULL}, +{"FILE:/\\\\/server/file", "FILE", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file://server/", "file", NULL, NULL, "server", -1, "/", NULL, NULL}, +{"file://localhost/c:/", "file", NULL, NULL, NULL, -1, "/c:/", NULL, NULL}, +{"file://127.0.0.1/c|\\", "file", NULL, NULL, NULL, -1, "/c|\\", NULL, NULL}, +{"file:/", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"file:", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + // If there is a Windows drive letter, treat any number of slashes as the + // path part. 
+{"file:c:\\fo\\b", "file", NULL, NULL, NULL, -1, "c:\\fo\\b", NULL, NULL}, +{"file:/c:\\foo/bar", "file", NULL, NULL, NULL, -1, "/c:\\foo/bar",NULL, NULL}, +{"file://c:/f\\b", "file", NULL, NULL, NULL, -1, "/c:/f\\b", NULL, NULL}, +{"file:///C:/foo", "file", NULL, NULL, NULL, -1, "/C:/foo", NULL, NULL}, +{"file://///\\/\\/c:\\f\\b", "file", NULL, NULL, NULL, -1, "/c:\\f\\b", NULL, NULL}, + // If there is not a drive letter, we should treat is as UNC EXCEPT for + // three slashes, which we treat as a Unix style path. +{"file:server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file:/server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file://server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file:///server/file", "file", NULL, NULL, NULL, -1, "/server/file",NULL, NULL}, +{"file://\\server/file", "file", NULL, NULL, NULL, -1, "\\server/file",NULL, NULL}, +{"file:////server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, + // Queries and refs are valid for file URLs as well. +{"file:///C:/foo.html?#", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "", ""}, +{"file:///C:/foo.html?query=yes#ref", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "query=yes", "ref"}, +#else // WIN32 + // No slashes. + {"file:", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + {"file:path", "file", NULL, NULL, NULL, -1, "path", NULL, NULL}, + {"file:path/", "file", NULL, NULL, NULL, -1, "path/", NULL, NULL}, + {"file:path/f.txt", "file", NULL, NULL, NULL, -1, "path/f.txt", NULL, NULL}, + // One slash. + {"file:/", "file", NULL, NULL, NULL, -1, "/", NULL, NULL}, + {"file:/path", "file", NULL, NULL, NULL, -1, "/path", NULL, NULL}, + {"file:/path/", "file", NULL, NULL, NULL, -1, "/path/", NULL, NULL}, + {"file:/path/f.txt", "file", NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + // Two slashes. 
+ {"file://", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + {"file://server", "file", NULL, NULL, "server", -1, NULL, NULL, NULL}, + {"file://server/", "file", NULL, NULL, "server", -1, "/", NULL, NULL}, + {"file://server/f.txt", "file", NULL, NULL, "server", -1, "/f.txt", NULL, NULL}, + // Three slashes. + {"file:///", "file", NULL, NULL, NULL, -1, "/", NULL, NULL}, + {"file:///path", "file", NULL, NULL, NULL, -1, "/path", NULL, NULL}, + {"file:///path/", "file", NULL, NULL, NULL, -1, "/path/", NULL, NULL}, + {"file:///path/f.txt", "file", NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + // More than three slashes. + {"file:////", "file", NULL, NULL, NULL, -1, "/", NULL, NULL}, + {"file:////path", "file", NULL, NULL, NULL, -1, "/path", NULL, NULL}, + {"file:////path/", "file", NULL, NULL, NULL, -1, "/path/", NULL, NULL}, + {"file:////path/f.txt", "file", NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + // Schemeless URLs + {"path/f.txt", NULL, NULL, NULL, NULL, -1, "path/f.txt", NULL, NULL}, + {"path:80/f.txt", "path", NULL, NULL, NULL, -1, "80/f.txt", NULL, NULL}, + {"path/f.txt:80", "path/f.txt",NULL, NULL, NULL, -1, "80", NULL, NULL}, // Wrong. 
+ {"/path/f.txt", NULL, NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + {"/path:80/f.txt", NULL, NULL, NULL, NULL, -1, "/path:80/f.txt",NULL, NULL}, + {"/path/f.txt:80", NULL, NULL, NULL, NULL, -1, "/path/f.txt:80",NULL, NULL}, + {"//server/f.txt", NULL, NULL, NULL, "server", -1, "/f.txt", NULL, NULL}, + {"//server:80/f.txt", NULL, NULL, NULL, "server:80",-1, "/f.txt", NULL, NULL}, + {"//server/f.txt:80", NULL, NULL, NULL, "server", -1, "/f.txt:80", NULL, NULL}, + {"///path/f.txt", NULL, NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + {"///path:80/f.txt", NULL, NULL, NULL, NULL, -1, "/path:80/f.txt",NULL, NULL}, + {"///path/f.txt:80", NULL, NULL, NULL, NULL, -1, "/path/f.txt:80",NULL, NULL}, + {"////path/f.txt", NULL, NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + {"////path:80/f.txt", NULL, NULL, NULL, NULL, -1, "/path:80/f.txt",NULL, NULL}, + {"////path/f.txt:80", NULL, NULL, NULL, NULL, -1, "/path/f.txt:80",NULL, NULL}, + // Queries and refs are valid for file URLs as well. + {"file:///foo.html?#", "file", NULL, NULL, NULL, -1, "/foo.html", "", ""}, + {"file:///foo.html?q=y#ref", "file", NULL, NULL, NULL, -1, "/foo.html", "q=y", "ref"}, +#endif // WIN32 +}; + +TEST(URLParser, ParseFileURL) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the construtor. 
+ Parsed parsed; + for (size_t i = 0; i < gurl_base::size(file_cases); i++) { + const char* url = file_cases[i].input; + ParseFileURL(url, static_cast<int>(strlen(url)), &parsed); + int port = ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].scheme, parsed.scheme)) + << " for case #" << i << " [" << url << "] " + << parsed.scheme.begin << ", " << parsed.scheme.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].username, parsed.username)) + << " for case #" << i << " [" << url << "] " + << parsed.username.begin << ", " << parsed.username.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].password, parsed.password)) + << " for case #" << i << " [" << url << "] " + << parsed.password.begin << ", " << parsed.password.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].host, parsed.host)) + << " for case #" << i << " [" << url << "] " + << parsed.host.begin << ", " << parsed.host.len; + + EXPECT_EQ(file_cases[i].port, port) + << " for case #" << i << " [ " << url << "] " << port; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].path, parsed.path)) + << " for case #" << i << " [" << url << "] " + << parsed.path.begin << ", " << parsed.path.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].query, parsed.query)) + << " for case #" << i << " [" << url << "] " + << parsed.query.begin << ", " << parsed.query.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].ref, parsed.ref)) + << " for case #" << i << " [ "<< url << "] " + << parsed.query.begin << ", " << parsed.scheme.len; + } +} + + +TEST(URLParser, ExtractFileName) { + struct FileCase { + const char* input; + const char* expected; + } file_cases[] = { + {"http://www.google.com", NULL}, + {"http://www.google.com/", ""}, + {"http://www.google.com/search", "search"}, + {"http://www.google.com/search/", ""}, + {"http://www.google.com/foo/bar.html?baz=22", "bar.html"}, + {"http://www.google.com/foo/bar.html#ref", "bar.html"}, + 
{"http://www.google.com/search/;param", ""}, + {"http://www.google.com/foo/bar.html;param#ref", "bar.html"}, + {"http://www.google.com/foo/bar.html;foo;param#ref", "bar.html"}, + {"http://www.google.com/foo/bar.html?query#ref", "bar.html"}, + {"http://www.google.com/foo;/bar.html", "bar.html"}, + {"http://www.google.com/foo;/", ""}, + {"http://www.google.com/foo;", "foo"}, + {"http://www.google.com/;", ""}, + {"http://www.google.com/foo;bar;html", "foo"}, + }; + + for (size_t i = 0; i < gurl_base::size(file_cases); i++) { + const char* url = file_cases[i].input; + int len = static_cast<int>(strlen(url)); + + Parsed parsed; + ParseStandardURL(url, len, &parsed); + + Component file_name; + ExtractFileName(url, parsed.path, &file_name); + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].expected, file_name)); + } +} + +// Returns true if the parameter with index |parameter| in the given URL's +// query string. The expected key can be NULL to indicate no such key index +// should exist. The parameter number is 1-based. +static bool NthParameterIs(const char* url, + int parameter, + const char* expected_key, + const char* expected_value) { + Parsed parsed; + ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed); + + Component query = parsed.query; + + for (int i = 1; i <= parameter; i++) { + Component key, value; + if (!ExtractQueryKeyValue(url, &query, &key, &value)) { + if (parameter >= i && !expected_key) + return true; // Expected nonexistent key, got one. + return false; // Not enough keys. + } + + if (i == parameter) { + if (!expected_key) + return false; + + if (strncmp(&url[key.begin], expected_key, key.len) != 0) + return false; + if (strncmp(&url[value.begin], expected_value, value.len) != 0) + return false; + return true; + } + } + return expected_key == NULL; // We didn't find that many parameters. +} + +TEST(URLParser, ExtractQueryKeyValue) { + EXPECT_TRUE(NthParameterIs("http://www.google.com", 1, NULL, NULL)); + + // Basic case. 
+ char a[] = "http://www.google.com?arg1=1&arg2=2&bar"; + EXPECT_TRUE(NthParameterIs(a, 1, "arg1", "1")); + EXPECT_TRUE(NthParameterIs(a, 2, "arg2", "2")); + EXPECT_TRUE(NthParameterIs(a, 3, "bar", "")); + EXPECT_TRUE(NthParameterIs(a, 4, NULL, NULL)); + + // Empty param at the end. + char b[] = "http://www.google.com?foo=bar&"; + EXPECT_TRUE(NthParameterIs(b, 1, "foo", "bar")); + EXPECT_TRUE(NthParameterIs(b, 2, NULL, NULL)); + + // Empty param at the beginning. + char c[] = "http://www.google.com?&foo=bar"; + EXPECT_TRUE(NthParameterIs(c, 1, "", "")); + EXPECT_TRUE(NthParameterIs(c, 2, "foo", "bar")); + EXPECT_TRUE(NthParameterIs(c, 3, NULL, NULL)); + + // Empty key with value. + char d[] = "http://www.google.com?=foo"; + EXPECT_TRUE(NthParameterIs(d, 1, "", "foo")); + EXPECT_TRUE(NthParameterIs(d, 2, NULL, NULL)); + + // Empty value with key. + char e[] = "http://www.google.com?foo="; + EXPECT_TRUE(NthParameterIs(e, 1, "foo", "")); + EXPECT_TRUE(NthParameterIs(e, 2, NULL, NULL)); + + // Empty key and values. 
+ char f[] = "http://www.google.com?&&==&="; + EXPECT_TRUE(NthParameterIs(f, 1, "", "")); + EXPECT_TRUE(NthParameterIs(f, 2, "", "")); + EXPECT_TRUE(NthParameterIs(f, 3, "", "=")); + EXPECT_TRUE(NthParameterIs(f, 4, "", "")); + EXPECT_TRUE(NthParameterIs(f, 5, NULL, NULL)); +} + +// MailtoURL -------------------------------------------------------------------- + +static MailtoURLParseCase mailto_cases[] = { +//|input |scheme |path |query +{"mailto:foo@gmail.com", "mailto", "foo@gmail.com", NULL}, +{" mailto: to \t", "mailto", " to", NULL}, +{"mailto:addr1%2C%20addr2 ", "mailto", "addr1%2C%20addr2", NULL}, +{"Mailto:addr1, addr2 ", "Mailto", "addr1, addr2", NULL}, +{"mailto:addr1:addr2 ", "mailto", "addr1:addr2", NULL}, +{"mailto:?to=addr1,addr2", "mailto", NULL, "to=addr1,addr2"}, +{"mailto:?to=addr1%2C%20addr2", "mailto", NULL, "to=addr1%2C%20addr2"}, +{"mailto:addr1?to=addr2", "mailto", "addr1", "to=addr2"}, +{"mailto:?body=#foobar#", "mailto", NULL, "body=#foobar#",}, +{"mailto:#?body=#foobar#", "mailto", "#", "body=#foobar#"}, +}; + +TEST(URLParser, MailtoUrl) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. + Parsed parsed; + for (size_t i = 0; i < gurl_base::size(mailto_cases); ++i) { + const char* url = mailto_cases[i].input; + ParseMailtoURL(url, static_cast<int>(strlen(url)), &parsed); + int port = ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].query, parsed.query)); + EXPECT_EQ(PORT_UNSPECIFIED, port); + + // The remaining components are never used for mailto URLs. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.port); + ExpectInvalidComponent(parsed.ref); + } +} + +// Various incarnations of filesystem URLs. 
+static FileSystemURLParseCase filesystem_cases[] = { + // Regular URL with all the parts +{"filesystem:http://user:pass@foo:21/temporary/bar;par?b#c", "http", "user", "pass", "foo", 21, "/temporary", "/bar;par", "b", "c"}, +{"filesystem:https://foo/persistent/bar;par/", "https", NULL, NULL, "foo", -1, "/persistent", "/bar;par/", NULL, NULL}, +{"filesystem:file:///persistent/bar;par/", "file", NULL, NULL, NULL, -1, "/persistent", "/bar;par/", NULL, NULL}, +{"filesystem:file:///persistent/bar;par/?query#ref", "file", NULL, NULL, NULL, -1, "/persistent", "/bar;par/", "query", "ref"}, +{"filesystem:file:///persistent", "file", NULL, NULL, NULL, -1, "/persistent", "", NULL, NULL}, +}; + +TEST(URLParser, FileSystemURL) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. + Parsed parsed; + for (size_t i = 0; i < gurl_base::size(filesystem_cases); i++) { + const FileSystemURLParseCase* parsecase = &filesystem_cases[i]; + const char* url = parsecase->input; + ParseFileSystemURL(url, static_cast<int>(strlen(url)), &parsed); + + EXPECT_TRUE(ComponentMatches(url, "filesystem", parsed.scheme)); + EXPECT_EQ(!parsecase->inner_scheme, !parsed.inner_parsed()); + // Only check the inner_parsed if there is one. + if (parsed.inner_parsed()) { + EXPECT_TRUE(ComponentMatches(url, parsecase->inner_scheme, + parsed.inner_parsed()->scheme)); + EXPECT_TRUE(ComponentMatches(url, parsecase->inner_username, + parsed.inner_parsed()->username)); + EXPECT_TRUE(ComponentMatches(url, parsecase->inner_password, + parsed.inner_parsed()->password)); + EXPECT_TRUE(ComponentMatches(url, parsecase->inner_host, + parsed.inner_parsed()->host)); + int port = ParsePort(url, parsed.inner_parsed()->port); + EXPECT_EQ(parsecase->inner_port, port); + + // The remaining components are never used for filesystem URLs. 
+ ExpectInvalidComponent(parsed.inner_parsed()->query); + ExpectInvalidComponent(parsed.inner_parsed()->ref); + } + + EXPECT_TRUE(ComponentMatches(url, parsecase->path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, parsecase->query, parsed.query)); + EXPECT_TRUE(ComponentMatches(url, parsecase->ref, parsed.ref)); + + // The remaining components are never used for filesystem URLs. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.host); + ExpectInvalidComponent(parsed.port); + } +} + +} // namespace +} // namespace url
diff --git a/url/url_test_utils.h b/url/url_test_utils.h new file mode 100644 index 0000000..f8d40e1 --- /dev/null +++ b/url/url_test_utils.h
@@ -0,0 +1,40 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_TEST_UTILS_H_ +#define URL_URL_TEST_UTILS_H_ + +// Convenience functions for string conversions. +// These are mostly intended for use in unit tests. + +#include <string> + +#include "base/strings/string16.h" +#include "base/strings/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/url_canon_internal.h" + +namespace url { + +namespace test_utils { + +// Converts a UTF-16 string from native wchar_t format to char16 by +// truncating the high 32 bits. This is different than the conversion function +// in base bacause it passes invalid UTF-16 characters which is important for +// test purposes. As a result, this is not meant to handle true UTF-32 encoded +// strings. +inline gurl_base::string16 TruncateWStringToUTF16(const wchar_t* src) { + gurl_base::string16 str; + int length = static_cast<int>(wcslen(src)); + for (int i = 0; i < length; ++i) { + str.push_back(static_cast<gurl_base::char16>(src[i])); + } + return str; +} + +} // namespace test_utils + +} // namespace url + +#endif // URL_URL_TEST_UTILS_H_
diff --git a/url/url_util.cc b/url/url_util.cc new file mode 100644 index 0000000..47fc499 --- /dev/null +++ b/url/url_util.cc
@@ -0,0 +1,809 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_util.h" + +#include <stddef.h> +#include <string.h> + +#include "base/debug/leak_annotations.h" +#include "polyfills/base/logging.h" +#include "base/no_destructor.h" +#include "base/stl_util.h" +#include "base/strings/string_util.h" +#include "url/url_canon_internal.h" +#include "url/url_constants.h" +#include "url/url_file.h" +#include "url/url_util_internal.h" + +namespace url { + +namespace { + +// List of currently registered schemes and associated properties. +struct SchemeRegistry { + // Standard format schemes (see header for details). + std::vector<SchemeWithType> standard_schemes = { + {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + // Yes, file URLs can have a hostname, so file URLs should be handled as + // "standard". File URLs never have a port as specified by the SchemeType + // field. Unlike other SCHEME_WITH_HOST schemes, the 'host' in a file + // URL may be empty, a behavior which is special-cased during + // canonicalization. + {kFileScheme, SCHEME_WITH_HOST}, + {kFtpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + {kGopherScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + {kWssScheme, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, // WebSocket secure. + {kWsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, // WebSocket. + {kFileSystemScheme, SCHEME_WITHOUT_AUTHORITY}, + }; + + // Schemes that are allowed for referrers. + std::vector<SchemeWithType> referrer_schemes = { + {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + }; + + // Schemes that do not trigger mixed content warning. 
+ std::vector<std::string> secure_schemes = { + kHttpsScheme, + kAboutScheme, + kDataScheme, + kWssScheme, + }; + + // Schemes that normal pages cannot link to or access (i.e., with the same + // security rules as those applied to "file" URLs). + std::vector<std::string> local_schemes = { + kFileScheme, + }; + + // Schemes that cause pages loaded with them to not have access to pages + // loaded with any other URL scheme. + std::vector<std::string> no_access_schemes = { + kAboutScheme, + kJavaScriptScheme, + kDataScheme, + }; + + // Schemes that can be sent CORS requests. + std::vector<std::string> cors_enabled_schemes = { + kHttpsScheme, + kHttpScheme, + kDataScheme, + }; + + // Schemes that can be used by web to store data (local storage, etc). + std::vector<std::string> web_storage_schemes = { + kHttpsScheme, kHttpScheme, kFileScheme, kFtpScheme, kWssScheme, kWsScheme, + }; + + // Schemes that can bypass the Content-Security-Policy (CSP) checks. + std::vector<std::string> csp_bypassing_schemes = {}; + + // Schemes that are strictly empty documents, allowing them to commit + // synchronously. + std::vector<std::string> empty_document_schemes = { + kAboutScheme, + }; + + bool allow_non_standard_schemes = false; +}; + +SchemeRegistry* GetSchemeRegistry() { + static gurl_base::NoDestructor<SchemeRegistry> registry; + return registry.get(); +} + +// Pass this enum through for methods which would like to know if whitespace +// removal is necessary. +enum WhitespaceRemovalPolicy { + REMOVE_WHITESPACE, + DO_NOT_REMOVE_WHITESPACE, +}; + +// See the LockSchemeRegistries declaration in the header. +bool scheme_registries_locked = false; + +// This template converts a given character type to the corresponding +// StringPiece type. 
+template<typename CHAR> struct CharToStringPiece { +}; +template<> struct CharToStringPiece<char> { + typedef gurl_base::StringPiece Piece; +}; +template<> struct CharToStringPiece<gurl_base::char16> { + typedef gurl_base::StringPiece16 Piece; +}; + +// Given a string and a range inside the string, compares it to the given +// lower-case |compare_to| buffer. +template<typename CHAR> +inline bool DoCompareSchemeComponent(const CHAR* spec, + const Component& component, + const char* compare_to) { + if (!component.is_nonempty()) + return compare_to[0] == 0; // When component is empty, match empty scheme. + return gurl_base::LowerCaseEqualsASCII( + typename CharToStringPiece<CHAR>::Piece( + &spec[component.begin], component.len), + compare_to); +} + +// Returns true and sets |type| to the SchemeType of the given scheme +// identified by |scheme| within |spec| if in |schemes|. +template<typename CHAR> +bool DoIsInSchemes(const CHAR* spec, + const Component& scheme, + SchemeType* type, + const std::vector<SchemeWithType>& schemes) { + if (!scheme.is_nonempty()) + return false; // Empty or invalid schemes are non-standard. + + for (const SchemeWithType& scheme_with_type : schemes) { + if (gurl_base::LowerCaseEqualsASCII(typename CharToStringPiece<CHAR>::Piece( + &spec[scheme.begin], scheme.len), + scheme_with_type.scheme)) { + *type = scheme_with_type.type; + return true; + } + } + return false; +} + +template<typename CHAR> +bool DoIsStandard(const CHAR* spec, const Component& scheme, SchemeType* type) { + return DoIsInSchemes(spec, scheme, type, + GetSchemeRegistry()->standard_schemes); +} + + +template<typename CHAR> +bool DoFindAndCompareScheme(const CHAR* str, + int str_len, + const char* compare, + Component* found_scheme) { + // Before extracting scheme, canonicalize the URL to remove any whitespace. + // This matches the canonicalization done in DoCanonicalize function. 
+ RawCanonOutputT<CHAR> whitespace_buffer; + int spec_len; + const CHAR* spec = + RemoveURLWhitespace(str, str_len, &whitespace_buffer, &spec_len, nullptr); + + Component our_scheme; + if (!ExtractScheme(spec, spec_len, &our_scheme)) { + // No scheme. + if (found_scheme) + *found_scheme = Component(); + return false; + } + if (found_scheme) + *found_scheme = our_scheme; + return DoCompareSchemeComponent(spec, our_scheme, compare); +} + +template <typename CHAR> +bool DoCanonicalize(const CHAR* spec, + int spec_len, + bool trim_path_end, + WhitespaceRemovalPolicy whitespace_policy, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + output->ReserveSizeIfNeeded(spec_len); + + // Remove any whitespace from the middle of the relative URL if necessary. + // Possibly this will result in copying to the new buffer. + RawCanonOutputT<CHAR> whitespace_buffer; + if (whitespace_policy == REMOVE_WHITESPACE) { + spec = RemoveURLWhitespace(spec, spec_len, &whitespace_buffer, &spec_len, + &output_parsed->potentially_dangling_markup); + } + + Parsed parsed_input; +#ifdef WIN32 + // For Windows, we allow things that look like absolute Windows paths to be + // fixed up magically to file URLs. This is done for IE compatibility. For + // example, this will change "c:/foo" into a file URL rather than treating + // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt"). + // There is similar logic in url_canon_relative.cc for + // + // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which + // has no meaning as an absolute path name. This is because browsers on Mac + // & Unix don't generally do this, so there is no compatibility reason for + // doing so. 
+ if (DoesBeginUNCPath(spec, 0, spec_len, false) || + DoesBeginWindowsDriveSpec(spec, 0, spec_len)) { + ParseFileURL(spec, spec_len, &parsed_input); + return CanonicalizeFileURL(spec, spec_len, parsed_input, charset_converter, + output, output_parsed); + } +#endif + + Component scheme; + if (!ExtractScheme(spec, spec_len, &scheme)) + return false; + + // This is the parsed version of the input URL, we have to canonicalize it + // before storing it in our object. + bool success; + SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) { + // File URLs are special. + ParseFileURL(spec, spec_len, &parsed_input); + success = CanonicalizeFileURL(spec, spec_len, parsed_input, + charset_converter, output, output_parsed); + } else if (DoCompareSchemeComponent(spec, scheme, url::kFileSystemScheme)) { + // Filesystem URLs are special. + ParseFileSystemURL(spec, spec_len, &parsed_input); + success = CanonicalizeFileSystemURL(spec, spec_len, parsed_input, + charset_converter, output, + output_parsed); + + } else if (DoIsStandard(spec, scheme, &scheme_type)) { + // All "normal" URLs. + ParseStandardURL(spec, spec_len, &parsed_input); + success = CanonicalizeStandardURL(spec, spec_len, parsed_input, scheme_type, + charset_converter, output, output_parsed); + + } else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) { + // Mailto URLs are treated like standard URLs, with only a scheme, path, + // and query. + ParseMailtoURL(spec, spec_len, &parsed_input); + success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output, + output_parsed); + + } else { + // "Weird" URLs like data: and javascript:. 
+ ParsePathURL(spec, spec_len, trim_path_end, &parsed_input); + success = CanonicalizePathURL(spec, spec_len, parsed_input, output, + output_parsed); + } + return success; +} + +template<typename CHAR> +bool DoResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const CHAR* in_relative, + int in_relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + // Remove any whitespace from the middle of the relative URL, possibly + // copying to the new buffer. + RawCanonOutputT<CHAR> whitespace_buffer; + int relative_length; + const CHAR* relative = RemoveURLWhitespace( + in_relative, in_relative_length, &whitespace_buffer, &relative_length, + &output_parsed->potentially_dangling_markup); + + bool base_is_authority_based = false; + bool base_is_hierarchical = false; + if (base_spec && + base_parsed.scheme.is_nonempty()) { + int after_scheme = base_parsed.scheme.end() + 1; // Skip past the colon. + int num_slashes = CountConsecutiveSlashes(base_spec, after_scheme, + base_spec_len); + base_is_authority_based = num_slashes > 1; + base_is_hierarchical = num_slashes > 0; + } + + SchemeType unused_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + bool standard_base_scheme = + base_parsed.scheme.is_nonempty() && + DoIsStandard(base_spec, base_parsed.scheme, &unused_scheme_type); + + bool is_relative; + Component relative_component; + if (!IsRelativeURL(base_spec, base_parsed, relative, relative_length, + (base_is_hierarchical || standard_base_scheme), + &is_relative, &relative_component)) { + // Error resolving. + return false; + } + + // Don't reserve buffer space here. Instead, reserve in DoCanonicalize and + // ReserveRelativeURL, to enable more accurate buffer sizes. + + // Pretend for a moment that |base_spec| is a standard URL. Normally + // non-standard URLs are treated as PathURLs, but if the base has an + // authority we would like to preserve it. 
+ if (is_relative && base_is_authority_based && !standard_base_scheme) { + Parsed base_parsed_authority; + ParseStandardURL(base_spec, base_spec_len, &base_parsed_authority); + if (base_parsed_authority.host.is_nonempty()) { + RawCanonOutputT<char> temporary_output; + bool did_resolve_succeed = + ResolveRelativeURL(base_spec, base_parsed_authority, false, relative, + relative_component, charset_converter, + &temporary_output, output_parsed); + // The output_parsed is incorrect at this point (because it was built + // based on base_parsed_authority instead of base_parsed) and needs to be + // re-created. + DoCanonicalize(temporary_output.data(), temporary_output.length(), true, + REMOVE_WHITESPACE, charset_converter, output, + output_parsed); + return did_resolve_succeed; + } + } else if (is_relative) { + // Relative, resolve and canonicalize. + bool file_base_scheme = base_parsed.scheme.is_nonempty() && + DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme); + return ResolveRelativeURL(base_spec, base_parsed, file_base_scheme, relative, + relative_component, charset_converter, output, + output_parsed); + } + + // Not relative, canonicalize the input. + return DoCanonicalize(relative, relative_length, true, + DO_NOT_REMOVE_WHITESPACE, charset_converter, output, + output_parsed); +} + +template<typename CHAR> +bool DoReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements<CHAR>& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed) { + // If the scheme is overridden, just do a simple string substitution and + // re-parse the whole thing. There are lots of edge cases that we really don't + // want to deal with. Like what happens if I replace "http://e:8080/foo" + // with a file. Does it become "file:///E:/8080/foo" where the port number + // becomes part of the path? 
Parsing that string as a file URL says "yes" + // but almost no sane rule for dealing with the components individually would + // come up with that. + // + // Why allow these crazy cases at all? Programmatically, there is almost no + // case for replacing the scheme. The most common case for hitting this is + // in JS when building up a URL using the location object. In this case, the + // JS code expects the string substitution behavior: + // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3 + if (replacements.IsSchemeOverridden()) { + // Canonicalize the new scheme so it is 8-bit and can be concatenated with + // the existing spec. + RawCanonOutput<128> scheme_replaced; + Component scheme_replaced_parsed; + CanonicalizeScheme(replacements.sources().scheme, + replacements.components().scheme, + &scheme_replaced, &scheme_replaced_parsed); + + // We can assume that the input is canonicalized, which means it always has + // a colon after the scheme (or where the scheme would be). + int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1 + : 1; + if (spec_len - spec_after_colon > 0) { + scheme_replaced.Append(&spec[spec_after_colon], + spec_len - spec_after_colon); + } + + // We now need to completely re-parse the resulting string since its meaning + // may have changed with the different scheme. + RawCanonOutput<128> recanonicalized; + Parsed recanonicalized_parsed; + DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true, + REMOVE_WHITESPACE, charset_converter, &recanonicalized, + &recanonicalized_parsed); + + // Recurse using the version with the scheme already replaced. This will now + // use the replacement rules for the new scheme. + // + // Warning: this code assumes that ReplaceComponents will re-check all + // components for validity. This is because we can't fail if DoCanonicalize + // failed above since theoretically the thing making it fail could be + // getting replaced here. 
If ReplaceComponents didn't re-check everything, + // we wouldn't know if something *not* getting replaced is a problem. + // If the scheme-specific replacers are made more intelligent so they don't + // re-check everything, we should instead re-canonicalize the whole thing + // after this call to check validity (this assumes replacing the scheme is + // much much less common than other types of replacements, like clearing the + // ref). + Replacements<CHAR> replacements_no_scheme = replacements; + replacements_no_scheme.SetScheme(NULL, Component()); + return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(), + recanonicalized_parsed, replacements_no_scheme, + charset_converter, output, out_parsed); + } + + // TODO(csharrison): We could be smarter about size to reserve if this is done + // in callers below, and the code checks to see which components are being + // replaced, and with what length. If this ends up being a hot spot it should + // be changed. + output->ReserveSizeIfNeeded(spec_len); + + // If we get here, then we know the scheme doesn't need to be replaced, so can + // just key off the scheme in the spec to know how to do the replacements. + if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileScheme)) { + return ReplaceFileURL(spec, parsed, replacements, charset_converter, output, + out_parsed); + } + if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileSystemScheme)) { + return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter, + output, out_parsed); + } + SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + if (DoIsStandard(spec, parsed.scheme, &scheme_type)) { + return ReplaceStandardURL(spec, parsed, replacements, scheme_type, + charset_converter, output, out_parsed); + } + if (DoCompareSchemeComponent(spec, parsed.scheme, url::kMailToScheme)) { + return ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed); + } + + // Default is a path URL. 
+ return ReplacePathURL(spec, parsed, replacements, output, out_parsed); +} + +void DoAddScheme(const char* new_scheme, std::vector<std::string>* schemes) { + GURL_DCHECK(schemes); + // If this assert triggers, it means you've called Add*Scheme after + // LockSchemeRegistries has been called (see the header file for + // LockSchemeRegistries for more). + // + // This normally means you're trying to set up a new scheme too late in your + // application's init process. Locate where your app does this initialization + // and calls LockSchemeRegistries, and add your new scheme there. + GURL_DCHECK(!scheme_registries_locked) + << "Trying to add a scheme after the lists have been locked."; + + size_t scheme_len = strlen(new_scheme); + if (scheme_len == 0) + return; + + GURL_DCHECK_EQ(gurl_base::ToLowerASCII(new_scheme), new_scheme); + schemes->push_back(std::string(new_scheme)); +} + +// Adds |new_scheme| paired with |type| to |schemes|. Empty scheme names are +// ignored. +void DoAddSchemeWithType(const char* new_scheme, + SchemeType type, + std::vector<SchemeWithType>* schemes) { + GURL_DCHECK(schemes); + // If this assert triggers, it means you've called Add*Scheme after + // LockSchemeRegistries has been called (see the header file for + // LockSchemeRegistries for more). + // + // This normally means you're trying to set up a new scheme too late in your + // application's init process. Locate where your app does this initialization + // and calls LockSchemeRegistries, and add your new scheme there. + GURL_DCHECK(!scheme_registries_locked) + << "Trying to add a scheme after the lists have been locked."; + + size_t scheme_len = strlen(new_scheme); + if (scheme_len == 0) + return; + + GURL_DCHECK_EQ(gurl_base::ToLowerASCII(new_scheme), new_scheme); + // Duplicate the scheme into a new buffer and add it to |schemes| (whichever + // typed scheme list the caller passed in). This pointer will be leaked on + // shutdown. 
+ char* dup_scheme = new char[scheme_len + 1]; + ANNOTATE_LEAKING_OBJECT_PTR(dup_scheme); + memcpy(dup_scheme, new_scheme, scheme_len + 1); + + SchemeWithType scheme_with_type; + scheme_with_type.scheme = dup_scheme; + scheme_with_type.type = type; + schemes->push_back(scheme_with_type); +} + +} // namespace + +void ResetForTests() { + *GetSchemeRegistry() = SchemeRegistry(); +} + +void EnableNonStandardSchemesForAndroidWebView() { + GetSchemeRegistry()->allow_non_standard_schemes = true; +} + +bool AllowNonStandardSchemesForAndroidWebView() { + return GetSchemeRegistry()->allow_non_standard_schemes; +} + +void AddStandardScheme(const char* new_scheme, SchemeType type) { + DoAddSchemeWithType(new_scheme, type, &GetSchemeRegistry()->standard_schemes); +} + +void AddReferrerScheme(const char* new_scheme, SchemeType type) { + DoAddSchemeWithType(new_scheme, type, &GetSchemeRegistry()->referrer_schemes); +} + +void AddSecureScheme(const char* new_scheme) { + DoAddScheme(new_scheme, &GetSchemeRegistry()->secure_schemes); +} + +const std::vector<std::string>& GetSecureSchemes() { + return GetSchemeRegistry()->secure_schemes; +} + +void AddLocalScheme(const char* new_scheme) { + DoAddScheme(new_scheme, &GetSchemeRegistry()->local_schemes); +} + +const std::vector<std::string>& GetLocalSchemes() { + return GetSchemeRegistry()->local_schemes; +} + +void AddNoAccessScheme(const char* new_scheme) { + DoAddScheme(new_scheme, &GetSchemeRegistry()->no_access_schemes); +} + +const std::vector<std::string>& GetNoAccessSchemes() { + return GetSchemeRegistry()->no_access_schemes; +} + +void AddCorsEnabledScheme(const char* new_scheme) { + DoAddScheme(new_scheme, &GetSchemeRegistry()->cors_enabled_schemes); +} + +const std::vector<std::string>& GetCorsEnabledSchemes() { + return GetSchemeRegistry()->cors_enabled_schemes; +} + +void AddWebStorageScheme(const char* new_scheme) { + DoAddScheme(new_scheme, &GetSchemeRegistry()->web_storage_schemes); +} + +const std::vector<std::string>& 
GetWebStorageSchemes() { + return GetSchemeRegistry()->web_storage_schemes; +} + +void AddCSPBypassingScheme(const char* new_scheme) { + DoAddScheme(new_scheme, &GetSchemeRegistry()->csp_bypassing_schemes); +} + +const std::vector<std::string>& GetCSPBypassingSchemes() { + return GetSchemeRegistry()->csp_bypassing_schemes; +} + +void AddEmptyDocumentScheme(const char* new_scheme) { + DoAddScheme(new_scheme, &GetSchemeRegistry()->empty_document_schemes); +} + +const std::vector<std::string>& GetEmptyDocumentSchemes() { + return GetSchemeRegistry()->empty_document_schemes; +} + +void LockSchemeRegistries() { + scheme_registries_locked = true; +} + +bool IsStandard(const char* spec, const Component& scheme) { + SchemeType unused_scheme_type; + return DoIsStandard(spec, scheme, &unused_scheme_type); +} + +bool GetStandardSchemeType(const char* spec, + const Component& scheme, + SchemeType* type) { + return DoIsStandard(spec, scheme, type); +} + +bool GetStandardSchemeType(const gurl_base::char16* spec, + const Component& scheme, + SchemeType* type) { + return DoIsStandard(spec, scheme, type); +} + +bool IsStandard(const gurl_base::char16* spec, const Component& scheme) { + SchemeType unused_scheme_type; + return DoIsStandard(spec, scheme, &unused_scheme_type); +} + +bool IsReferrerScheme(const char* spec, const Component& scheme) { + SchemeType unused_scheme_type; + return DoIsInSchemes(spec, scheme, &unused_scheme_type, + GetSchemeRegistry()->referrer_schemes); +} + +bool FindAndCompareScheme(const char* str, + int str_len, + const char* compare, + Component* found_scheme) { + return DoFindAndCompareScheme(str, str_len, compare, found_scheme); +} + +bool FindAndCompareScheme(const gurl_base::char16* str, + int str_len, + const char* compare, + Component* found_scheme) { + return DoFindAndCompareScheme(str, str_len, compare, found_scheme); +} + +bool DomainIs(gurl_base::StringPiece canonical_host, + gurl_base::StringPiece canonical_domain) { + if 
(canonical_host.empty() || canonical_domain.empty()) + return false; + + // If the host name ends with a dot but the input domain doesn't, then we + // ignore the dot in the host name. + size_t host_len = canonical_host.length(); + if (canonical_host.back() == '.' && canonical_domain.back() != '.') + --host_len; + + if (host_len < canonical_domain.length()) + return false; + + // |host_first_pos| is the start of the compared part of the host name, not + // start of the whole host name. + const char* host_first_pos = + canonical_host.data() + host_len - canonical_domain.length(); + + if (gurl_base::StringPiece(host_first_pos, canonical_domain.length()) != + canonical_domain) { + return false; + } + + // Make sure there aren't extra characters in host before the compared part; + // if the host name is longer than the input domain name, then the character + // immediately before the compared part should be a dot. For example, + // www.google.com has domain "google.com", but www.iamnotgoogle.com does not. + if (canonical_domain[0] != '.' 
&& host_len > canonical_domain.length() && + *(host_first_pos - 1) != '.') { + return false; + } + + return true; +} + +bool HostIsIPAddress(gurl_base::StringPiece host) { + url::RawCanonOutputT<char, 128> ignored_output; + url::CanonHostInfo host_info; + url::CanonicalizeIPAddress(host.data(), Component(0, host.length()), + &ignored_output, &host_info); + return host_info.IsIPAddress(); +} + +bool Canonicalize(const char* spec, + int spec_len, + bool trim_path_end, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE, + charset_converter, output, output_parsed); +} + +bool Canonicalize(const gurl_base::char16* spec, + int spec_len, + bool trim_path_end, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE, + charset_converter, output, output_parsed); +} + +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const char* relative, + int relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + return DoResolveRelative(base_spec, base_spec_len, base_parsed, + relative, relative_length, + charset_converter, output, output_parsed); +} + +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const gurl_base::char16* relative, + int relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + return DoResolveRelative(base_spec, base_spec_len, base_parsed, + relative, relative_length, + charset_converter, output, output_parsed); +} + +bool ReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements<char>& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed) { + return DoReplaceComponents(spec, 
spec_len, parsed, replacements, + charset_converter, output, out_parsed); +} + +bool ReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements<gurl_base::char16>& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed) { + return DoReplaceComponents(spec, spec_len, parsed, replacements, + charset_converter, output, out_parsed); +} + +void DecodeURLEscapeSequences(const char* input, + int length, + DecodeURLMode mode, + CanonOutputW* output) { + RawCanonOutputT<char> unescaped_chars; + for (int i = 0; i < length; i++) { + if (input[i] == '%') { + unsigned char ch; + if (DecodeEscaped(input, &i, length, &ch)) { + unescaped_chars.push_back(ch); + } else { + // Invalid escape sequence, copy the percent literal. + unescaped_chars.push_back('%'); + } + } else { + // Regular non-escaped 8-bit character. + unescaped_chars.push_back(input[i]); + } + } + + int output_initial_length = output->length(); + // Convert that 8-bit to UTF-16. It's not clear IE does this at all to + // JavaScript URLs, but Firefox and Safari do. + for (int i = 0; i < unescaped_chars.length(); i++) { + unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i)); + if (uch < 0x80) { + // ASCII (single-byte UTF-8), just append directly. + output->push_back(uch); + } else { + // |next_character| will point to the last character of the decoded + // character. + int next_character = i; + unsigned code_point; + if (ReadUTFChar(unescaped_chars.data(), &next_character, + unescaped_chars.length(), &code_point)) { + // Valid UTF-8 character, convert to UTF-16. + AppendUTF16Value(code_point, output); + i = next_character; + } else if (mode == DecodeURLMode::kUTF8) { + GURL_DCHECK_EQ(code_point, 0xFFFDU); + AppendUTF16Value(code_point, output); + i = next_character; + } else { + // If there are any sequences that are not valid UTF-8, we + // revert |output| changes, and promote any bytes to UTF-16. 
We + // copy all characters from the beginning to the end of the + // identified sequence. + output->set_length(output_initial_length); + for (int j = 0; j < unescaped_chars.length(); ++j) + output->push_back(static_cast<unsigned char>(unescaped_chars.at(j))); + break; + } + } + } +} + +void EncodeURIComponent(const char* input, int length, CanonOutput* output) { + for (int i = 0; i < length; ++i) { + unsigned char c = static_cast<unsigned char>(input[i]); + if (IsComponentChar(c)) + output->push_back(c); + else + AppendEscapedChar(c, output); + } +} + +bool CompareSchemeComponent(const char* spec, + const Component& component, + const char* compare_to) { + return DoCompareSchemeComponent(spec, component, compare_to); +} + +bool CompareSchemeComponent(const gurl_base::char16* spec, + const Component& component, + const char* compare_to) { + return DoCompareSchemeComponent(spec, component, compare_to); +} + +} // namespace url
diff --git a/url/url_util.h b/url/url_util.h new file mode 100644 index 0000000..473ae5f --- /dev/null +++ b/url/url_util.h
@@ -0,0 +1,288 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_UTIL_H_ +#define URL_URL_UTIL_H_ + +#include <string> +#include <vector> + +#include "polyfills/base/component_export.h" +#include "base/strings/string16.h" +#include "base/strings/string_piece.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_constants.h" + +namespace url { + +// Init ------------------------------------------------------------------------ + +// Resets all custom schemes to the default values. Not thread-safe. +COMPONENT_EXPORT(URL) void ResetForTests(); + +// Schemes --------------------------------------------------------------------- + +// Changes the behavior of SchemeHostPort / Origin to allow non-standard schemes +// to be specified, instead of canonicalizing them to an invalid SchemeHostPort +// or opaque Origin, respectively. This is used for Android WebView backwards +// compatibility, which allows the use of custom schemes: content hosted in +// Android WebView assumes that one URL with a non-standard scheme will be +// same-origin to another URL with the same non-standard scheme. +// +// Not thread-safe. +COMPONENT_EXPORT(URL) void EnableNonStandardSchemesForAndroidWebView(); + +// Whether or not SchemeHostPort and Origin allow non-standard schemes. +COMPONENT_EXPORT(URL) bool AllowNonStandardSchemesForAndroidWebView(); + +// A pair for representing a standard scheme name and the SchemeType for it. +struct COMPONENT_EXPORT(URL) SchemeWithType { + const char* scheme; + SchemeType type; +}; + +// The following Add*Scheme methods are not thread-safe and cannot be called +// concurrently with any other url_util function. They will assert if the lists +// of schemes have been locked (see LockSchemeRegistries). 
+ +// Adds an application-defined scheme to the internal list of "standard-format" +// URL schemes. A standard-format scheme adheres to what RFC 3986 calls "generic +// URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). + +COMPONENT_EXPORT(URL) +void AddStandardScheme(const char* new_scheme, SchemeType scheme_type); + +// Adds an application-defined scheme to the internal list of schemes allowed +// for referrers. +COMPONENT_EXPORT(URL) +void AddReferrerScheme(const char* new_scheme, SchemeType scheme_type); + +// Adds an application-defined scheme to the list of schemes that do not trigger +// mixed content warnings. +COMPONENT_EXPORT(URL) void AddSecureScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector<std::string>& GetSecureSchemes(); + +// Adds an application-defined scheme to the list of schemes that normal pages +// cannot link to or access (i.e., with the same security rules as those applied +// to "file" URLs). +COMPONENT_EXPORT(URL) void AddLocalScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector<std::string>& GetLocalSchemes(); + +// Adds an application-defined scheme to the list of schemes that cause pages +// loaded with them to not have access to pages loaded with any other URL +// scheme. +COMPONENT_EXPORT(URL) void AddNoAccessScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector<std::string>& GetNoAccessSchemes(); + +// Adds an application-defined scheme to the list of schemes that can be sent +// CORS requests. +COMPONENT_EXPORT(URL) void AddCorsEnabledScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector<std::string>& GetCorsEnabledSchemes(); + +// Adds an application-defined scheme to the list of web schemes that can be +// used by web to store data (e.g. cookies, local storage, ...). This is +// to differentiate them from schemes that can store data but are not used on +// web (e.g. 
application's internal schemes) or schemes that are used on web but +// cannot store data. +COMPONENT_EXPORT(URL) void AddWebStorageScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector<std::string>& GetWebStorageSchemes(); + +// Adds an application-defined scheme to the list of schemes that can bypass the +// Content-Security-Policy (CSP) checks. +COMPONENT_EXPORT(URL) void AddCSPBypassingScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector<std::string>& GetCSPBypassingSchemes(); + +// Adds an application-defined scheme to the list of schemes that are strictly +// empty documents, allowing them to commit synchronously. +COMPONENT_EXPORT(URL) void AddEmptyDocumentScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector<std::string>& GetEmptyDocumentSchemes(); + +// Sets a flag to prevent future calls to Add*Scheme from succeeding. +// +// This is designed to help prevent errors for multithreaded applications. +// Normal usage would be to call Add*Scheme for your custom schemes at +// the beginning of program initialization, and then LockSchemeRegistries. This +// prevents future callers from mistakenly calling Add*Scheme when the +// program is running with multiple threads, where such usage would be +// dangerous. +// +// We could have had Add*Scheme use a lock instead, but that would add +// some platform-specific dependencies we don't otherwise have now, and is +// overkill considering the normal usage is so simple. +COMPONENT_EXPORT(URL) void LockSchemeRegistries(); + +// Locates the scheme in the given string and places it into |found_scheme|, +// which may be NULL to indicate the caller does not care about the range. +// +// Returns whether the given |compare| scheme matches the scheme found in the +// input (if any). The |compare| scheme must be a valid canonical scheme or +// the result of the comparison is undefined. 
+COMPONENT_EXPORT(URL) +bool FindAndCompareScheme(const char* str, + int str_len, + const char* compare, + Component* found_scheme); +COMPONENT_EXPORT(URL) +bool FindAndCompareScheme(const gurl_base::char16* str, + int str_len, + const char* compare, + Component* found_scheme); +inline bool FindAndCompareScheme(const std::string& str, + const char* compare, + Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast<int>(str.size()), + compare, found_scheme); +} +inline bool FindAndCompareScheme(const gurl_base::string16& str, + const char* compare, + Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast<int>(str.size()), + compare, found_scheme); +} + +// Returns true if the given scheme identified by |scheme| within |spec| is in +// the list of known standard-format schemes (see AddStandardScheme). +COMPONENT_EXPORT(URL) +bool IsStandard(const char* spec, const Component& scheme); +COMPONENT_EXPORT(URL) +bool IsStandard(const gurl_base::char16* spec, const Component& scheme); + +// Returns true if the given scheme identified by |scheme| within |spec| is in +// the list of allowed schemes for referrers (see AddReferrerScheme). +COMPONENT_EXPORT(URL) +bool IsReferrerScheme(const char* spec, const Component& scheme); + +// Returns true and sets |type| to the SchemeType of the given scheme +// identified by |scheme| within |spec| if the scheme is in the list of known +// standard-format schemes (see AddStandardScheme). +COMPONENT_EXPORT(URL) +bool GetStandardSchemeType(const char* spec, + const Component& scheme, + SchemeType* type); +COMPONENT_EXPORT(URL) +bool GetStandardSchemeType(const gurl_base::char16* spec, + const Component& scheme, + SchemeType* type); + +// Hosts ---------------------------------------------------------------------- + +// Returns true if the |canonical_host| matches or is in the same domain as the +// given |canonical_domain| string. 
For example, if the canonicalized hostname +// is "www.google.com", this will return true for "com", "google.com", and +// "www.google.com" domains. +// +// If either of the input StringPieces is empty, the return value is false. The +// input domain should match host canonicalization rules. i.e. it should be +// lowercase except for escape chars. +COMPONENT_EXPORT(URL) +bool DomainIs(gurl_base::StringPiece canonical_host, + gurl_base::StringPiece canonical_domain); + +// Returns true if the hostname is an IP address. Note: this function isn't very +// cheap, as it must re-parse the host to verify. +COMPONENT_EXPORT(URL) bool HostIsIPAddress(gurl_base::StringPiece host); + +// URL library wrappers -------------------------------------------------------- + +// Parses the given spec according to the extracted scheme type. Normal users +// should use the URL object, although this may be useful if performance is +// critical and you don't want to do the heap allocation for the std::string. +// +// As with the Canonicalize* functions, the charset converter can +// be NULL to use UTF-8 (it will be faster in this case). +// +// Returns true if a valid URL was produced, false if not. On failure, the +// output and parsed structures will still be filled and will be consistent, +// but they will not represent a loadable URL. +COMPONENT_EXPORT(URL) +bool Canonicalize(const char* spec, + int spec_len, + bool trim_path_end, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed); +COMPONENT_EXPORT(URL) +bool Canonicalize(const gurl_base::char16* spec, + int spec_len, + bool trim_path_end, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed); + +// Resolves a potentially relative URL relative to the given parsed base URL. +// The base MUST be valid. The resulting canonical URL and parsed information +// will be placed in to the given out variables. +// +// The relative need not be relative. 
If we discover that it's absolute, this +// will produce a canonical version of that URL. See Canonicalize() for more +// about the charset_converter. +// +// Returns true if the output is valid, false if the input could not produce +// a valid URL. +COMPONENT_EXPORT(URL) +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const char* relative, + int relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed); +COMPONENT_EXPORT(URL) +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const gurl_base::char16* relative, + int relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed); + +// Replaces components in the given VALID input URL. The new canonical URL info +// is written to output and out_parsed. +// +// Returns true if the resulting URL is valid. +COMPONENT_EXPORT(URL) +bool ReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements<char>& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements<gurl_base::char16>& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed); + +// String helper functions ----------------------------------------------------- + +enum class DecodeURLMode { + // UTF-8 decode only. Invalid byte sequences are replaced with U+FFFD. + kUTF8, + // Try UTF-8 decoding. If the input contains byte sequences invalid + // for UTF-8, apply byte to Unicode mapping. + kUTF8OrIsomorphic, +}; + +// Unescapes the given string using URL escaping rules. 
+COMPONENT_EXPORT(URL) +void DecodeURLEscapeSequences(const char* input, + int length, + DecodeURLMode mode, + CanonOutputW* output); + +// Escapes the given string as defined by the JS method encodeURIComponent. See +// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent +COMPONENT_EXPORT(URL) +void EncodeURIComponent(const char* input, int length, CanonOutput* output); + +} // namespace url + +#endif // URL_URL_UTIL_H_
diff --git a/url/url_util_internal.h b/url/url_util_internal.h new file mode 100644 index 0000000..08f8929 --- /dev/null +++ b/url/url_util_internal.h
@@ -0,0 +1,26 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_UTIL_INTERNAL_H_ +#define URL_URL_UTIL_INTERNAL_H_ + +#include <string> + +#include "base/strings/string16.h" +#include "url/third_party/mozilla/url_parse.h" + +namespace url { + +// Given a string and a range inside the string, compares it to the given +// lower-case |compare_to| buffer. +bool CompareSchemeComponent(const char* spec, + const Component& component, + const char* compare_to); +bool CompareSchemeComponent(const gurl_base::char16* spec, + const Component& component, + const char* compare_to); + +} // namespace url + +#endif // URL_URL_UTIL_INTERNAL_H_
diff --git a/url/url_util_unittest.cc b/url/url_util_unittest.cc new file mode 100644 index 0000000..741c1dc --- /dev/null +++ b/url/url_util_unittest.cc
@@ -0,0 +1,527 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <stddef.h> + +#include "base/stl_util.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" +#include "url/url_test_utils.h" +#include "url/url_util.h" + +namespace url { + +class URLUtilTest : public testing::Test { + public: + URLUtilTest() = default; + ~URLUtilTest() override { + // Reset any added schemes. + ResetForTests(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(URLUtilTest); +}; + +TEST_F(URLUtilTest, FindAndCompareScheme) { + Component found_scheme; + + // Simple case where the scheme is found and matches. + const char kStr1[] = "http://www.com/"; + EXPECT_TRUE(FindAndCompareScheme( + kStr1, static_cast<int>(strlen(kStr1)), "http", NULL)); + EXPECT_TRUE(FindAndCompareScheme( + kStr1, static_cast<int>(strlen(kStr1)), "http", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(0, 4)); + + // A case where the scheme is found and doesn't match. + EXPECT_FALSE(FindAndCompareScheme( + kStr1, static_cast<int>(strlen(kStr1)), "https", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(0, 4)); + + // A case where there is no scheme. + const char kStr2[] = "httpfoobar"; + EXPECT_FALSE(FindAndCompareScheme( + kStr2, static_cast<int>(strlen(kStr2)), "http", &found_scheme)); + EXPECT_TRUE(found_scheme == Component()); + + // When there is an empty scheme, it should match the empty scheme. + const char kStr3[] = ":foo.com/"; + EXPECT_TRUE(FindAndCompareScheme( + kStr3, static_cast<int>(strlen(kStr3)), "", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(0, 0)); + + // But when there is no scheme, it should fail. 
+ EXPECT_FALSE(FindAndCompareScheme("", 0, "", &found_scheme)); + EXPECT_TRUE(found_scheme == Component()); + + // When there is a whitespace char in scheme, it should canonicalize the URL + // before comparison. + const char whtspc_str[] = " \r\n\tjav\ra\nscri\tpt:alert(1)"; + EXPECT_TRUE(FindAndCompareScheme(whtspc_str, + static_cast<int>(strlen(whtspc_str)), + "javascript", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(1, 10)); + + // Control characters should be stripped out on the ends, and kept in the + // middle. + const char ctrl_str[] = "\02jav\02scr\03ipt:alert(1)"; + EXPECT_FALSE(FindAndCompareScheme(ctrl_str, + static_cast<int>(strlen(ctrl_str)), + "javascript", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(1, 11)); +} + +TEST_F(URLUtilTest, IsStandard) { + const char kHTTPScheme[] = "http"; + EXPECT_TRUE(IsStandard(kHTTPScheme, Component(0, strlen(kHTTPScheme)))); + + const char kFooScheme[] = "foo"; + EXPECT_FALSE(IsStandard(kFooScheme, Component(0, strlen(kFooScheme)))); +} + +TEST_F(URLUtilTest, IsReferrerScheme) { + const char kHTTPScheme[] = "http"; + EXPECT_TRUE(IsReferrerScheme(kHTTPScheme, Component(0, strlen(kHTTPScheme)))); + + const char kFooScheme[] = "foo"; + EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); +} + +TEST_F(URLUtilTest, AddReferrerScheme) { + const char kFooScheme[] = "foo"; + EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); + + AddReferrerScheme(kFooScheme, url::SCHEME_WITH_HOST); + EXPECT_TRUE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); +} + +TEST_F(URLUtilTest, ShutdownCleansUpSchemes) { + const char kFooScheme[] = "foo"; + EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); + + AddReferrerScheme(kFooScheme, url::SCHEME_WITH_HOST); + EXPECT_TRUE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); + + ResetForTests(); + EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, 
strlen(kFooScheme)))); +} + +TEST_F(URLUtilTest, GetStandardSchemeType) { + url::SchemeType scheme_type; + + const char kHTTPScheme[] = "http"; + scheme_type = url::SCHEME_WITHOUT_AUTHORITY; + EXPECT_TRUE(GetStandardSchemeType(kHTTPScheme, + Component(0, strlen(kHTTPScheme)), + &scheme_type)); + EXPECT_EQ(url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, scheme_type); + + const char kFilesystemScheme[] = "filesystem"; + scheme_type = url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + EXPECT_TRUE(GetStandardSchemeType(kFilesystemScheme, + Component(0, strlen(kFilesystemScheme)), + &scheme_type)); + EXPECT_EQ(url::SCHEME_WITHOUT_AUTHORITY, scheme_type); + + const char kFooScheme[] = "foo"; + scheme_type = url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + EXPECT_FALSE(GetStandardSchemeType(kFooScheme, + Component(0, strlen(kFooScheme)), + &scheme_type)); +} + +TEST_F(URLUtilTest, ReplaceComponents) { + Parsed parsed; + RawCanonOutputT<char> output; + Parsed new_parsed; + + // Check that the following calls do not cause crash + Replacements<char> replacements; + replacements.SetRef("test", Component(0, 4)); + ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents("", 0, parsed, replacements, NULL, &output, &new_parsed); + replacements.ClearRef(); + replacements.SetHost("test", Component(0, 4)); + ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents("", 0, parsed, replacements, NULL, &output, &new_parsed); + + replacements.ClearHost(); + ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents("", 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents("", 0, parsed, replacements, NULL, &output, &new_parsed); +} + +static std::string CheckReplaceScheme(const char* base_url, + const char* scheme) { + // Make sure the input is canonicalized. 
+ RawCanonOutput<32> original; + Parsed original_parsed; + Canonicalize(base_url, strlen(base_url), true, NULL, &original, + &original_parsed); + + Replacements<char> replacements; + replacements.SetScheme(scheme, Component(0, strlen(scheme))); + + std::string output_string; + StdStringCanonOutput output(&output_string); + Parsed output_parsed; + ReplaceComponents(original.data(), original.length(), original_parsed, + replacements, NULL, &output, &output_parsed); + + output.Complete(); + return output_string; +} + +TEST_F(URLUtilTest, ReplaceScheme) { + EXPECT_EQ("https://google.com/", + CheckReplaceScheme("http://google.com/", "https")); + EXPECT_EQ("file://google.com/", + CheckReplaceScheme("http://google.com/", "file")); + EXPECT_EQ("http://home/Build", + CheckReplaceScheme("file:///Home/Build", "http")); + EXPECT_EQ("javascript:foo", + CheckReplaceScheme("about:foo", "javascript")); + EXPECT_EQ("://google.com/", + CheckReplaceScheme("http://google.com/", "")); + EXPECT_EQ("http://google.com/", + CheckReplaceScheme("about:google.com", "http")); + EXPECT_EQ("http:", CheckReplaceScheme("", "http")); + +#ifdef WIN32 + // Magic Windows drive letter behavior when converting to a file URL. + EXPECT_EQ("file:///E:/foo/", + CheckReplaceScheme("http://localhost/e:foo/", "file")); +#endif + + // This will probably change to "about://google.com/" when we fix + // http://crbug.com/160 which should also be an acceptable result. 
+ EXPECT_EQ("about://google.com/", + CheckReplaceScheme("http://google.com/", "about")); + + EXPECT_EQ("http://example.com/%20hello%20#%20world", + CheckReplaceScheme("myscheme:example.com/ hello # world ", "http")); +} + +TEST_F(URLUtilTest, DecodeURLEscapeSequences) { + struct DecodeCase { + const char* input; + const char* output; + } decode_cases[] = { + {"hello, world", "hello, world"}, + {"%01%02%03%04%05%06%07%08%09%0a%0B%0C%0D%0e%0f/", + "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0B\x0C\x0D\x0e\x0f/"}, + {"%10%11%12%13%14%15%16%17%18%19%1a%1B%1C%1D%1e%1f/", + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1B\x1C\x1D\x1e\x1f/"}, + {"%20%21%22%23%24%25%26%27%28%29%2a%2B%2C%2D%2e%2f/", + " !\"#$%&'()*+,-.//"}, + {"%30%31%32%33%34%35%36%37%38%39%3a%3B%3C%3D%3e%3f/", + "0123456789:;<=>?/"}, + {"%40%41%42%43%44%45%46%47%48%49%4a%4B%4C%4D%4e%4f/", + "@ABCDEFGHIJKLMNO/"}, + {"%50%51%52%53%54%55%56%57%58%59%5a%5B%5C%5D%5e%5f/", + "PQRSTUVWXYZ[\\]^_/"}, + {"%60%61%62%63%64%65%66%67%68%69%6a%6B%6C%6D%6e%6f/", + "`abcdefghijklmno/"}, + {"%70%71%72%73%74%75%76%77%78%79%7a%7B%7C%7D%7e%7f/", + "pqrstuvwxyz{|}~\x7f/"}, + {"%e4%bd%a0%e5%a5%bd", "\xe4\xbd\xa0\xe5\xa5\xbd"}, + }; + + for (size_t i = 0; i < gurl_base::size(decode_cases); i++) { + const char* input = decode_cases[i].input; + RawCanonOutputT<gurl_base::char16> output; + DecodeURLEscapeSequences(input, strlen(input), + DecodeURLMode::kUTF8OrIsomorphic, &output); + EXPECT_EQ(decode_cases[i].output, + gurl_base::UTF16ToUTF8(gurl_base::string16(output.data(), + output.length()))); + + RawCanonOutputT<gurl_base::char16> output_utf8; + DecodeURLEscapeSequences(input, strlen(input), DecodeURLMode::kUTF8, + &output_utf8); + EXPECT_EQ(decode_cases[i].output, + gurl_base::UTF16ToUTF8( + gurl_base::string16(output_utf8.data(), output_utf8.length()))); + } + + // Our decode should decode %00 + const char zero_input[] = "%00"; + RawCanonOutputT<gurl_base::char16> zero_output; + DecodeURLEscapeSequences(zero_input, 
strlen(zero_input), DecodeURLMode::kUTF8, + &zero_output); + EXPECT_NE("%00", gurl_base::UTF16ToUTF8( + gurl_base::string16(zero_output.data(), zero_output.length()))); + + // Test the error behavior for invalid UTF-8. + struct Utf8DecodeCase { + const char* input; + std::vector<gurl_base::char16> expected_iso; + std::vector<gurl_base::char16> expected_utf8; + } utf8_decode_cases[] = { + // %e5%a5%bd is a valid UTF-8 sequence. U+597D + {"%e4%a0%e5%a5%bd", + {0x00e4, 0x00a0, 0x00e5, 0x00a5, 0x00bd, 0}, + {0xfffd, 0x597d, 0}}, + {"%e5%a5%bd%e4%a0", + {0x00e5, 0x00a5, 0x00bd, 0x00e4, 0x00a0, 0}, + {0x597d, 0xfffd, 0}}, + {"%e4%a0%e5%bd", + {0x00e4, 0x00a0, 0x00e5, 0x00bd, 0}, + {0xfffd, 0xfffd, 0}}, + }; + + for (const auto& test : utf8_decode_cases) { + const char* input = test.input; + RawCanonOutputT<gurl_base::char16> output_iso; + DecodeURLEscapeSequences(input, strlen(input), + DecodeURLMode::kUTF8OrIsomorphic, &output_iso); + EXPECT_EQ(gurl_base::string16(test.expected_iso.data()), + gurl_base::string16(output_iso.data(), output_iso.length())); + + RawCanonOutputT<gurl_base::char16> output_utf8; + DecodeURLEscapeSequences(input, strlen(input), DecodeURLMode::kUTF8, + &output_utf8); + EXPECT_EQ(gurl_base::string16(test.expected_utf8.data()), + gurl_base::string16(output_utf8.data(), output_utf8.length())); + } +} + +TEST_F(URLUtilTest, TestEncodeURIComponent) { + struct EncodeCase { + const char* input; + const char* output; + } encode_cases[] = { + {"hello, world", "hello%2C%20world"}, + {"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"}, + {"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F", + "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"}, + {" !\"#$%&'()*+,-./", + "%20!%22%23%24%25%26%27()*%2B%2C-.%2F"}, + {"0123456789:;<=>?", + "0123456789%3A%3B%3C%3D%3E%3F"}, + {"@ABCDEFGHIJKLMNO", + "%40ABCDEFGHIJKLMNO"}, + {"PQRSTUVWXYZ[\\]^_", + "PQRSTUVWXYZ%5B%5C%5D%5E_"}, + 
{"`abcdefghijklmno", + "%60abcdefghijklmno"}, + {"pqrstuvwxyz{|}~\x7f", + "pqrstuvwxyz%7B%7C%7D~%7F"}, + }; + + for (size_t i = 0; i < gurl_base::size(encode_cases); i++) { + const char* input = encode_cases[i].input; + RawCanonOutputT<char> buffer; + EncodeURIComponent(input, strlen(input), &buffer); + std::string output(buffer.data(), buffer.length()); + EXPECT_EQ(encode_cases[i].output, output); + } +} + +TEST_F(URLUtilTest, TestResolveRelativeWithNonStandardBase) { + // This tests non-standard (in the sense that IsStandard() == false) + // hierarchical schemes. + struct ResolveRelativeCase { + const char* base; + const char* rel; + bool is_valid; + const char* out; + } resolve_non_standard_cases[] = { + // Resolving a relative path against a non-hierarchical URL should fail. + {"scheme:opaque_data", "/path", false, ""}, + // Resolving a relative path against a non-standard authority-based base + // URL doesn't alter the authority section. + {"scheme://Authority/", "../path", true, "scheme://Authority/path"}, + // A non-standard hierarchical base is resolved with path URL + // canonicalization rules. + {"data:/Blah:Blah/", "file.html", true, "data:/Blah:Blah/file.html"}, + {"data:/Path/../part/part2", "file.html", true, + "data:/Path/../part/file.html"}, + {"data://text/html,payload", "//user:pass@host:33////payload22", true, + "data://user:pass@host:33////payload22"}, + // Path URL canonicalization rules also apply to non-standard authority- + // based URLs. + {"custom://Authority/", "file.html", true, + "custom://Authority/file.html"}, + {"custom://Authority/", "other://Auth/", true, "other://Auth/"}, + {"custom://Authority/", "../../file.html", true, + "custom://Authority/file.html"}, + {"custom://Authority/path/", "file.html", true, + "custom://Authority/path/file.html"}, + {"custom://Authority:NoCanon/path/", "file.html", true, + "custom://Authority:NoCanon/path/file.html"}, + // It's still possible to get an invalid path URL. 
+ {"custom://Invalid:!#Auth/", "file.html", false, ""}, + // A path with an authority section gets canonicalized under standard URL + // rules, even though the base was non-standard. + {"content://content.Provider/", "//other.Provider", true, + "content://other.provider/"}, + + // Resolving an absolute URL doesn't cause canonicalization of the + // result. + {"about:blank", "custom://Authority", true, "custom://Authority"}, + // Fragment URLs can be resolved against a non-standard base. + {"scheme://Authority/path", "#fragment", true, + "scheme://Authority/path#fragment"}, + {"scheme://Authority/", "#fragment", true, + "scheme://Authority/#fragment"}, + // Resolving should fail if the base URL is authority-based but is + // missing a path component (the '/' at the end). + {"scheme://Authority", "path", false, ""}, + // Test resolving a fragment (only) against any kind of base-URL. + {"about:blank", "#id42", true, "about:blank#id42"}, + {"about:blank", " #id42", true, "about:blank#id42"}, + {"about:blank#oldfrag", "#newfrag", true, "about:blank#newfrag"}, + // A surprising side effect of allowing fragments to resolve against + // any URL scheme is we might break javascript: URLs by doing so... + {"javascript:alert('foo#bar')", "#badfrag", true, + "javascript:alert('foo#badfrag"}, + // In this case, the backslashes will not be canonicalized because it's a + // non-standard URL, but they will be treated as a path separators, + // giving the base URL here a path of "\". + // + // The result here is somewhat arbitrary. One could argue it should be + // either "aaa://a\" or "aaa://a/" since the path is being replaced with + // the "current directory". But in the context of resolving on data URLs, + // adding the requested dot doesn't seem wrong either. 
+ {"aaa://a\\", "aaa:.", true, "aaa://a\\."}}; + + for (size_t i = 0; i < gurl_base::size(resolve_non_standard_cases); i++) { + const ResolveRelativeCase& test_data = resolve_non_standard_cases[i]; + Parsed base_parsed; + ParsePathURL(test_data.base, strlen(test_data.base), false, &base_parsed); + + std::string resolved; + StdStringCanonOutput output(&resolved); + Parsed resolved_parsed; + bool valid = ResolveRelative(test_data.base, strlen(test_data.base), + base_parsed, test_data.rel, + strlen(test_data.rel), NULL, &output, + &resolved_parsed); + output.Complete(); + + EXPECT_EQ(test_data.is_valid, valid) << i; + if (test_data.is_valid && valid) + EXPECT_EQ(test_data.out, resolved) << i; + } +} + +TEST_F(URLUtilTest, TestNoRefComponent) { + // The hash-mark must be ignored when mailto: scheme is parsed, + // even if the URL has a base and relative part. + const char* base = "mailto://to/"; + const char* rel = "any#body"; + + Parsed base_parsed; + ParsePathURL(base, strlen(base), false, &base_parsed); + + std::string resolved; + StdStringCanonOutput output(&resolved); + Parsed resolved_parsed; + + bool valid = ResolveRelative(base, strlen(base), + base_parsed, rel, + strlen(rel), NULL, &output, + &resolved_parsed); + EXPECT_TRUE(valid); + EXPECT_FALSE(resolved_parsed.ref.is_valid()); +} + +TEST_F(URLUtilTest, PotentiallyDanglingMarkup) { + struct ResolveRelativeCase { + const char* base; + const char* rel; + bool potentially_dangling_markup; + const char* out; + } cases[] = { + {"https://example.com/", "/path<", false, "https://example.com/path%3C"}, + {"https://example.com/", "\n/path<", true, "https://example.com/path%3C"}, + {"https://example.com/", "\r/path<", true, "https://example.com/path%3C"}, + {"https://example.com/", "\t/path<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/pa\nth<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/pa\rth<", true, "https://example.com/path%3C"}, + {"https://example.com/", 
"/pa\tth<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/path\n<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/path\r<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/path\r<", true, "https://example.com/path%3C"}, + {"https://example.com/", "\n/<path", true, "https://example.com/%3Cpath"}, + {"https://example.com/", "\r/<path", true, "https://example.com/%3Cpath"}, + {"https://example.com/", "\t/<path", true, "https://example.com/%3Cpath"}, + {"https://example.com/", "/<pa\nth", true, "https://example.com/%3Cpath"}, + {"https://example.com/", "/<pa\rth", true, "https://example.com/%3Cpath"}, + {"https://example.com/", "/<pa\tth", true, "https://example.com/%3Cpath"}, + {"https://example.com/", "/<path\n", true, "https://example.com/%3Cpath"}, + {"https://example.com/", "/<path\r", true, "https://example.com/%3Cpath"}, + {"https://example.com/", "/<path\r", true, "https://example.com/%3Cpath"}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(::testing::Message() << test.base << ", " << test.rel); + Parsed base_parsed; + ParseStandardURL(test.base, strlen(test.base), &base_parsed); + + std::string resolved; + StdStringCanonOutput output(&resolved); + Parsed resolved_parsed; + bool valid = + ResolveRelative(test.base, strlen(test.base), base_parsed, test.rel, + strlen(test.rel), NULL, &output, &resolved_parsed); + ASSERT_TRUE(valid); + output.Complete(); + + EXPECT_EQ(test.potentially_dangling_markup, + resolved_parsed.potentially_dangling_markup); + EXPECT_EQ(test.out, resolved); + } +} + +TEST_F(URLUtilTest, TestDomainIs) { + const struct { + const char* canonicalized_host; + const char* lower_ascii_domain; + bool expected_domain_is; + } kTestCases[] = { + {"google.com", "google.com", true}, + {"www.google.com", "google.com", true}, // Subdomain is ignored. + {"www.google.com.cn", "google.com", false}, // Different TLD. 
+ {"www.google.comm", "google.com", false}, + {"www.iamnotgoogle.com", "google.com", false}, // Different hostname. + {"www.google.com", "Google.com", false}, // The input is not lower-cased. + + // If the host ends with a dot, it matches domains with or without a dot. + {"www.google.com.", "google.com", true}, + {"www.google.com.", "google.com.", true}, + {"www.google.com.", ".com", true}, + {"www.google.com.", ".com.", true}, + + // But, if the host doesn't end with a dot and the input domain does, then + // it's considered to not match. + {"www.google.com", "google.com.", false}, + + // If the host ends with two dots, it doesn't match. + {"www.google.com..", "google.com", false}, + + // Empty parameters. + {"www.google.com", "", false}, + {"", "www.google.com", false}, + {"", "", false}, + }; + + for (const auto& test_case : kTestCases) { + SCOPED_TRACE(testing::Message() << "(host, domain): (" + << test_case.canonicalized_host << ", " + << test_case.lower_ascii_domain << ")"); + + EXPECT_EQ( + test_case.expected_domain_is, + DomainIs(test_case.canonicalized_host, test_case.lower_ascii_domain)); + } +} + +} // namespace url